1/* 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/row.h" 12 13#ifdef __cplusplus 14namespace libyuv { 15extern "C" { 16#endif 17 18// This module is for GCC Neon armv8 64 bit. 19#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 20 21// Read 8 Y, 4 U and 4 V from 422 22#define READYUV422 \ 23 MEMACCESS(0) \ 24 "ld1 {v0.8b}, [%0], #8 \n" \ 25 MEMACCESS(1) \ 26 "ld1 {v1.s}[0], [%1], #4 \n" \ 27 MEMACCESS(2) \ 28 "ld1 {v1.s}[1], [%2], #4 \n" 29 30// Read 8 Y, 8 U and 8 V from 444 31#define READYUV444 \ 32 MEMACCESS(0) \ 33 "ld1 {v0.8b}, [%0], #8 \n" \ 34 MEMACCESS(1) \ 35 "ld1 {v1.d}[0], [%1], #8 \n" \ 36 MEMACCESS(2) \ 37 "ld1 {v1.d}[1], [%2], #8 \n" \ 38 "uaddlp v1.8h, v1.16b \n" \ 39 "rshrn v1.8b, v1.8h, #1 \n" 40 41// Read 8 Y, and set 4 U and 4 V to 128 42#define READYUV400 \ 43 MEMACCESS(0) \ 44 "ld1 {v0.8b}, [%0], #8 \n" \ 45 "movi v1.8b , #128 \n" 46 47// Read 8 Y and 4 UV from NV12 48#define READNV12 \ 49 MEMACCESS(0) \ 50 "ld1 {v0.8b}, [%0], #8 \n" \ 51 MEMACCESS(1) \ 52 "ld1 {v2.8b}, [%1], #8 \n" \ 53 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 54 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 55 "ins v1.s[1], v3.s[0] \n" 56 57// Read 8 Y and 4 VU from NV21 58#define READNV21 \ 59 MEMACCESS(0) \ 60 "ld1 {v0.8b}, [%0], #8 \n" \ 61 MEMACCESS(1) \ 62 "ld1 {v2.8b}, [%1], #8 \n" \ 63 "uzp1 v3.8b, v2.8b, v2.8b \n" \ 64 "uzp2 v1.8b, v2.8b, v2.8b \n" \ 65 "ins v1.s[1], v3.s[0] \n" 66 67// Read 8 YUY2 68#define READYUY2 \ 69 MEMACCESS(0) \ 70 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ 71 "uzp2 v3.8b, v1.8b, v1.8b \n" \ 72 "uzp1 v1.8b, v1.8b, v1.8b \n" \ 73 "ins v1.s[1], v3.s[0] \n" 74 75// Read 8 UYVY 76#define 
READUYVY \ 77 MEMACCESS(0) \ 78 "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ 79 "orr v0.8b, v3.8b, v3.8b \n" \ 80 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 81 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 82 "ins v1.s[1], v3.s[0] \n" 83 84#define YUVTORGB_SETUP \ 85 "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ 86 "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ 87 "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ 88 "ld1r {v31.4s}, [%[kYToRgb]] \n" \ 89 "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ 90 "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" 91 92#define YUVTORGB(vR, vG, vB) \ 93 "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ 94 "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ 95 "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ 96 "ushll v0.4s, v0.4h, #0 \n" \ 97 "mul v3.4s, v3.4s, v31.4s \n" \ 98 "mul v0.4s, v0.4s, v31.4s \n" \ 99 "sqshrun v0.4h, v0.4s, #16 \n" \ 100 "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ 101 "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ 102 "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ 103 "uxtl v2.8h, v2.8b \n" \ 104 "uxtl v1.8h, v1.8b \n" /* Extract U */ \ 105 "mul v3.8h, v1.8h, v27.8h \n" \ 106 "mul v5.8h, v1.8h, v29.8h \n" \ 107 "mul v6.8h, v2.8h, v30.8h \n" \ 108 "mul v7.8h, v2.8h, v28.8h \n" \ 109 "sqadd v6.8h, v6.8h, v5.8h \n" \ 110 "sqadd " #vB \ 111 ".8h, v24.8h, v0.8h \n" /* B */ \ 112 "sqadd " #vG \ 113 ".8h, v25.8h, v0.8h \n" /* G */ \ 114 "sqadd " #vR \ 115 ".8h, v26.8h, v0.8h \n" /* R */ \ 116 "sqadd " #vB ".8h, " #vB \ 117 ".8h, v3.8h \n" /* B */ \ 118 "sqsub " #vG ".8h, " #vG \ 119 ".8h, v6.8h \n" /* G */ \ 120 "sqadd " #vR ".8h, " #vR \ 121 ".8h, v7.8h \n" /* R */ \ 122 "sqshrun " #vB ".8b, " #vB \ 123 ".8h, #6 \n" /* B */ \ 124 "sqshrun " #vG ".8b, " #vG \ 125 ".8h, #6 \n" /* G */ \ 126 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ 127 128void I444ToARGBRow_NEON(const uint8* src_y, 129 const uint8* src_u, 130 const uint8* src_v, 131 uint8* dst_argb, 132 const struct YuvConstants* yuvconstants, 133 int width) { 134 asm volatile ( 135 YUVTORGB_SETUP 136 "movi v23.8b, #255 \n" /* A */ 137 "1: \n" 
138 READYUV444 139 YUVTORGB(v22, v21, v20) 140 "subs %w4, %w4, #8 \n" 141 MEMACCESS(3) 142 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 143 "b.gt 1b \n" 144 : "+r"(src_y), // %0 145 "+r"(src_u), // %1 146 "+r"(src_v), // %2 147 "+r"(dst_argb), // %3 148 "+r"(width) // %4 149 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 150 [kUVToG]"r"(&yuvconstants->kUVToG), 151 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 152 [kYToRgb]"r"(&yuvconstants->kYToRgb) 153 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 154 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 155 ); 156} 157 158void I422ToARGBRow_NEON(const uint8* src_y, 159 const uint8* src_u, 160 const uint8* src_v, 161 uint8* dst_argb, 162 const struct YuvConstants* yuvconstants, 163 int width) { 164 asm volatile ( 165 YUVTORGB_SETUP 166 "movi v23.8b, #255 \n" /* A */ 167 "1: \n" 168 READYUV422 169 YUVTORGB(v22, v21, v20) 170 "subs %w4, %w4, #8 \n" 171 MEMACCESS(3) 172 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 173 "b.gt 1b \n" 174 : "+r"(src_y), // %0 175 "+r"(src_u), // %1 176 "+r"(src_v), // %2 177 "+r"(dst_argb), // %3 178 "+r"(width) // %4 179 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 180 [kUVToG]"r"(&yuvconstants->kUVToG), 181 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 182 [kYToRgb]"r"(&yuvconstants->kYToRgb) 183 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 184 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 185 ); 186} 187 188void I422AlphaToARGBRow_NEON(const uint8* src_y, 189 const uint8* src_u, 190 const uint8* src_v, 191 const uint8* src_a, 192 uint8* dst_argb, 193 const struct YuvConstants* yuvconstants, 194 int width) { 195 asm volatile ( 196 YUVTORGB_SETUP 197 "1: \n" 198 READYUV422 199 YUVTORGB(v22, v21, v20) 200 MEMACCESS(3) 201 "ld1 {v23.8b}, [%3], #8 \n" 202 "subs %w5, %w5, #8 \n" 203 MEMACCESS(4) 204 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" 205 "b.gt 1b \n" 206 : "+r"(src_y), // %0 207 
"+r"(src_u), // %1 208 "+r"(src_v), // %2 209 "+r"(src_a), // %3 210 "+r"(dst_argb), // %4 211 "+r"(width) // %5 212 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 213 [kUVToG]"r"(&yuvconstants->kUVToG), 214 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 215 [kYToRgb]"r"(&yuvconstants->kYToRgb) 216 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 217 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 218 ); 219} 220 221void I422ToRGBARow_NEON(const uint8* src_y, 222 const uint8* src_u, 223 const uint8* src_v, 224 uint8* dst_rgba, 225 const struct YuvConstants* yuvconstants, 226 int width) { 227 asm volatile ( 228 YUVTORGB_SETUP 229 "movi v20.8b, #255 \n" /* A */ 230 "1: \n" 231 READYUV422 232 YUVTORGB(v23, v22, v21) 233 "subs %w4, %w4, #8 \n" 234 MEMACCESS(3) 235 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 236 "b.gt 1b \n" 237 : "+r"(src_y), // %0 238 "+r"(src_u), // %1 239 "+r"(src_v), // %2 240 "+r"(dst_rgba), // %3 241 "+r"(width) // %4 242 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 243 [kUVToG]"r"(&yuvconstants->kUVToG), 244 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 245 [kYToRgb]"r"(&yuvconstants->kYToRgb) 246 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 247 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 248 ); 249} 250 251void I422ToRGB24Row_NEON(const uint8* src_y, 252 const uint8* src_u, 253 const uint8* src_v, 254 uint8* dst_rgb24, 255 const struct YuvConstants* yuvconstants, 256 int width) { 257 asm volatile ( 258 YUVTORGB_SETUP 259 "1: \n" 260 READYUV422 261 YUVTORGB(v22, v21, v20) 262 "subs %w4, %w4, #8 \n" 263 MEMACCESS(3) 264 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" 265 "b.gt 1b \n" 266 : "+r"(src_y), // %0 267 "+r"(src_u), // %1 268 "+r"(src_v), // %2 269 "+r"(dst_rgb24), // %3 270 "+r"(width) // %4 271 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 272 [kUVToG]"r"(&yuvconstants->kUVToG), 273 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 274 
[kYToRgb]"r"(&yuvconstants->kYToRgb) 275 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 276 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 277 ); 278} 279 280#define ARGBTORGB565 \ 281 "shll v0.8h, v22.8b, #8 \n" /* R */ \ 282 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 283 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 284 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ 285 "sri v0.8h, v20.8h, #11 \n" /* RGB */ 286 287void I422ToRGB565Row_NEON(const uint8* src_y, 288 const uint8* src_u, 289 const uint8* src_v, 290 uint8* dst_rgb565, 291 const struct YuvConstants* yuvconstants, 292 int width) { 293 asm volatile ( 294 YUVTORGB_SETUP 295 "1: \n" 296 READYUV422 297 YUVTORGB(v22, v21, v20) 298 "subs %w4, %w4, #8 \n" 299 ARGBTORGB565 300 MEMACCESS(3) 301 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 302 "b.gt 1b \n" 303 : "+r"(src_y), // %0 304 "+r"(src_u), // %1 305 "+r"(src_v), // %2 306 "+r"(dst_rgb565), // %3 307 "+r"(width) // %4 308 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 309 [kUVToG]"r"(&yuvconstants->kUVToG), 310 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 311 [kYToRgb]"r"(&yuvconstants->kYToRgb) 312 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 313 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 314 ); 315} 316 317#define ARGBTOARGB1555 \ 318 "shll v0.8h, v23.8b, #8 \n" /* A */ \ 319 "shll v22.8h, v22.8b, #8 \n" /* R */ \ 320 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 321 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 322 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ 323 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ 324 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ 325 326void I422ToARGB1555Row_NEON(const uint8* src_y, 327 const uint8* src_u, 328 const uint8* src_v, 329 uint8* dst_argb1555, 330 const struct YuvConstants* yuvconstants, 331 int width) { 332 asm volatile ( 333 YUVTORGB_SETUP 334 "movi v23.8b, #255 \n" 335 "1: \n" 336 READYUV422 337 YUVTORGB(v22, v21, v20) 338 "subs %w4, %w4, #8 \n" 339 ARGBTOARGB1555 
340 MEMACCESS(3) 341 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 342 "b.gt 1b \n" 343 : "+r"(src_y), // %0 344 "+r"(src_u), // %1 345 "+r"(src_v), // %2 346 "+r"(dst_argb1555), // %3 347 "+r"(width) // %4 348 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 349 [kUVToG]"r"(&yuvconstants->kUVToG), 350 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 351 [kYToRgb]"r"(&yuvconstants->kYToRgb) 352 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 353 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 354 ); 355} 356 357#define ARGBTOARGB4444 \ 358 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ 359 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ 360 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ 361 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ 362 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ 363 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ 364 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ 365 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ 366 367void I422ToARGB4444Row_NEON(const uint8* src_y, 368 const uint8* src_u, 369 const uint8* src_v, 370 uint8* dst_argb4444, 371 const struct YuvConstants* yuvconstants, 372 int width) { 373 asm volatile ( 374 YUVTORGB_SETUP 375 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 376 "1: \n" 377 READYUV422 378 YUVTORGB(v22, v21, v20) 379 "subs %w4, %w4, #8 \n" 380 "movi v23.8b, #255 \n" 381 ARGBTOARGB4444 382 MEMACCESS(3) 383 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. 
384 "b.gt 1b \n" 385 : "+r"(src_y), // %0 386 "+r"(src_u), // %1 387 "+r"(src_v), // %2 388 "+r"(dst_argb4444), // %3 389 "+r"(width) // %4 390 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 391 [kUVToG]"r"(&yuvconstants->kUVToG), 392 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 393 [kYToRgb]"r"(&yuvconstants->kYToRgb) 394 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 395 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 396 ); 397} 398 399void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { 400 asm volatile ( 401 YUVTORGB_SETUP 402 "movi v23.8b, #255 \n" 403 "1: \n" 404 READYUV400 405 YUVTORGB(v22, v21, v20) 406 "subs %w2, %w2, #8 \n" 407 MEMACCESS(1) 408 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 409 "b.gt 1b \n" 410 : "+r"(src_y), // %0 411 "+r"(dst_argb), // %1 412 "+r"(width) // %2 413 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), 414 [kUVToG]"r"(&kYuvI601Constants.kUVToG), 415 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), 416 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) 417 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 418 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 419 ); 420} 421 422void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { 423 asm volatile ( 424 "movi v23.8b, #255 \n" 425 "1: \n" 426 MEMACCESS(0) 427 "ld1 {v20.8b}, [%0], #8 \n" 428 "orr v21.8b, v20.8b, v20.8b \n" 429 "orr v22.8b, v20.8b, v20.8b \n" 430 "subs %w2, %w2, #8 \n" 431 MEMACCESS(1) 432 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 433 "b.gt 1b \n" 434 : "+r"(src_y), // %0 435 "+r"(dst_argb), // %1 436 "+r"(width) // %2 437 : 438 : "cc", "memory", "v20", "v21", "v22", "v23" 439 ); 440} 441 442void NV12ToARGBRow_NEON(const uint8* src_y, 443 const uint8* src_uv, 444 uint8* dst_argb, 445 const struct YuvConstants* yuvconstants, 446 int width) { 447 asm volatile ( 448 YUVTORGB_SETUP 449 "movi v23.8b, #255 \n" 450 "1: \n" 451 READNV12 452 YUVTORGB(v22, 
v21, v20) 453 "subs %w3, %w3, #8 \n" 454 MEMACCESS(2) 455 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 456 "b.gt 1b \n" 457 : "+r"(src_y), // %0 458 "+r"(src_uv), // %1 459 "+r"(dst_argb), // %2 460 "+r"(width) // %3 461 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 462 [kUVToG]"r"(&yuvconstants->kUVToG), 463 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 464 [kYToRgb]"r"(&yuvconstants->kYToRgb) 465 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 466 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 467 ); 468} 469 470void NV21ToARGBRow_NEON(const uint8* src_y, 471 const uint8* src_vu, 472 uint8* dst_argb, 473 const struct YuvConstants* yuvconstants, 474 int width) { 475 asm volatile ( 476 YUVTORGB_SETUP 477 "movi v23.8b, #255 \n" 478 "1: \n" 479 READNV21 480 YUVTORGB(v22, v21, v20) 481 "subs %w3, %w3, #8 \n" 482 MEMACCESS(2) 483 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 484 "b.gt 1b \n" 485 : "+r"(src_y), // %0 486 "+r"(src_vu), // %1 487 "+r"(dst_argb), // %2 488 "+r"(width) // %3 489 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 490 [kUVToG]"r"(&yuvconstants->kUVToG), 491 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 492 [kYToRgb]"r"(&yuvconstants->kYToRgb) 493 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 494 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 495 ); 496} 497 498void NV12ToRGB565Row_NEON(const uint8* src_y, 499 const uint8* src_uv, 500 uint8* dst_rgb565, 501 const struct YuvConstants* yuvconstants, 502 int width) { 503 asm volatile ( 504 YUVTORGB_SETUP 505 "1: \n" 506 READNV12 507 YUVTORGB(v22, v21, v20) 508 "subs %w3, %w3, #8 \n" 509 ARGBTORGB565 510 MEMACCESS(2) 511 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
512 "b.gt 1b \n" 513 : "+r"(src_y), // %0 514 "+r"(src_uv), // %1 515 "+r"(dst_rgb565), // %2 516 "+r"(width) // %3 517 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 518 [kUVToG]"r"(&yuvconstants->kUVToG), 519 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 520 [kYToRgb]"r"(&yuvconstants->kYToRgb) 521 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 522 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 523 ); 524} 525 526void YUY2ToARGBRow_NEON(const uint8* src_yuy2, 527 uint8* dst_argb, 528 const struct YuvConstants* yuvconstants, 529 int width) { 530 asm volatile ( 531 YUVTORGB_SETUP 532 "movi v23.8b, #255 \n" 533 "1: \n" 534 READYUY2 535 YUVTORGB(v22, v21, v20) 536 "subs %w2, %w2, #8 \n" 537 MEMACCESS(1) 538 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 539 "b.gt 1b \n" 540 : "+r"(src_yuy2), // %0 541 "+r"(dst_argb), // %1 542 "+r"(width) // %2 543 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 544 [kUVToG]"r"(&yuvconstants->kUVToG), 545 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 546 [kYToRgb]"r"(&yuvconstants->kYToRgb) 547 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 548 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 549 ); 550} 551 552void UYVYToARGBRow_NEON(const uint8* src_uyvy, 553 uint8* dst_argb, 554 const struct YuvConstants* yuvconstants, 555 int width) { 556 asm volatile ( 557 YUVTORGB_SETUP 558 "movi v23.8b, #255 \n" 559 "1: \n" 560 READUYVY 561 YUVTORGB(v22, v21, v20) 562 "subs %w2, %w2, #8 \n" 563 MEMACCESS(1) 564 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" 565 "b.gt 1b \n" 566 : "+r"(src_uyvy), // %0 567 "+r"(dst_argb), // %1 568 "+r"(width) // %2 569 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 570 [kUVToG]"r"(&yuvconstants->kUVToG), 571 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 572 [kYToRgb]"r"(&yuvconstants->kYToRgb) 573 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 574 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 
575 ); 576} 577 578// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 579void SplitUVRow_NEON(const uint8* src_uv, 580 uint8* dst_u, 581 uint8* dst_v, 582 int width) { 583 asm volatile ( 584 "1: \n" 585 MEMACCESS(0) 586 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 587 "subs %w3, %w3, #16 \n" // 16 processed per loop 588 MEMACCESS(1) 589 "st1 {v0.16b}, [%1], #16 \n" // store U 590 MEMACCESS(2) 591 "st1 {v1.16b}, [%2], #16 \n" // store V 592 "b.gt 1b \n" 593 : "+r"(src_uv), // %0 594 "+r"(dst_u), // %1 595 "+r"(dst_v), // %2 596 "+r"(width) // %3 // Output registers 597 : // Input registers 598 : "cc", "memory", "v0", "v1" // Clobber List 599 ); 600} 601 602// Reads 16 U's and V's and writes out 16 pairs of UV. 603void MergeUVRow_NEON(const uint8* src_u, 604 const uint8* src_v, 605 uint8* dst_uv, 606 int width) { 607 asm volatile ( 608 "1: \n" 609 MEMACCESS(0) 610 "ld1 {v0.16b}, [%0], #16 \n" // load U 611 MEMACCESS(1) 612 "ld1 {v1.16b}, [%1], #16 \n" // load V 613 "subs %w3, %w3, #16 \n" // 16 processed per loop 614 MEMACCESS(2) 615 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV 616 "b.gt 1b \n" 617 : 618 "+r"(src_u), // %0 619 "+r"(src_v), // %1 620 "+r"(dst_uv), // %2 621 "+r"(width) // %3 // Output registers 622 : // Input registers 623 : "cc", "memory", "v0", "v1" // Clobber List 624 ); 625} 626 627// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 
628void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 629 asm volatile ( 630 "1: \n" 631 MEMACCESS(0) 632 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 633 "subs %w2, %w2, #32 \n" // 32 processed per loop 634 MEMACCESS(1) 635 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 636 "b.gt 1b \n" 637 : "+r"(src), // %0 638 "+r"(dst), // %1 639 "+r"(count) // %2 // Output registers 640 : // Input registers 641 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 642 ); 643} 644 645// SetRow writes 'count' bytes using an 8 bit value repeated. 646void SetRow_NEON(uint8* dst, uint8 v8, int count) { 647 asm volatile ( 648 "dup v0.16b, %w2 \n" // duplicate 16 bytes 649 "1: \n" 650 "subs %w1, %w1, #16 \n" // 16 bytes per loop 651 MEMACCESS(0) 652 "st1 {v0.16b}, [%0], #16 \n" // store 653 "b.gt 1b \n" 654 : "+r"(dst), // %0 655 "+r"(count) // %1 656 : "r"(v8) // %2 657 : "cc", "memory", "v0" 658 ); 659} 660 661void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { 662 asm volatile ( 663 "dup v0.4s, %w2 \n" // duplicate 4 ints 664 "1: \n" 665 "subs %w1, %w1, #4 \n" // 4 ints per loop 666 MEMACCESS(0) 667 "st1 {v0.16b}, [%0], #16 \n" // store 668 "b.gt 1b \n" 669 : "+r"(dst), // %0 670 "+r"(count) // %1 671 : "r"(v32) // %2 672 : "cc", "memory", "v0" 673 ); 674} 675 676void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 677 asm volatile ( 678 // Start at end of source row. 679 "add %0, %0, %w2, sxtw \n" 680 "sub %0, %0, #16 \n" 681 "1: \n" 682 MEMACCESS(0) 683 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 684 "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
685 "rev64 v0.16b, v0.16b \n" 686 MEMACCESS(1) 687 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 688 MEMACCESS(1) 689 "st1 {v0.D}[0], [%1], #8 \n" 690 "b.gt 1b \n" 691 : "+r"(src), // %0 692 "+r"(dst), // %1 693 "+r"(width) // %2 694 : "r"((ptrdiff_t)-16) // %3 695 : "cc", "memory", "v0" 696 ); 697} 698 699void MirrorUVRow_NEON(const uint8* src_uv, 700 uint8* dst_u, 701 uint8* dst_v, 702 int width) { 703 asm volatile ( 704 // Start at end of source row. 705 "add %0, %0, %w3, sxtw #1 \n" 706 "sub %0, %0, #16 \n" 707 "1: \n" 708 MEMACCESS(0) 709 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 710 "subs %w3, %w3, #8 \n" // 8 pixels per loop. 711 "rev64 v0.8b, v0.8b \n" 712 "rev64 v1.8b, v1.8b \n" 713 MEMACCESS(1) 714 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 715 MEMACCESS(2) 716 "st1 {v1.8b}, [%2], #8 \n" 717 "b.gt 1b \n" 718 : "+r"(src_uv), // %0 719 "+r"(dst_u), // %1 720 "+r"(dst_v), // %2 721 "+r"(width) // %3 722 : "r"((ptrdiff_t)-16) // %4 723 : "cc", "memory", "v0", "v1" 724 ); 725} 726 727void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 728 asm volatile ( 729 // Start at end of source row. 730 "add %0, %0, %w2, sxtw #2 \n" 731 "sub %0, %0, #16 \n" 732 "1: \n" 733 MEMACCESS(0) 734 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 735 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 736 "rev64 v0.4s, v0.4s \n" 737 MEMACCESS(1) 738 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 739 MEMACCESS(1) 740 "st1 {v0.D}[0], [%1], #8 \n" 741 "b.gt 1b \n" 742 : "+r"(src), // %0 743 "+r"(dst), // %1 744 "+r"(width) // %2 745 : "r"((ptrdiff_t)-16) // %3 746 : "cc", "memory", "v0" 747 ); 748} 749 750void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { 751 asm volatile ( 752 "movi v4.8b, #255 \n" // Alpha 753 "1: \n" 754 MEMACCESS(0) 755 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 756 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
757 MEMACCESS(1) 758 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels 759 "b.gt 1b \n" 760 : "+r"(src_rgb24), // %0 761 "+r"(dst_argb), // %1 762 "+r"(width) // %2 763 : 764 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 765 ); 766} 767 768void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { 769 asm volatile ( 770 "movi v5.8b, #255 \n" // Alpha 771 "1: \n" 772 MEMACCESS(0) 773 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 774 "subs %w2, %w2, #8 \n" // 8 processed per loop. 775 "orr v3.8b, v1.8b, v1.8b \n" // move g 776 "orr v4.8b, v0.8b, v0.8b \n" // move r 777 MEMACCESS(1) 778 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a 779 "b.gt 1b \n" 780 : "+r"(src_raw), // %0 781 "+r"(dst_argb), // %1 782 "+r"(width) // %2 783 : 784 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 785 ); 786} 787 788void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 789 asm volatile ( 790 "1: \n" 791 MEMACCESS(0) 792 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 793 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
794 "orr v3.8b, v1.8b, v1.8b \n" // move g 795 "orr v4.8b, v0.8b, v0.8b \n" // move r 796 MEMACCESS(1) 797 "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r 798 "b.gt 1b \n" 799 : "+r"(src_raw), // %0 800 "+r"(dst_rgb24), // %1 801 "+r"(width) // %2 802 : 803 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 804 ); 805} 806 807#define RGB565TOARGB \ 808 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ 809 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ 810 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ 811 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ 812 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 813 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ 814 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ 815 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ 816 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ 817 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ 818 "dup v2.2D, v0.D[1] \n" /* R */ 819 820void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { 821 asm volatile ( 822 "movi v3.8b, #255 \n" // Alpha 823 "1: \n" 824 MEMACCESS(0) 825 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 826 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
827 RGB565TOARGB 828 MEMACCESS(1) 829 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 830 "b.gt 1b \n" 831 : "+r"(src_rgb565), // %0 832 "+r"(dst_argb), // %1 833 "+r"(width) // %2 834 : 835 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List 836 ); 837} 838 839#define ARGB1555TOARGB \ 840 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 841 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 842 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ 843 \ 844 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ 845 "xtn2 v3.16b, v2.8h \n" \ 846 \ 847 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 848 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 849 \ 850 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ 851 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 852 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 853 \ 854 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 855 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ 856 "dup v1.2D, v0.D[1] \n" \ 857 "dup v3.2D, v2.D[1] \n" 858 859// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 860#define RGB555TOARGB \ 861 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 862 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 863 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ 864 \ 865 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 866 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 867 \ 868 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ 869 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 870 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 871 \ 872 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 873 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ 874 "dup v1.2D, v0.D[1] \n" /* G */ 875 876void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, 877 uint8* dst_argb, 878 int width) { 879 asm volatile ( 880 "movi v3.8b, #255 \n" // Alpha 881 "1: \n" 882 MEMACCESS(0) 883 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 884 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
885 ARGB1555TOARGB 886 MEMACCESS(1) 887 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 888 "b.gt 1b \n" 889 : "+r"(src_argb1555), // %0 890 "+r"(dst_argb), // %1 891 "+r"(width) // %2 892 : 893 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 894 ); 895} 896 897#define ARGB4444TOARGB \ 898 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ 899 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ 900 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ 901 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ 902 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ 903 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ 904 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ 905 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ 906 "dup v0.2D, v2.D[1] \n" \ 907 "dup v1.2D, v3.D[1] \n" 908 909void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, 910 uint8* dst_argb, 911 int width) { 912 asm volatile ( 913 "1: \n" 914 MEMACCESS(0) 915 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 916 "subs %w2, %w2, #8 \n" // 8 processed per loop. 917 ARGB4444TOARGB 918 MEMACCESS(1) 919 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 920 "b.gt 1b \n" 921 : "+r"(src_argb4444), // %0 922 "+r"(dst_argb), // %1 923 "+r"(width) // %2 924 : 925 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 926 ); 927} 928 929void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { 930 asm volatile ( 931 "1: \n" 932 MEMACCESS(0) 933 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels 934 "subs %w2, %w2, #8 \n" // 8 processed per loop. 935 MEMACCESS(1) 936 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. 
937 "b.gt 1b \n" 938 : "+r"(src_argb), // %0 939 "+r"(dst_rgb24), // %1 940 "+r"(width) // %2 941 : 942 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 943 ); 944} 945 946void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { 947 asm volatile ( 948 "1: \n" 949 MEMACCESS(0) 950 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 951 "subs %w2, %w2, #8 \n" // 8 processed per loop. 952 "orr v4.8b, v2.8b, v2.8b \n" // mov g 953 "orr v5.8b, v1.8b, v1.8b \n" // mov b 954 MEMACCESS(1) 955 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b 956 "b.gt 1b \n" 957 : "+r"(src_argb), // %0 958 "+r"(dst_raw), // %1 959 "+r"(width) // %2 960 : 961 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 962 ); 963} 964 965void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { 966 asm volatile ( 967 "1: \n" 968 MEMACCESS(0) 969 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 970 "subs %w2, %w2, #16 \n" // 16 processed per loop. 971 MEMACCESS(1) 972 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 973 "b.gt 1b \n" 974 : "+r"(src_yuy2), // %0 975 "+r"(dst_y), // %1 976 "+r"(width) // %2 977 : 978 : "cc", "memory", "v0", "v1" // Clobber List 979 ); 980} 981 982void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { 983 asm volatile ( 984 "1: \n" 985 MEMACCESS(0) 986 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 987 "subs %w2, %w2, #16 \n" // 16 processed per loop. 988 MEMACCESS(1) 989 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 
990 "b.gt 1b \n" 991 : "+r"(src_uyvy), // %0 992 "+r"(dst_y), // %1 993 "+r"(width) // %2 994 : 995 : "cc", "memory", "v0", "v1" // Clobber List 996 ); 997} 998 999void YUY2ToUV422Row_NEON(const uint8* src_yuy2, 1000 uint8* dst_u, 1001 uint8* dst_v, 1002 int width) { 1003 asm volatile ( 1004 "1: \n" 1005 MEMACCESS(0) 1006 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels 1007 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1008 MEMACCESS(1) 1009 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 1010 MEMACCESS(2) 1011 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 1012 "b.gt 1b \n" 1013 : "+r"(src_yuy2), // %0 1014 "+r"(dst_u), // %1 1015 "+r"(dst_v), // %2 1016 "+r"(width) // %3 1017 : 1018 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1019 ); 1020} 1021 1022void UYVYToUV422Row_NEON(const uint8* src_uyvy, 1023 uint8* dst_u, 1024 uint8* dst_v, 1025 int width) { 1026 asm volatile ( 1027 "1: \n" 1028 MEMACCESS(0) 1029 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels 1030 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1031 MEMACCESS(1) 1032 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 1033 MEMACCESS(2) 1034 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 1035 "b.gt 1b \n" 1036 : "+r"(src_uyvy), // %0 1037 "+r"(dst_u), // %1 1038 "+r"(dst_v), // %2 1039 "+r"(width) // %3 1040 : 1041 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1042 ); 1043} 1044 1045void YUY2ToUVRow_NEON(const uint8* src_yuy2, 1046 int stride_yuy2, 1047 uint8* dst_u, 1048 uint8* dst_v, 1049 int width) { 1050 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; 1051 asm volatile ( 1052 "1: \n" 1053 MEMACCESS(0) 1054 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1055 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 
1056 MEMACCESS(1) 1057 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1058 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 1059 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 1060 MEMACCESS(2) 1061 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 1062 MEMACCESS(3) 1063 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 1064 "b.gt 1b \n" 1065 : "+r"(src_yuy2), // %0 1066 "+r"(src_yuy2b), // %1 1067 "+r"(dst_u), // %2 1068 "+r"(dst_v), // %3 1069 "+r"(width) // %4 1070 : 1071 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1072 "v5", "v6", "v7" // Clobber List 1073 ); 1074} 1075 1076void UYVYToUVRow_NEON(const uint8* src_uyvy, 1077 int stride_uyvy, 1078 uint8* dst_u, 1079 uint8* dst_v, 1080 int width) { 1081 const uint8* src_uyvyb = src_uyvy + stride_uyvy; 1082 asm volatile ( 1083 "1: \n" 1084 MEMACCESS(0) 1085 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1086 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1087 MEMACCESS(1) 1088 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1089 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1090 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1091 MEMACCESS(2) 1092 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1093 MEMACCESS(3) 1094 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1095 "b.gt 1b \n" 1096 : "+r"(src_uyvy), // %0 1097 "+r"(src_uyvyb), // %1 1098 "+r"(dst_u), // %2 1099 "+r"(dst_v), // %3 1100 "+r"(width) // %4 1101 : 1102 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1103 "v5", "v6", "v7" // Clobber List 1104 ); 1105} 1106 1107// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1108void ARGBShuffleRow_NEON(const uint8* src_argb, 1109 uint8* dst_argb, 1110 const uint8* shuffler, 1111 int width) { 1112 asm volatile ( 1113 MEMACCESS(3) 1114 "ld1 {v2.16b}, [%3] \n" // shuffler 1115 "1: \n" 1116 MEMACCESS(0) 1117 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 
1118 "subs %w2, %w2, #4 \n" // 4 processed per loop 1119 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1120 MEMACCESS(1) 1121 "st1 {v1.16b}, [%1], #16 \n" // store 4. 1122 "b.gt 1b \n" 1123 : "+r"(src_argb), // %0 1124 "+r"(dst_argb), // %1 1125 "+r"(width) // %2 1126 : "r"(shuffler) // %3 1127 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1128 ); 1129} 1130 1131void I422ToYUY2Row_NEON(const uint8* src_y, 1132 const uint8* src_u, 1133 const uint8* src_v, 1134 uint8* dst_yuy2, 1135 int width) { 1136 asm volatile ( 1137 "1: \n" 1138 MEMACCESS(0) 1139 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1140 "orr v2.8b, v1.8b, v1.8b \n" 1141 MEMACCESS(1) 1142 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us 1143 MEMACCESS(2) 1144 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs 1145 "subs %w4, %w4, #16 \n" // 16 pixels 1146 MEMACCESS(3) 1147 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1148 "b.gt 1b \n" 1149 : "+r"(src_y), // %0 1150 "+r"(src_u), // %1 1151 "+r"(src_v), // %2 1152 "+r"(dst_yuy2), // %3 1153 "+r"(width) // %4 1154 : 1155 : "cc", "memory", "v0", "v1", "v2", "v3" 1156 ); 1157} 1158 1159void I422ToUYVYRow_NEON(const uint8* src_y, 1160 const uint8* src_u, 1161 const uint8* src_v, 1162 uint8* dst_uyvy, 1163 int width) { 1164 asm volatile ( 1165 "1: \n" 1166 MEMACCESS(0) 1167 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys 1168 "orr v3.8b, v2.8b, v2.8b \n" 1169 MEMACCESS(1) 1170 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us 1171 MEMACCESS(2) 1172 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs 1173 "subs %w4, %w4, #16 \n" // 16 pixels 1174 MEMACCESS(3) 1175 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 
1176 "b.gt 1b \n" 1177 : "+r"(src_y), // %0 1178 "+r"(src_u), // %1 1179 "+r"(src_v), // %2 1180 "+r"(dst_uyvy), // %3 1181 "+r"(width) // %4 1182 : 1183 : "cc", "memory", "v0", "v1", "v2", "v3" 1184 ); 1185} 1186 1187void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { 1188 asm volatile ( 1189 "1: \n" 1190 MEMACCESS(0) 1191 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1192 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1193 ARGBTORGB565 1194 MEMACCESS(1) 1195 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. 1196 "b.gt 1b \n" 1197 : "+r"(src_argb), // %0 1198 "+r"(dst_rgb565), // %1 1199 "+r"(width) // %2 1200 : 1201 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1202 ); 1203} 1204 1205void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, 1206 uint8* dst_rgb, 1207 const uint32 dither4, 1208 int width) { 1209 asm volatile ( 1210 "dup v1.4s, %w2 \n" // dither4 1211 "1: \n" 1212 MEMACCESS(1) 1213 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels 1214 "subs %w3, %w3, #8 \n" // 8 processed per loop. 1215 "uqadd v20.8b, v20.8b, v1.8b \n" 1216 "uqadd v21.8b, v21.8b, v1.8b \n" 1217 "uqadd v22.8b, v22.8b, v1.8b \n" 1218 ARGBTORGB565 1219 MEMACCESS(0) 1220 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. 1221 "b.gt 1b \n" 1222 : "+r"(dst_rgb) // %0 1223 : "r"(src_argb), // %1 1224 "r"(dither4), // %2 1225 "r"(width) // %3 1226 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" 1227 ); 1228} 1229 1230void ARGBToARGB1555Row_NEON(const uint8* src_argb, 1231 uint8* dst_argb1555, 1232 int width) { 1233 asm volatile ( 1234 "1: \n" 1235 MEMACCESS(0) 1236 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1237 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1238 ARGBTOARGB1555 1239 MEMACCESS(1) 1240 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. 
1241 "b.gt 1b \n" 1242 : "+r"(src_argb), // %0 1243 "+r"(dst_argb1555), // %1 1244 "+r"(width) // %2 1245 : 1246 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1247 ); 1248} 1249 1250void ARGBToARGB4444Row_NEON(const uint8* src_argb, 1251 uint8* dst_argb4444, 1252 int width) { 1253 asm volatile ( 1254 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 1255 "1: \n" 1256 MEMACCESS(0) 1257 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1258 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1259 ARGBTOARGB4444 1260 MEMACCESS(1) 1261 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 1262 "b.gt 1b \n" 1263 : "+r"(src_argb), // %0 1264 "+r"(dst_argb4444), // %1 1265 "+r"(width) // %2 1266 : 1267 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" 1268 ); 1269} 1270 1271void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { 1272 asm volatile ( 1273 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1274 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1275 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1276 "movi v7.8b, #16 \n" // Add 16 constant 1277 "1: \n" 1278 MEMACCESS(0) 1279 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1280 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1281 "umull v3.8h, v0.8b, v4.8b \n" // B 1282 "umlal v3.8h, v1.8b, v5.8b \n" // G 1283 "umlal v3.8h, v2.8b, v6.8b \n" // R 1284 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1285 "uqadd v0.8b, v0.8b, v7.8b \n" 1286 MEMACCESS(1) 1287 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
1288 "b.gt 1b \n" 1289 : "+r"(src_argb), // %0 1290 "+r"(dst_y), // %1 1291 "+r"(width) // %2 1292 : 1293 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 1294 ); 1295} 1296 1297void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { 1298 asm volatile ( 1299 "1: \n" 1300 MEMACCESS(0) 1301 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels 1302 "subs %w2, %w2, #16 \n" // 16 processed per loop 1303 MEMACCESS(1) 1304 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. 1305 "b.gt 1b \n" 1306 : "+r"(src_argb), // %0 1307 "+r"(dst_a), // %1 1308 "+r"(width) // %2 1309 : 1310 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1311 ); 1312} 1313 1314void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { 1315 asm volatile ( 1316 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1317 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1318 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1319 "1: \n" 1320 MEMACCESS(0) 1321 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1322 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1323 "umull v3.8h, v0.8b, v4.8b \n" // B 1324 "umlal v3.8h, v1.8b, v5.8b \n" // G 1325 "umlal v3.8h, v2.8b, v6.8b \n" // R 1326 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1327 MEMACCESS(1) 1328 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1329 "b.gt 1b \n" 1330 : "+r"(src_argb), // %0 1331 "+r"(dst_y), // %1 1332 "+r"(width) // %2 1333 : 1334 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 1335 ); 1336} 1337 1338// 8x1 pixels. 
// Computes U and V for 8 ARGB pixels per iteration with no subsampling:
//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
// The 0x8080 bias (v29 viewed as 16-bit lanes) supplies the +128 offset plus
// rounding; uqshrn narrows with unsigned saturation.
void ARGBToUV444Row_NEON(const uint8* src_argb,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    "movi v24.8b, #112 \n"  // UB / VR 0.875 coefficient
    "movi v25.8b, #74 \n"  // UG -0.5781 coefficient
    "movi v26.8b, #38 \n"  // UR -0.2969 coefficient
    "movi v27.8b, #18 \n"  // VB -0.1406 coefficient
    "movi v28.8b, #94 \n"  // VG -0.7344 coefficient
    "movi v29.16b,#0x80 \n"  // 128.5 (0x8080 in 16-bit)
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "umull v4.8h, v0.8b, v24.8b \n"  // B
    "umlsl v4.8h, v1.8b, v25.8b \n"  // G
    "umlsl v4.8h, v2.8b, v26.8b \n"  // R
    "add v4.8h, v4.8h, v29.8h \n"  // +128 -> unsigned

    "umull v3.8h, v2.8b, v24.8b \n"  // R
    "umlsl v3.8h, v1.8b, v28.8b \n"  // G
    "umlsl v3.8h, v0.8b, v27.8b \n"  // B
    "add v3.8h, v3.8h, v29.8h \n"  // +128 -> unsigned

    "uqshrn v0.8b, v4.8h, #8 \n"  // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n"  // 16 bit to 8 bit V

    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels U.
    MEMACCESS(2)
    "st1 {v1.8b}, [%2], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(width)      // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
    "v24", "v25", "v26", "v27", "v28", "v29"
  );
}

// Loads the U/V coefficients used by RGBTOUV into v20-v24 and the 0x8080
// bias into v25.  The coefficients are half those of ARGBToUV444Row_NEON.
#define RGBTOUV_SETUP_REG \
  "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
  "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
  "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
  "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
  "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
  "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */

// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// QB, QG, QR name the 8h registers holding the B, G and R inputs.  Uses the
// constants in v20-v25, scratch registers v3 and v4, and leaves 8 U bytes in
// v0 and 8 V bytes in v1.
#define RGBTOUV(QB, QG, QR) \
  "mul v3.8h, " #QB \
  ",v20.8h \n" /* B */ \
  "mul v4.8h, " #QR \
  ",v20.8h \n" /* R */ \
  "mls v3.8h, " #QG \
  ",v21.8h \n" /* G */ \
  "mls v4.8h, " #QG \
  ",v24.8h \n" /* G */ \
  "mls v3.8h, " #QR \
  ",v22.8h \n" /* R */ \
  "mls v4.8h, " #QB \
  ",v23.8h \n" /* B */ \
  "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
  "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
  "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
  "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */

// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.

// Subsamples a 16x2 block of ARGB to 8 U and 8 V per iteration.  Horizontal
// pairs are summed (uaddlp), the second row is accumulated (uadalp) and the
// sum of 4 is halved with rounding, giving 2x the 2x2 average for RGBTOUV.
void ARGBToUVRow_NEON(const uint8* src_argb,
                      int src_stride_argb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  const uint8* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp v0.8h, v0.16b \n"  // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n"  // R 16 bytes -> 8 shorts.

    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
    "uadalp v0.8h, v4.16b \n"  // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v5.16b \n"  // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v6.16b \n"  // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n"  // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 wide x 2 rows = 32 processed per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_argb_1),  // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}

// TODO(fbarchard): Subsample match C code.
// Same structure as ARGBToUVRow_NEON but loads its own (J) coefficients into
// v20-v24 instead of using RGBTOUV_SETUP_REG.
void ARGBToUVJRow_NEON(const uint8* src_argb,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  const uint8* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
    "movi v20.8h, #63, lsl #0 \n"  // UB/VR coeff (0.500) / 2
    "movi v21.8h, #42, lsl #0 \n"  // UG coeff (-0.33126) / 2
    "movi v22.8h, #21, lsl #0 \n"  // UR coeff (-0.16874) / 2
    "movi v23.8h, #10, lsl #0 \n"  // VB coeff (-0.08131) / 2
    "movi v24.8h, #53, lsl #0 \n"  // VG coeff (-0.41869) / 2
    "movi v25.16b, #0x80 \n"  // 128.5 (0x8080 in 16-bit)
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp v0.8h, v0.16b \n"  // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
    "uadalp v0.8h, v4.16b \n"  // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v5.16b \n"  // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v6.16b \n"  // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n"  // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 wide x 2 rows = 32 processed per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_argb_1),  // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}

// BGRA variant of ARGBToUVRow_NEON: B, G, R are taken from de-interleaved
// lanes 3, 2, 1.  G is accumulated in v3 and moved to v1 by the urshr.
void BGRAToUVRow_NEON(const uint8* src_bgra,
                      int src_stride_bgra,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp v0.8h, v3.16b \n"  // B 16 bytes -> 8 shorts.
    "uaddlp v3.8h, v2.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v1.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
    "uadalp v0.8h, v7.16b \n"  // B 16 bytes -> 8 shorts.
    "uadalp v3.8h, v6.16b \n"  // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v5.16b \n"  // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n"  // 2x average
    "urshr v1.8h, v3.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 wide x 2 rows = 32 processed per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_bgra),    // %0
    "+r"(src_bgra_1),  // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}

// ABGR variant of ARGBToUVRow_NEON: B, G, R are taken from de-interleaved
// lanes 2, 1, 0 and end up in v0, v2, v1, hence the RGBTOUV argument order.
void ABGRToUVRow_NEON(const uint8* src_abgr,
                      int src_stride_abgr,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp v3.8h, v2.16b \n"  // B 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v0.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
    "uadalp v3.8h, v6.16b \n"  // B 16 bytes -> 8 shorts.
    "uadalp v2.8h, v5.16b \n"  // G 16 bytes -> 8 shorts.
    "uadalp v1.8h, v4.16b \n"  // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v3.8h, #1 \n"  // 2x average
    "urshr v2.8h, v2.8h, #1 \n"
    "urshr v1.8h, v1.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 wide x 2 rows = 32 processed per loop.
    RGBTOUV(v0.8h, v2.8h, v1.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_abgr),    // %0
    "+r"(src_abgr_1),  // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}

// RGBA variant of ARGBToUVRow_NEON: B, G, R are taken from de-interleaved
// lanes 1, 2, 3.
void RGBAToUVRow_NEON(const uint8* src_rgba,
                      int src_stride_rgba,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
    "uaddlp v0.8h, v1.16b \n"  // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v2.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v3.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
    "uadalp v0.8h, v5.16b \n"  // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v6.16b \n"  // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v7.16b \n"  // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n"  // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 wide x 2 rows = 32 processed per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgba),    // %0
    "+r"(src_rgba_1),  // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}

// RGB24 variant of ARGBToUVRow_NEON: 3 bytes per pixel, so each row is read
// with ld3 (48 bytes = 16 pixels) into B, G, R lanes.
void RGB24ToUVRow_NEON(const uint8* src_rgb24,
                       int src_stride_rgb24,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
    "uaddlp v0.8h, v0.16b \n"  // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
    "uadalp v0.8h, v4.16b \n"  // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v5.16b \n"  // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v6.16b \n"  // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n"  // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 wide x 2 rows = 32 processed per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgb24),    // %0
    "+r"(src_rgb24_1),  // %1
    "+r"(dst_u),        // %2
    "+r"(dst_v),        // %3
    "+r"(width)         // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}

// RAW variant of RGB24ToUVRow_NEON: lanes are treated as R, G, B (v0, v1, v2),
// hence the reversed RGBTOUV argument order.
void RAWToUVRow_NEON(const uint8* src_raw,
                     int src_stride_raw,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  const uint8* src_raw_1 = src_raw + src_stride_raw;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
    "uaddlp v2.8h, v2.16b \n"  // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
    "uaddlp v0.8h, v0.16b \n"  // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
    "uadalp v2.8h, v6.16b \n"  // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v5.16b \n"  // G 16 bytes -> 8 shorts.
    "uadalp v0.8h, v4.16b \n"  // R 16 bytes -> 8 shorts.

    "urshr v2.8h, v2.8h, #1 \n"  // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v0.8h, v0.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 wide x 2 rows = 32 processed per loop.
    RGBTOUV(v2.8h, v1.8h, v0.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_raw),    // %0
    "+r"(src_raw_1),  // %1
    "+r"(dst_u),      // %2
    "+r"(dst_v),      // %3
    "+r"(width)       // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}

// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Subsamples a 16x2 block of RGB565 to 8 U and 8 V per iteration.  Each row
// is read as two groups of 8 pixels, expanded with RGB565TOARGB (defined
// elsewhere in this file) and pair-summed into v16-v21.  The U/V constants
// live in v22-v27 here rather than the v20-v25 of RGBTOUV_SETUP_REG, because
// v20 and v21 are used as accumulators.
void RGB565ToUVRow_NEON(const uint8* src_rgb565,
                        int src_stride_rgb565,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
  asm volatile (
    "movi v22.8h, #56, lsl #0 \n"  // UB / VR coeff (0.875) / 2
    "movi v23.8h, #37, lsl #0 \n"  // UG coeff (-0.5781) / 2
    "movi v24.8h, #19, lsl #0 \n"  // UR coeff (-0.2969) / 2
    "movi v25.8h, #9 , lsl #0 \n"  // VB coeff (-0.1406) / 2
    "movi v26.8h, #47, lsl #0 \n"  // VG coeff (-0.7344) / 2
    "movi v27.16b, #0x80 \n"  // 128.5 (0x8080 in 16-bit)
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v20.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "uaddlp v17.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v19.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v21.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // load 8 RGB565 pixels.
    RGB565TOARGB
    "uadalp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uadalp v18.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uadalp v20.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // next 8 RGB565 pixels.
    RGB565TOARGB
    "uadalp v17.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uadalp v19.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uadalp v21.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

    // Merge the two 4-lane halves into full 8-lane B, G, R vectors.
    "ins v16.D[1], v17.D[0] \n"
    "ins v18.D[1], v19.D[0] \n"
    "ins v20.D[1], v21.D[0] \n"

    "urshr v4.8h, v16.8h, #1 \n"  // 2x average
    "urshr v5.8h, v18.8h, #1 \n"
    "urshr v6.8h, v20.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 processed per loop.
    "mul v16.8h, v4.8h, v22.8h \n"  // B
    "mls v16.8h, v5.8h, v23.8h \n"  // G
    "mls v16.8h, v6.8h, v24.8h \n"  // R
    "add v16.8h, v16.8h, v27.8h \n"  // +128 -> unsigned
    "mul v17.8h, v6.8h, v22.8h \n"  // R
    "mls v17.8h, v5.8h, v26.8h \n"  // G
    "mls v17.8h, v4.8h, v25.8h \n"  // B
    "add v17.8h, v17.8h, v27.8h \n"  // +128 -> unsigned
    "uqshrn v0.8b, v16.8h, #8 \n"  // 16 bit to 8 bit U
    "uqshrn v1.8b, v17.8h, #8 \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgb565),    // %0
    "+r"(src_rgb565_1),  // %1
    "+r"(dst_u),         // %2
    "+r"(dst_v),         // %3
    "+r"(width)          // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
    "v25", "v26", "v27"
  );
}

// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Same scheme as RGB565ToUVRow_NEON, but pixels are expanded with
// RGB555TOARGB (defined elsewhere in this file), the constants come from
// RGBTOUV_SETUP_REG (v20-v25) and the accumulators are v16-v18 / v26-v28.
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
                          int src_stride_argb1555,
                          uint8* dst_u,
                          uint8* dst_v,
                          int width) {
  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "uaddlp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "uadalp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uadalp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uadalp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "uadalp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uadalp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uadalp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

    // Merge the two 4-lane halves into full 8-lane B, G, R vectors.
    "ins v16.D[1], v26.D[0] \n"
    "ins v17.D[1], v27.D[0] \n"
    "ins v18.D[1], v28.D[0] \n"

    "urshr v4.8h, v16.8h, #1 \n"  // 2x average
    "urshr v5.8h, v17.8h, #1 \n"
    "urshr v6.8h, v18.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 processed per loop.
    "mul v2.8h, v4.8h, v20.8h \n"  // B
    "mls v2.8h, v5.8h, v21.8h \n"  // G
    "mls v2.8h, v6.8h, v22.8h \n"  // R
    "add v2.8h, v2.8h, v25.8h \n"  // +128 -> unsigned
    "mul v3.8h, v6.8h, v20.8h \n"  // R
    "mls v3.8h, v5.8h, v24.8h \n"  // G
    "mls v3.8h, v4.8h, v23.8h \n"  // B
    "add v3.8h, v3.8h, v25.8h \n"  // +128 -> unsigned
    "uqshrn v0.8b, v2.8h, #8 \n"  // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb1555),    // %0
    "+r"(src_argb1555_1),  // %1
    "+r"(dst_u),           // %2
    "+r"(dst_v),           // %3
    "+r"(width)            // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"
  );
}

// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Subsamples a 16x2 block of ARGB4444 to 8 U and 8 V per iteration.  Each row
// is read as two groups of 8 pixels, expanded with ARGB4444TOARGB (defined
// elsewhere in this file) and pair-summed into v16-v18 / v26-v28.  The U/V
// constants come from RGBTOUV_SETUP_REG (v20-v25).
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
                          int src_stride_argb4444,
                          uint8* dst_u,
                          uint8* dst_v,
                          int width) {
  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uadalp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uadalp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uadalp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uadalp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uadalp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uadalp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

    // Merge the two 4-lane halves into full 8-lane B, G, R vectors.
    "ins v16.D[1], v26.D[0] \n"
    "ins v17.D[1], v27.D[0] \n"
    "ins v18.D[1], v28.D[0] \n"

    "urshr v4.8h, v16.8h, #1 \n"  // 2x average
    "urshr v5.8h, v17.8h, #1 \n"
    "urshr v6.8h, v18.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 processed per loop.
    "mul v2.8h, v4.8h, v20.8h \n"  // B
    "mls v2.8h, v5.8h, v21.8h \n"  // G
    "mls v2.8h, v6.8h, v22.8h \n"  // R
    "add v2.8h, v2.8h, v25.8h \n"  // +128 -> unsigned
    "mul v3.8h, v6.8h, v20.8h \n"  // R
    "mls v3.8h, v5.8h, v24.8h \n"  // G
    "mls v3.8h, v4.8h, v23.8h \n"  // B
    "add v3.8h, v3.8h, v25.8h \n"  // +128 -> unsigned
    "uqshrn v0.8b, v2.8h, #8 \n"  // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb4444),    // %0
    "+r"(src_argb4444_1),  // %1
    "+r"(dst_u),           // %2
    "+r"(dst_v),           // %3
    "+r"(width)            // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"

  );
}

// Computes luma for 8 RGB565 pixels per iteration.  Pixels are expanded with
// RGB565TOARGB (defined elsewhere in this file), then
// Y = sat8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16, with a saturating add.
// Coefficients live in v24-v27; v4 and v6 are only listed as clobbers
// (presumably scratch for RGB565TOARGB -- confirm against the macro).
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
  asm volatile (
    "movi v24.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v25.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v26.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v27.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    RGB565TOARGB
    "umull v3.8h, v0.8b, v24.8b \n"  // B
    "umlal v3.8h, v1.8b, v25.8b \n"  // G
    "umlal v3.8h, v2.8b, v26.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v27.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_y),       // %1
    "+r"(width)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
    "v24", "v25", "v26", "v27"
  );
}

// Computes luma for 8 ARGB1555 pixels per iteration.  Pixels are expanded
// with ARGB1555TOARGB (defined elsewhere in this file), then the same
// 13/65/33 weighting, >> 7 and +16 as the other limited-range Y rows.
// NOTE(review): the coefficients are kept in v4-v7 across the macro; this
// relies on ARGB1555TOARGB not writing v4-v7 -- confirm against the macro.
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB1555TOARGB
    "umull v3.8h, v0.8b, v4.8b \n"  // B
    "umlal v3.8h, v1.8b, v5.8b \n"  // G
    "umlal v3.8h, v2.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_y),         // %1
    "+r"(width)          // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}

// Computes luma for 8 ARGB4444 pixels per iteration.  Pixels are expanded
// with ARGB4444TOARGB (defined elsewhere in this file); coefficients live in
// v24-v27.
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
  asm volatile (
    "movi v24.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v25.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v26.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v27.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB4444TOARGB
    "umull v3.8h, v0.8b, v24.8b \n"  // B
    "umlal v3.8h, v1.8b, v25.8b \n"  // G
    "umlal v3.8h, v2.8b, v26.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v27.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_y),         // %1
    "+r"(width)          // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
  );
}

// Computes luma for 8 BGRA pixels per iteration; R, G, B are taken from
// de-interleaved lanes 1, 2, 3 and lane 0 is ignored.
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v1.8b, v4.8b \n"  // R
    "umlal v16.8h, v2.8b, v5.8b \n"  // G
    "umlal v16.8h, v3.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}

// Computes luma for 8 ABGR pixels per iteration; R, G, B are taken from
// de-interleaved lanes 0, 1, 2 and lane 3 is ignored.
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // R
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}

// Computes luma for 8 RGBA pixels per iteration; B, G, R are taken from
// de-interleaved lanes 1, 2, 3 and lane 0 is ignored.
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v1.8b, v4.8b \n"  // B
    "umlal v16.8h, v2.8b, v5.8b \n"  // G
    "umlal v16.8h, v3.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}

// Computes luma for 8 RGB24 pixels per iteration; ld3 reads 24 bytes into
// B, G, R lanes (v0, v1, v2).
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // B
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_y),      // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}

// Computes luma for 8 RAW pixels per iteration.  Same as RGB24ToYRow_NEON
// with the R and B coefficients swapped, so lanes v0, v1, v2 are weighted as
// R, G, B.
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // R
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}

// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8* dst_ptr,
                         const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  asm volatile (
    "cmp %w4, #0 \n"
    "b.eq 100f \n"
    "cmp %w4, #128 \n"
    "b.eq 50f \n"

    "dup v5.16b, %w4 \n"
    "dup v4.16b, %w5 \n"
    // General purpose row blend.
2165 "1: \n" 2166 MEMACCESS(1) 2167 "ld1 {v0.16b}, [%1], #16 \n" 2168 MEMACCESS(2) 2169 "ld1 {v1.16b}, [%2], #16 \n" 2170 "subs %w3, %w3, #16 \n" 2171 "umull v2.8h, v0.8b, v4.8b \n" 2172 "umull2 v3.8h, v0.16b, v4.16b \n" 2173 "umlal v2.8h, v1.8b, v5.8b \n" 2174 "umlal2 v3.8h, v1.16b, v5.16b \n" 2175 "rshrn v0.8b, v2.8h, #8 \n" 2176 "rshrn2 v0.16b, v3.8h, #8 \n" 2177 MEMACCESS(0) 2178 "st1 {v0.16b}, [%0], #16 \n" 2179 "b.gt 1b \n" 2180 "b 99f \n" 2181 2182 // Blend 50 / 50. 2183 "50: \n" 2184 MEMACCESS(1) 2185 "ld1 {v0.16b}, [%1], #16 \n" 2186 MEMACCESS(2) 2187 "ld1 {v1.16b}, [%2], #16 \n" 2188 "subs %w3, %w3, #16 \n" 2189 "urhadd v0.16b, v0.16b, v1.16b \n" 2190 MEMACCESS(0) 2191 "st1 {v0.16b}, [%0], #16 \n" 2192 "b.gt 50b \n" 2193 "b 99f \n" 2194 2195 // Blend 100 / 0 - Copy row unchanged. 2196 "100: \n" 2197 MEMACCESS(1) 2198 "ld1 {v0.16b}, [%1], #16 \n" 2199 "subs %w3, %w3, #16 \n" 2200 MEMACCESS(0) 2201 "st1 {v0.16b}, [%0], #16 \n" 2202 "b.gt 100b \n" 2203 2204 "99: \n" 2205 : "+r"(dst_ptr), // %0 2206 "+r"(src_ptr), // %1 2207 "+r"(src_ptr1), // %2 2208 "+r"(dst_width), // %3 2209 "+r"(y1_fraction), // %4 2210 "+r"(y0_fraction) // %5 2211 : 2212 : "cc", "memory", "v0", "v1", "v3", "v4", "v5" 2213 ); 2214} 2215 2216// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr 2217void ARGBBlendRow_NEON(const uint8* src_argb0, 2218 const uint8* src_argb1, 2219 uint8* dst_argb, 2220 int width) { 2221 asm volatile ( 2222 "subs %w3, %w3, #8 \n" 2223 "b.lt 89f \n" 2224 // Blend 8 pixels. 2225 "8: \n" 2226 MEMACCESS(0) 2227 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels 2228 MEMACCESS(1) 2229 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels 2230 "subs %w3, %w3, #8 \n" // 8 processed per loop. 
2231 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2232 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2233 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2234 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2235 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2236 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2237 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2238 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2239 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2240 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2241 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2242 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2243 "movi v3.8b, #255 \n" // a = 255 2244 MEMACCESS(2) 2245 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2246 "b.ge 8b \n" 2247 2248 "89: \n" 2249 "adds %w3, %w3, #8-1 \n" 2250 "b.lt 99f \n" 2251 2252 // Blend 1 pixels. 2253 "1: \n" 2254 MEMACCESS(0) 2255 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. 2256 MEMACCESS(1) 2257 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. 2258 "subs %w3, %w3, #1 \n" // 1 processed per loop. 2259 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2260 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2261 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2262 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2263 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2264 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2265 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2266 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2267 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2268 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2269 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2270 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2271 "movi v3.8b, #255 \n" // a = 255 2272 MEMACCESS(2) 2273 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. 
2274 "b.ge 1b \n" 2275 2276 "99: \n" 2277 2278 : "+r"(src_argb0), // %0 2279 "+r"(src_argb1), // %1 2280 "+r"(dst_argb), // %2 2281 "+r"(width) // %3 2282 : 2283 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2284 "v16", "v17", "v18" 2285 ); 2286} 2287 2288// Attenuate 8 pixels at a time. 2289void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2290 asm volatile ( 2291 // Attenuate 8 pixels. 2292 "1: \n" 2293 MEMACCESS(0) 2294 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels 2295 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2296 "umull v4.8h, v0.8b, v3.8b \n" // b * a 2297 "umull v5.8h, v1.8b, v3.8b \n" // g * a 2298 "umull v6.8h, v2.8b, v3.8b \n" // r * a 2299 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 2300 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 2301 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 2302 MEMACCESS(1) 2303 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 2304 "b.gt 1b \n" 2305 : "+r"(src_argb), // %0 2306 "+r"(dst_argb), // %1 2307 "+r"(width) // %2 2308 : 2309 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 2310 ); 2311} 2312 2313// Quantize 8 ARGB pixels (32 bytes). 2314// dst = (dst * scale >> 16) * interval_size + interval_offset; 2315void ARGBQuantizeRow_NEON(uint8* dst_argb, 2316 int scale, 2317 int interval_size, 2318 int interval_offset, 2319 int width) { 2320 asm volatile ( 2321 "dup v4.8h, %w2 \n" 2322 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 2323 "dup v5.8h, %w3 \n" // interval multiply. 2324 "dup v6.8h, %w4 \n" // interval add 2325 2326 // 8 pixel loop. 2327 "1: \n" 2328 MEMACCESS(0) 2329 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. 2330 "subs %w1, %w1, #8 \n" // 8 processed per loop. 2331 "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) 2332 "uxtl v1.8h, v1.8b \n" 2333 "uxtl v2.8h, v2.8b \n" 2334 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale 2335 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g 2336 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r 2337 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size 2338 "mul v1.8h, v1.8h, v5.8h \n" // g 2339 "mul v2.8h, v2.8h, v5.8h \n" // r 2340 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset 2341 "add v1.8h, v1.8h, v6.8h \n" // g 2342 "add v2.8h, v2.8h, v6.8h \n" // r 2343 "uqxtn v0.8b, v0.8h \n" 2344 "uqxtn v1.8b, v1.8h \n" 2345 "uqxtn v2.8b, v2.8h \n" 2346 MEMACCESS(0) 2347 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels 2348 "b.gt 1b \n" 2349 : "+r"(dst_argb), // %0 2350 "+r"(width) // %1 2351 : "r"(scale), // %2 2352 "r"(interval_size), // %3 2353 "r"(interval_offset) // %4 2354 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 2355 ); 2356} 2357 2358// Shade 8 pixels at a time by specified value. 2359// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. 2360// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. 2361void ARGBShadeRow_NEON(const uint8* src_argb, 2362 uint8* dst_argb, 2363 int width, 2364 uint32 value) { 2365 asm volatile ( 2366 "dup v0.4s, %w3 \n" // duplicate scale value. 2367 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. 2368 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. 2369 2370 // 8 pixel loop. 2371 "1: \n" 2372 MEMACCESS(0) 2373 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2374 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2375 "uxtl v4.8h, v4.8b \n" // b (0 .. 
255) 2376 "uxtl v5.8h, v5.8b \n" 2377 "uxtl v6.8h, v6.8b \n" 2378 "uxtl v7.8h, v7.8b \n" 2379 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 2380 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g 2381 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r 2382 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a 2383 "uqxtn v4.8b, v4.8h \n" 2384 "uqxtn v5.8b, v5.8h \n" 2385 "uqxtn v6.8b, v6.8h \n" 2386 "uqxtn v7.8b, v7.8h \n" 2387 MEMACCESS(1) 2388 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels 2389 "b.gt 1b \n" 2390 : "+r"(src_argb), // %0 2391 "+r"(dst_argb), // %1 2392 "+r"(width) // %2 2393 : "r"(value) // %3 2394 : "cc", "memory", "v0", "v4", "v5", "v6", "v7" 2395 ); 2396} 2397 2398// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels 2399// Similar to ARGBToYJ but stores ARGB. 2400// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; 2401void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2402 asm volatile ( 2403 "movi v24.8b, #15 \n" // B * 0.11400 coefficient 2404 "movi v25.8b, #75 \n" // G * 0.58700 coefficient 2405 "movi v26.8b, #38 \n" // R * 0.29900 coefficient 2406 "1: \n" 2407 MEMACCESS(0) 2408 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2409 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2410 "umull v4.8h, v0.8b, v24.8b \n" // B 2411 "umlal v4.8h, v1.8b, v25.8b \n" // G 2412 "umlal v4.8h, v2.8b, v26.8b \n" // R 2413 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B 2414 "orr v1.8b, v0.8b, v0.8b \n" // G 2415 "orr v2.8b, v0.8b, v0.8b \n" // R 2416 MEMACCESS(1) 2417 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 2418 "b.gt 1b \n" 2419 : "+r"(src_argb), // %0 2420 "+r"(dst_argb), // %1 2421 "+r"(width) // %2 2422 : 2423 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" 2424 ); 2425} 2426 2427// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Operates in place on dst_argb: the load does not advance the pointer and
// the store does. Results are narrowed with unsigned saturation and no
// rounding; alpha (v3) is written back unchanged.

void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "movi v20.8b, #17 \n"  // BB coefficient
    "movi v21.8b, #68 \n"  // BG coefficient
    "movi v22.8b, #35 \n"  // BR coefficient
    "movi v24.8b, #22 \n"  // GB coefficient
    "movi v25.8b, #88 \n"  // GG coefficient
    "movi v26.8b, #45 \n"  // GR coefficient
    "movi v28.8b, #24 \n"  // RB coefficient
    "movi v29.8b, #98 \n"  // RG coefficient
    "movi v30.8b, #50 \n"  // RR coefficient
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
    "subs %w1, %w1, #8 \n"  // 8 processed per loop.
    "umull v4.8h, v0.8b, v20.8b \n"  // B to Sepia B
    "umlal v4.8h, v1.8b, v21.8b \n"  // G
    "umlal v4.8h, v2.8b, v22.8b \n"  // R
    "umull v5.8h, v0.8b, v24.8b \n"  // B to Sepia G
    "umlal v5.8h, v1.8b, v25.8b \n"  // G
    "umlal v5.8h, v2.8b, v26.8b \n"  // R
    "umull v6.8h, v0.8b, v28.8b \n"  // B to Sepia R
    "umlal v6.8h, v1.8b, v29.8b \n"  // G
    "umlal v6.8h, v2.8b, v30.8b \n"  // R
    "uqshrn v0.8b, v4.8h, #7 \n"  // 16 bit to 8 bit B
    "uqshrn v1.8b, v5.8h, #7 \n"  // 16 bit to 8 bit G
    "uqshrn v2.8b, v6.8h, #7 \n"  // 16 bit to 8 bit R
    MEMACCESS(0)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
  );
}

// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
// matrix_argb supplies 16 signed 8 bit coefficients, read as 4 groups of 4:
// group 0 produces B, group 1 G, group 2 R and group 3 A, and within each
// group the coefficients multiply the source B, G, R, A in that order.
// Products are 16 bit, summed with signed saturation, then shifted right by 6
// and narrowed with unsigned saturation.
void ARGBColorMatrixRow_NEON(const uint8* src_argb,
                             uint8* dst_argb,
                             const int8* matrix_argb,
                             int width) {
  asm volatile (
    MEMACCESS(3)
    "ld1 {v2.16b}, [%3] \n"  // load 16 matrix coefficients.
    "sxtl v0.8h, v2.8b \n"  // B,G coefficients s16.
    "sxtl2 v1.8h, v2.16b \n"  // R,A coefficients s16.

  "1: \n"
    MEMACCESS(0)
    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "uxtl v16.8h, v16.8b \n"  // b (0 .. 255) 16 bit
    "uxtl v17.8h, v17.8b \n"  // g
    "uxtl v18.8h, v18.8b \n"  // r
    "uxtl v19.8h, v19.8b \n"  // a
    "mul v22.8h, v16.8h, v0.h[0] \n"  // B = B * Matrix B
    "mul v23.8h, v16.8h, v0.h[4] \n"  // G = B * Matrix G
    "mul v24.8h, v16.8h, v1.h[0] \n"  // R = B * Matrix R
    "mul v25.8h, v16.8h, v1.h[4] \n"  // A = B * Matrix A
    "mul v4.8h, v17.8h, v0.h[1] \n"  // B += G * Matrix B
    "mul v5.8h, v17.8h, v0.h[5] \n"  // G += G * Matrix G
    "mul v6.8h, v17.8h, v1.h[1] \n"  // R += G * Matrix R
    "mul v7.8h, v17.8h, v1.h[5] \n"  // A += G * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "mul v4.8h, v18.8h, v0.h[2] \n"  // B += R * Matrix B
    "mul v5.8h, v18.8h, v0.h[6] \n"  // G += R * Matrix G
    "mul v6.8h, v18.8h, v1.h[2] \n"  // R += R * Matrix R
    "mul v7.8h, v18.8h, v1.h[6] \n"  // A += R * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "mul v4.8h, v19.8h, v0.h[3] \n"  // B += A * Matrix B
    "mul v5.8h, v19.8h, v0.h[7] \n"  // G += A * Matrix G
    "mul v6.8h, v19.8h, v1.h[3] \n"  // R += A * Matrix R
    "mul v7.8h, v19.8h, v1.h[7] \n"  // A += A * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "sqshrun v16.8b, v22.8h, #6 \n"  // 16 bit to 8 bit B
    "sqshrun v17.8b, v23.8h, #6 \n"  // 16 bit to 8 bit G
    "sqshrun v18.8b, v24.8h, #6 \n"  // 16 bit to 8 bit R
    "sqshrun v19.8b, v25.8h, #6 \n"  // 16 bit to 8 bit A
    MEMACCESS(1)
    "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "r"(matrix_argb)  // %3
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v22", "v23", "v24", "v25"
  );
}

// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Each channel, alpha included, becomes (a * b + 128) >> 8.
void ARGBMultiplyRow_NEON(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "umull v0.8h, v0.8b, v4.8b \n"  // multiply B
    "umull v1.8h, v1.8b, v5.8b \n"  // multiply G
    "umull v2.8h, v2.8b, v6.8b \n"  // multiply R
    "umull v3.8h, v3.8b, v7.8b \n"  // multiply A
    "rshrn v0.8b, v0.8h, #8 \n"  // 16 bit to 8 bit B
    "rshrn v1.8b, v1.8h, #8 \n"  // 16 bit to 8 bit G
    "rshrn v2.8b, v2.8h, #8 \n"  // 16 bit to 8 bit R
    "rshrn v3.8b, v3.8h, #8 \n"  // 16 bit to 8 bit A
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}

// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Every channel, alpha included, uses an unsigned saturating add.
void ARGBAddRow_NEON(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v0.8b, v0.8b, v4.8b \n"
    "uqadd v1.8b, v1.8b, v5.8b \n"
    "uqadd v2.8b, v2.8b, v6.8b \n"
    "uqadd v3.8b, v3.8b, v7.8b \n"
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}

// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Computes src_argb0 - src_argb1 per channel, alpha included, with unsigned
// saturation (results below 0 clamp to 0).
void ARGBSubtractRow_NEON(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqsub v0.8b, v0.8b, v4.8b \n"
    "uqsub v1.8b, v1.8b, v5.8b \n"
    "uqsub v2.8b, v2.8b, v6.8b \n"
    "uqsub v3.8b, v3.8b, v7.8b \n"
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}

// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// The sum is an unsigned saturating add, 8 pixels per iteration.
void SobelRow_NEON(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
                   int width) {
  asm volatile (
    "movi v3.8b, #255 \n"  // alpha
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0], #8 \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1], #8 \n"  // load 8 sobely.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v0.8b, v0.8b, v1.8b \n"  // add
    "orr v1.8b, v0.8b, v0.8b \n"  // copy sum to G
    "orr v2.8b, v0.8b, v0.8b \n"  // copy sum to R
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}

// Adds Sobel X and Sobel Y and stores Sobel into plane.
// Uses an unsigned saturating add, 16 pixels per iteration.
void SobelToPlaneRow_NEON(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_y,
                          int width) {
  asm volatile (
    // 16 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 16 sobelx.
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"  // load 16 sobely.
    "subs %w3, %w3, #16 \n"  // 16 processed per loop.
    "uqadd v0.16b, v0.16b, v1.16b \n"  // add
    MEMACCESS(2)
    "st1 {v0.16b}, [%2], #16 \n"  // store 16 pixels.
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1"
  );
}

// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Sobel X is loaded into v2 and Sobel Y into v0 so that the 4 way
// interleaved store emits them in B, G, R, A order.
void SobelXYRow_NEON(const uint8* src_sobelx,
                     const uint8* src_sobely,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    "movi v3.8b, #255 \n"  // alpha
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0], #8 \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1 {v0.8b}, [%1], #8 \n"  // load 8 sobely.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v1.8b, v0.8b, v2.8b \n"  // add
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}

// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
// For each of the three rows the bytes at offsets i and i + 2 are loaded and
// subtracted (i minus i + 2); the centre row difference is added twice. The
// absolute value of the sum is stored with unsigned saturation. Each row
// pointer advances by 2 then 6 bytes, 8 in total per iteration, so every row
// is read 2 bytes beyond the 8 pixels being produced.
void SobelXRow_NEON(const uint8* src_y0,
                    const uint8* src_y1,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0],%5 \n"  // top
    MEMACCESS(0)
    "ld1 {v1.8b}, [%0],%6 \n"
    "usubl v0.8h, v0.8b, v1.8b \n"
    MEMACCESS(1)
    "ld1 {v2.8b}, [%1],%5 \n"  // center * 2
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%6 \n"
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "add v0.8h, v0.8h, v1.8h \n"
    MEMACCESS(2)
    "ld1 {v2.8b}, [%2],%5 \n"  // bottom
    MEMACCESS(2)
    "ld1 {v3.8b}, [%2],%6 \n"
    "subs %w4, %w4, #8 \n"  // 8 pixels
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "abs v0.8h, v0.8h \n"
    "uqxtn v0.8b, v0.8h \n"
    MEMACCESS(3)
    "st1 {v0.8b}, [%3], #8 \n"  // store 8 sobelx
    "b.gt 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  : "r"(2LL),          // %5
    "r"(6LL)           // %6
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
// For byte offsets i, i + 1 and i + 2 the difference src_y0 - src_y1 is
// formed; the i + 1 difference is added twice. The absolute value of the sum
// is stored with unsigned saturation. Both row pointers advance by 1, 1 and
// 6 bytes, 8 in total per iteration, so each row is read 2 bytes beyond the
// 8 pixels being produced.
void SobelYRow_NEON(const uint8* src_y0,
                    const uint8* src_y1,
                    uint8* dst_sobely,
                    int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0],%4 \n"  // left
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1],%4 \n"
    "usubl v0.8h, v0.8b, v1.8b \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0],%4 \n"  // center * 2
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%4 \n"
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "add v0.8h, v0.8h, v1.8h \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0],%5 \n"  // right
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%5 \n"
    "subs %w3, %w3, #8 \n"  // 8 pixels
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "abs v0.8h, v0.8h \n"
    "uqxtn v0.8b, v0.8h \n"
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 sobely
    "b.gt 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  : "r"(1LL),          // %4
    "r"(6LL)           // %5
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// Caveat - rounds float to half float whereas scaling version truncates.
// Converts 8 unsigned 16 bit values per iteration to half floats without
// scaling; the float parameter is unnamed and unused. The narrowing is done
// by fcvtn, a true float to half conversion.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v1.16b}, [%0], #16 \n"  // load 8 shorts
    "subs %w2, %w2, #8 \n"  // 8 pixels per loop
    "uxtl v2.4s, v1.4h \n"  // 8 int's
    "uxtl2 v3.4s, v1.8h \n"
    "scvtf v2.4s, v2.4s \n"  // 8 floats
    "scvtf v3.4s, v3.4s \n"
    "fcvtn v1.4h, v2.4s \n"  // 8 half floats
    "fcvtn2 v1.8h, v3.4s \n"
    MEMACCESS(1)
    "st1 {v1.16b}, [%1], #16 \n"  // store 8 shorts
    "b.gt 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "cc", "memory", "v1", "v2", "v3"
  );
}

// Converts 8 unsigned 16 bit values per iteration to half floats, multiplied
// by scale. Instead of a float to half conversion instruction, the float is
// multiplied by scale * 1.9259299444e-34f (about 2^-112, the difference
// between the float and half exponent biases) and its bit pattern is then
// shifted right by 13 (the difference in mantissa widths) with an unsigned
// saturating narrow. The shift discards the low mantissa bits, so the result
// is truncated rather than rounded.
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v1.16b}, [%0], #16 \n"  // load 8 shorts
    "subs %w2, %w2, #8 \n"  // 8 pixels per loop
    "uxtl v2.4s, v1.4h \n"  // 8 int's
    "uxtl2 v3.4s, v1.8h \n"
    "scvtf v2.4s, v2.4s \n"  // 8 floats
    "scvtf v3.4s, v3.4s \n"
    "fmul v2.4s, v2.4s, %3.s[0] \n"  // adjust exponent
    "fmul v3.4s, v3.4s, %3.s[0] \n"
    "uqshrn v1.4h, v2.4s, #13 \n"  // isolate halffloat
    "uqshrn2 v1.8h, v3.4s, #13 \n"
    MEMACCESS(1)
    "st1 {v1.16b}, [%1], #16 \n"  // store 8 shorts
    "b.gt 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "w"(scale * 1.9259299444e-34f)  // %3
  : "cc", "memory", "v1", "v2", "v3"
  );
}

#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif