1/* 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/scale.h" 12#include "libyuv/row.h" 13#include "libyuv/scale_row.h" 14 15#ifdef __cplusplus 16namespace libyuv { 17extern "C" { 18#endif 19 20// This module is for GCC Neon armv8 64 bit. 21#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 22 23// Read 32x1 throw away even pixels, and write 16x1. 24void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 25 uint8* dst, int dst_width) { 26 asm volatile ( 27 "1: \n" 28 // load even pixels into v0, odd into v1 29 MEMACCESS(0) 30 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" 31 "subs %w2, %w2, #16 \n" // 16 processed per loop 32 MEMACCESS(1) 33 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels 34 "b.gt 1b \n" 35 : "+r"(src_ptr), // %0 36 "+r"(dst), // %1 37 "+r"(dst_width) // %2 38 : 39 : "v0", "v1" // Clobber List 40 ); 41} 42 43// Read 32x1 average down and write 16x1. 44void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 45 uint8* dst, int dst_width) { 46 asm volatile ( 47 "1: \n" 48 MEMACCESS(0) 49 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc 50 "subs %w2, %w2, #16 \n" // 16 processed per loop 51 "uaddlp v0.8h, v0.16b \n" // add adjacent 52 "uaddlp v1.8h, v1.16b \n" 53 "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack 54 "rshrn2 v0.16b, v1.8h, #1 \n" 55 MEMACCESS(1) 56 "st1 {v0.16b}, [%1], #16 \n" 57 "b.gt 1b \n" 58 : "+r"(src_ptr), // %0 59 "+r"(dst), // %1 60 "+r"(dst_width) // %2 61 : 62 : "v0", "v1" // Clobber List 63 ); 64} 65 66// Read 32x2 average down and write 16x1. 67void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 68 uint8* dst, int dst_width) { 69 asm volatile ( 70 // change the stride to row 2 pointer 71 "add %1, %1, %0 \n" 72 "1: \n" 73 MEMACCESS(0) 74 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc 75 MEMACCESS(1) 76 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc 77 "subs %w3, %w3, #16 \n" // 16 processed per loop 78 "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent 79 "uaddlp v1.8h, v1.16b \n" 80 "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 81 "uadalp v1.8h, v3.16b \n" 82 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack 83 "rshrn2 v0.16b, v1.8h, #2 \n" 84 MEMACCESS(2) 85 "st1 {v0.16b}, [%2], #16 \n" 86 "b.gt 1b \n" 87 : "+r"(src_ptr), // %0 88 "+r"(src_stride), // %1 89 "+r"(dst), // %2 90 "+r"(dst_width) // %3 91 : 92 : "v0", "v1", "v2", "v3" // Clobber List 93 ); 94} 95 96void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 97 uint8* dst_ptr, int dst_width) { 98 asm volatile ( 99 "1: \n" 100 MEMACCESS(0) 101 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 102 "subs %w2, %w2, #8 \n" // 8 processed per loop 103 MEMACCESS(1) 104 "st1 {v2.8b}, [%1], #8 \n" 105 "b.gt 1b \n" 106 : "+r"(src_ptr), // %0 107 "+r"(dst_ptr), // %1 108 "+r"(dst_width) // %2 109 : 110 : "v0", "v1", "v2", "v3", "memory", "cc" 111 ); 112} 113 114void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 115 uint8* dst_ptr, int dst_width) { 116 const uint8* src_ptr1 = src_ptr + src_stride; 117 const uint8* src_ptr2 = src_ptr + src_stride * 2; 118 const uint8* src_ptr3 = src_ptr + src_stride * 3; 119asm volatile ( 120 "1: \n" 121 MEMACCESS(0) 122 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 123 MEMACCESS(3) 124 "ld1 {v1.16b}, [%2], #16 \n" 125 MEMACCESS(4) 126 "ld1 {v2.16b}, [%3], #16 \n" 127 MEMACCESS(5) 128 "ld1 {v3.16b}, [%4], #16 \n" 129 "subs %w5, %w5, #4 \n" 130 "uaddlp v0.8h, v0.16b \n" 131 "uadalp v0.8h, v1.16b \n" 132 "uadalp v0.8h, v2.16b \n" 133 "uadalp v0.8h, v3.16b \n" 134 "addp v0.8h, v0.8h, v0.8h \n" 135 "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding 136 MEMACCESS(1) 137 "st1 {v0.s}[0], [%1], #4 \n" 138 "b.gt 1b \n" 139 : "+r"(src_ptr), // %0 140 "+r"(dst_ptr), // %1 141 "+r"(src_ptr1), // %2 142 "+r"(src_ptr2), // %3 143 "+r"(src_ptr3), // %4 144 "+r"(dst_width) // %5 145 : 146 : "v0", "v1", "v2", "v3", "memory", "cc" 147 ); 148} 149 150// Down scale from 4 to 3 pixels. Use the neon multilane read/write 151// to load up the every 4th pixel into a 4 different registers. 152// Point samples 32 pixels to 24 pixels. 153void ScaleRowDown34_NEON(const uint8* src_ptr, 154 ptrdiff_t src_stride, 155 uint8* dst_ptr, int dst_width) { 156 asm volatile ( 157 "1: \n" 158 MEMACCESS(0) 159 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 160 "subs %w2, %w2, #24 \n" 161 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 162 MEMACCESS(1) 163 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 164 "b.gt 1b \n" 165 : "+r"(src_ptr), // %0 166 "+r"(dst_ptr), // %1 167 "+r"(dst_width) // %2 168 : 169 : "v0", "v1", "v2", "v3", "memory", "cc" 170 ); 171} 172 173void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, 174 ptrdiff_t src_stride, 175 uint8* dst_ptr, int dst_width) { 176 asm volatile ( 177 "movi v20.8b, #3 \n" 178 "add %3, %3, %0 \n" 179 "1: \n" 180 MEMACCESS(0) 181 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 182 MEMACCESS(3) 183 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 184 "subs %w2, %w2, #24 \n" 185 186 // filter src line 0 with src line 1 187 // expand chars to shorts to allow for room 188 // when adding lines together 189 "ushll v16.8h, v4.8b, #0 \n" 190 "ushll v17.8h, v5.8b, #0 \n" 191 "ushll v18.8h, v6.8b, #0 \n" 192 "ushll v19.8h, v7.8b, #0 \n" 193 194 // 3 * line_0 + line_1 195 "umlal v16.8h, v0.8b, v20.8b \n" 196 "umlal v17.8h, v1.8b, v20.8b \n" 197 "umlal v18.8h, v2.8b, v20.8b \n" 198 "umlal v19.8h, v3.8b, v20.8b \n" 199 200 // (3 * line_0 + line_1) >> 2 201 "uqrshrn v0.8b, v16.8h, #2 \n" 202 "uqrshrn v1.8b, v17.8h, #2 \n" 203 "uqrshrn v2.8b, v18.8h, #2 \n" 204 "uqrshrn v3.8b, v19.8h, #2 \n" 205 206 // a0 = (src[0] * 3 + s[1] * 1) >> 2 207 "ushll v16.8h, v1.8b, #0 \n" 208 "umlal v16.8h, v0.8b, v20.8b \n" 209 "uqrshrn v0.8b, v16.8h, #2 \n" 210 211 // a1 = (src[1] * 1 + s[2] * 1) >> 1 212 "urhadd v1.8b, v1.8b, v2.8b \n" 213 214 // a2 = (src[2] * 1 + s[3] * 3) >> 2 215 "ushll v16.8h, v2.8b, #0 \n" 216 "umlal v16.8h, v3.8b, v20.8b \n" 217 "uqrshrn v2.8b, v16.8h, #2 \n" 218 219 MEMACCESS(1) 220 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 221 222 "b.gt 1b \n" 223 : "+r"(src_ptr), // %0 224 "+r"(dst_ptr), // %1 225 "+r"(dst_width), // %2 226 "+r"(src_stride) // %3 227 : 228 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", 229 "v20", "memory", "cc" 230 ); 231} 232 233void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, 234 ptrdiff_t src_stride, 235 uint8* dst_ptr, int dst_width) { 236 asm volatile ( 237 "movi v20.8b, #3 \n" 238 "add %3, %3, %0 \n" 239 "1: \n" 240 MEMACCESS(0) 241 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 242 MEMACCESS(3) 243 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 244 "subs %w2, %w2, #24 \n" 245 // average src line 0 with src line 1 246 "urhadd v0.8b, v0.8b, v4.8b \n" 247 "urhadd v1.8b, v1.8b, v5.8b \n" 248 "urhadd v2.8b, v2.8b, v6.8b \n" 249 "urhadd v3.8b, v3.8b, v7.8b \n" 250 251 // a0 = (src[0] * 3 + s[1] * 1) >> 2 252 "ushll v4.8h, v1.8b, #0 \n" 253 "umlal v4.8h, v0.8b, v20.8b \n" 254 "uqrshrn v0.8b, v4.8h, #2 \n" 255 256 // a1 = (src[1] * 1 + s[2] * 1) >> 1 257 "urhadd v1.8b, v1.8b, v2.8b \n" 258 259 // a2 = (src[2] * 1 + s[3] * 3) >> 2 260 "ushll v4.8h, v2.8b, #0 \n" 261 "umlal v4.8h, v3.8b, v20.8b \n" 262 "uqrshrn v2.8b, v4.8h, #2 \n" 263 264 MEMACCESS(1) 265 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" 266 "b.gt 1b \n" 267 : "+r"(src_ptr), // %0 268 "+r"(dst_ptr), // %1 269 "+r"(dst_width), // %2 270 "+r"(src_stride) // %3 271 : 272 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" 273 ); 274} 275 276static uvec8 kShuf38 = 277 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; 278static uvec8 kShuf38_2 = 279 { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; 280static vec16 kMult38_Div6 = 281 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 282 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; 283static vec16 kMult38_Div9 = 284 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 285 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; 286 287// 32 -> 12 288void ScaleRowDown38_NEON(const uint8* src_ptr, 289 ptrdiff_t src_stride, 290 uint8* dst_ptr, int dst_width) { 291 asm volatile ( 292 MEMACCESS(3) 293 "ld1 {v3.16b}, [%3] \n" 294 "1: \n" 295 MEMACCESS(0) 296 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" 297 "subs %w2, %w2, #12 \n" 298 "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" 299 MEMACCESS(1) 300 "st1 {v2.8b}, [%1], #8 \n" 301 MEMACCESS(1) 302 "st1 {v2.s}[2], [%1], #4 \n" 303 "b.gt 1b \n" 304 : "+r"(src_ptr), // %0 305 "+r"(dst_ptr), // %1 306 "+r"(dst_width) // %2 307 : "r"(&kShuf38) // %3 308 : "v0", "v1", "v2", "v3", "memory", "cc" 309 ); 310} 311 312// 32x3 -> 12x1 313void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, 314 ptrdiff_t src_stride, 315 uint8* dst_ptr, int dst_width) { 316 const uint8* src_ptr1 = src_ptr + src_stride * 2; 317 ptrdiff_t tmp_src_stride = src_stride; 318 319 asm volatile ( 320 MEMACCESS(5) 321 "ld1 {v29.8h}, [%5] \n" 322 MEMACCESS(6) 323 "ld1 {v30.16b}, [%6] \n" 324 MEMACCESS(7) 325 "ld1 {v31.8h}, [%7] \n" 326 "add %2, %2, %0 \n" 327 "1: \n" 328 329 // 00 40 01 41 02 42 03 43 330 // 10 50 11 51 12 52 13 53 331 // 20 60 21 61 22 62 23 63 332 // 30 70 31 71 32 72 33 73 333 MEMACCESS(0) 334 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 335 MEMACCESS(3) 336 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 337 MEMACCESS(4) 338 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" 339 "subs %w4, %w4, #12 \n" 340 341 // Shuffle the input data around to get align the data 342 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 343 // 00 10 01 11 02 12 03 13 344 // 40 50 41 51 42 52 43 53 345 "trn1 v20.8b, v0.8b, v1.8b \n" 346 "trn2 v21.8b, v0.8b, v1.8b \n" 347 "trn1 v22.8b, v4.8b, v5.8b \n" 348 "trn2 v23.8b, v4.8b, v5.8b \n" 349 "trn1 v24.8b, v16.8b, v17.8b \n" 350 "trn2 v25.8b, v16.8b, v17.8b \n" 351 352 // 20 30 21 31 22 32 23 33 353 // 60 70 61 71 62 72 63 73 354 "trn1 v0.8b, v2.8b, v3.8b \n" 355 "trn2 v1.8b, v2.8b, v3.8b \n" 356 "trn1 v4.8b, v6.8b, v7.8b \n" 357 "trn2 v5.8b, v6.8b, v7.8b \n" 358 "trn1 v16.8b, v18.8b, v19.8b \n" 359 "trn2 v17.8b, v18.8b, v19.8b \n" 360 361 // 00+10 01+11 02+12 03+13 362 // 40+50 41+51 42+52 43+53 363 "uaddlp v20.4h, v20.8b \n" 364 "uaddlp v21.4h, v21.8b \n" 365 "uaddlp v22.4h, v22.8b \n" 366 "uaddlp v23.4h, v23.8b \n" 367 "uaddlp v24.4h, v24.8b \n" 368 "uaddlp v25.4h, v25.8b \n" 369 370 // 60+70 61+71 62+72 63+73 371 "uaddlp v1.4h, v1.8b \n" 372 "uaddlp v5.4h, v5.8b \n" 373 "uaddlp v17.4h, v17.8b \n" 374 375 // combine source lines 376 "add v20.4h, v20.4h, v22.4h \n" 377 "add v21.4h, v21.4h, v23.4h \n" 378 "add v20.4h, v20.4h, v24.4h \n" 379 "add v21.4h, v21.4h, v25.4h \n" 380 "add v2.4h, v1.4h, v5.4h \n" 381 "add v2.4h, v2.4h, v17.4h \n" 382 383 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] 384 // + s[6 + st * 1] + s[7 + st * 1] 385 // + s[6 + st * 2] + s[7 + st * 2]) / 6 386 "sqrdmulh v2.8h, v2.8h, v29.8h \n" 387 "xtn v2.8b, v2.8h \n" 388 389 // Shuffle 2,3 reg around so that 2 can be added to the 390 // 0,1 reg and 3 can be added to the 4,5 reg. This 391 // requires expanding from u8 to u16 as the 0,1 and 4,5 392 // registers are already expanded. Then do transposes 393 // to get aligned. 394 // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 395 "ushll v16.8h, v16.8b, #0 \n" 396 "uaddl v0.8h, v0.8b, v4.8b \n" 397 398 // combine source lines 399 "add v0.8h, v0.8h, v16.8h \n" 400 401 // xx 20 xx 21 xx 22 xx 23 402 // xx 30 xx 31 xx 32 xx 33 403 "trn1 v1.8h, v0.8h, v0.8h \n" 404 "trn2 v4.8h, v0.8h, v0.8h \n" 405 "xtn v0.4h, v1.4s \n" 406 "xtn v4.4h, v4.4s \n" 407 408 // 0+1+2, 3+4+5 409 "add v20.8h, v20.8h, v0.8h \n" 410 "add v21.8h, v21.8h, v4.8h \n" 411 412 // Need to divide, but can't downshift as the the value 413 // isn't a power of 2. So multiply by 65536 / n 414 // and take the upper 16 bits. 415 "sqrdmulh v0.8h, v20.8h, v31.8h \n" 416 "sqrdmulh v1.8h, v21.8h, v31.8h \n" 417 418 // Align for table lookup, vtbl requires registers to 419 // be adjacent 420 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" 421 422 MEMACCESS(1) 423 "st1 {v3.8b}, [%1], #8 \n" 424 MEMACCESS(1) 425 "st1 {v3.s}[2], [%1], #4 \n" 426 "b.gt 1b \n" 427 : "+r"(src_ptr), // %0 428 "+r"(dst_ptr), // %1 429 "+r"(tmp_src_stride), // %2 430 "+r"(src_ptr1), // %3 431 "+r"(dst_width) // %4 432 : "r"(&kMult38_Div6), // %5 433 "r"(&kShuf38_2), // %6 434 "r"(&kMult38_Div9) // %7 435 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", 436 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", 437 "v30", "v31", "memory", "cc" 438 ); 439} 440 441// 32x2 -> 12x1 442void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, 443 ptrdiff_t src_stride, 444 uint8* dst_ptr, int dst_width) { 445 // TODO(fbarchard): use src_stride directly for clang 3.5+. 446 ptrdiff_t tmp_src_stride = src_stride; 447 asm volatile ( 448 MEMACCESS(4) 449 "ld1 {v30.8h}, [%4] \n" 450 MEMACCESS(5) 451 "ld1 {v31.16b}, [%5] \n" 452 "add %2, %2, %0 \n" 453 "1: \n" 454 455 // 00 40 01 41 02 42 03 43 456 // 10 50 11 51 12 52 13 53 457 // 20 60 21 61 22 62 23 63 458 // 30 70 31 71 32 72 33 73 459 MEMACCESS(0) 460 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" 461 MEMACCESS(3) 462 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" 463 "subs %w3, %w3, #12 \n" 464 465 // Shuffle the input data around to get align the data 466 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 467 // 00 10 01 11 02 12 03 13 468 // 40 50 41 51 42 52 43 53 469 "trn1 v16.8b, v0.8b, v1.8b \n" 470 "trn2 v17.8b, v0.8b, v1.8b \n" 471 "trn1 v18.8b, v4.8b, v5.8b \n" 472 "trn2 v19.8b, v4.8b, v5.8b \n" 473 474 // 20 30 21 31 22 32 23 33 475 // 60 70 61 71 62 72 63 73 476 "trn1 v0.8b, v2.8b, v3.8b \n" 477 "trn2 v1.8b, v2.8b, v3.8b \n" 478 "trn1 v4.8b, v6.8b, v7.8b \n" 479 "trn2 v5.8b, v6.8b, v7.8b \n" 480 481 // 00+10 01+11 02+12 03+13 482 // 40+50 41+51 42+52 43+53 483 "uaddlp v16.4h, v16.8b \n" 484 "uaddlp v17.4h, v17.8b \n" 485 "uaddlp v18.4h, v18.8b \n" 486 "uaddlp v19.4h, v19.8b \n" 487 488 // 60+70 61+71 62+72 63+73 489 "uaddlp v1.4h, v1.8b \n" 490 "uaddlp v5.4h, v5.8b \n" 491 492 // combine source lines 493 "add v16.4h, v16.4h, v18.4h \n" 494 "add v17.4h, v17.4h, v19.4h \n" 495 "add v2.4h, v1.4h, v5.4h \n" 496 497 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 498 "uqrshrn v2.8b, v2.8h, #2 \n" 499 500 // Shuffle 2,3 reg around so that 2 can be added to the 501 // 0,1 reg and 3 can be added to the 4,5 reg. This 502 // requires expanding from u8 to u16 as the 0,1 and 4,5 503 // registers are already expanded. Then do transposes 504 // to get aligned. 505 // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 506 507 // combine source lines 508 "uaddl v0.8h, v0.8b, v4.8b \n" 509 510 // xx 20 xx 21 xx 22 xx 23 511 // xx 30 xx 31 xx 32 xx 33 512 "trn1 v1.8h, v0.8h, v0.8h \n" 513 "trn2 v4.8h, v0.8h, v0.8h \n" 514 "xtn v0.4h, v1.4s \n" 515 "xtn v4.4h, v4.4s \n" 516 517 // 0+1+2, 3+4+5 518 "add v16.8h, v16.8h, v0.8h \n" 519 "add v17.8h, v17.8h, v4.8h \n" 520 521 // Need to divide, but can't downshift as the the value 522 // isn't a power of 2. So multiply by 65536 / n 523 // and take the upper 16 bits. 524 "sqrdmulh v0.8h, v16.8h, v30.8h \n" 525 "sqrdmulh v1.8h, v17.8h, v30.8h \n" 526 527 // Align for table lookup, vtbl requires registers to 528 // be adjacent 529 530 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" 531 532 MEMACCESS(1) 533 "st1 {v3.8b}, [%1], #8 \n" 534 MEMACCESS(1) 535 "st1 {v3.s}[2], [%1], #4 \n" 536 "b.gt 1b \n" 537 : "+r"(src_ptr), // %0 538 "+r"(dst_ptr), // %1 539 "+r"(tmp_src_stride), // %2 540 "+r"(dst_width) // %3 541 : "r"(&kMult38_Div6), // %4 542 "r"(&kShuf38_2) // %5 543 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", 544 "v18", "v19", "v30", "v31", "memory", "cc" 545 ); 546} 547 548void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 549 uint16* dst_ptr, int src_width, int src_height) { 550 const uint8* src_tmp; 551 asm volatile ( 552 "1: \n" 553 "mov %0, %1 \n" 554 "mov w12, %w5 \n" 555 "eor v2.16b, v2.16b, v2.16b \n" 556 "eor v3.16b, v3.16b, v3.16b \n" 557 "2: \n" 558 // load 16 pixels into q0 559 MEMACCESS(0) 560 "ld1 {v0.16b}, [%0], %3 \n" 561 "uaddw2 v3.8h, v3.8h, v0.16b \n" 562 "uaddw v2.8h, v2.8h, v0.8b \n" 563 "subs w12, w12, #1 \n" 564 "b.gt 2b \n" 565 MEMACCESS(2) 566 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels 567 "add %1, %1, #16 \n" 568 "subs %w4, %w4, #16 \n" // 16 processed per loop 569 "b.gt 1b \n" 570 : "=&r"(src_tmp), // %0 571 "+r"(src_ptr), // %1 572 "+r"(dst_ptr), // %2 573 "+r"(src_stride), // %3 574 "+r"(src_width), // %4 575 "+r"(src_height) // %5 576 : 577 : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List 578 ); 579} 580 581// TODO(Yang Zhang): Investigate less load instructions for 582// the x/dx stepping 583#define LOAD2_DATA8_LANE(n) \ 584 "lsr %5, %3, #16 \n" \ 585 "add %6, %1, %5 \n" \ 586 "add %3, %3, %4 \n" \ 587 MEMACCESS(6) \ 588 "ld2 {v4.b, v5.b}["#n"], [%6] \n" 589 590void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, 591 int dst_width, int x, int dx) { 592 int dx_offset[4] = {0, 1, 2, 3}; 593 int* tmp = dx_offset; 594 const uint8* src_tmp = src_ptr; 595 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 596 int64 x64 = (int64) x; 597 int64 dx64 = (int64) dx; 598 asm volatile ( 599 "dup v0.4s, %w3 \n" // x 600 "dup v1.4s, %w4 \n" // dx 601 "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 602 "shl v3.4s, v1.4s, #2 \n" // 4 * dx 603 "mul v1.4s, v1.4s, v2.4s \n" 604 // x , x + 1 * dx, x + 2 * dx, x + 3 * dx 605 "add v1.4s, v1.4s, v0.4s \n" 606 // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx 607 "add v2.4s, v1.4s, v3.4s \n" 608 "shl v0.4s, v3.4s, #1 \n" // 8 * dx 609 "1: \n" 610 LOAD2_DATA8_LANE(0) 611 LOAD2_DATA8_LANE(1) 612 LOAD2_DATA8_LANE(2) 613 LOAD2_DATA8_LANE(3) 614 LOAD2_DATA8_LANE(4) 615 LOAD2_DATA8_LANE(5) 616 LOAD2_DATA8_LANE(6) 617 LOAD2_DATA8_LANE(7) 618 "mov v6.16b, v1.16b \n" 619 "mov v7.16b, v2.16b \n" 620 "uzp1 v6.8h, v6.8h, v7.8h \n" 621 "ushll v4.8h, v4.8b, #0 \n" 622 "ushll v5.8h, v5.8b, #0 \n" 623 "ssubl v16.4s, v5.4h, v4.4h \n" 624 "ssubl2 v17.4s, v5.8h, v4.8h \n" 625 "ushll v7.4s, v6.4h, #0 \n" 626 "ushll2 v6.4s, v6.8h, #0 \n" 627 "mul v16.4s, v16.4s, v7.4s \n" 628 "mul v17.4s, v17.4s, v6.4s \n" 629 "rshrn v6.4h, v16.4s, #16 \n" 630 "rshrn2 v6.8h, v17.4s, #16 \n" 631 "add v4.8h, v4.8h, v6.8h \n" 632 "xtn v4.8b, v4.8h \n" 633 634 MEMACCESS(0) 635 "st1 {v4.8b}, [%0], #8 \n" // store pixels 636 "add v1.4s, v1.4s, v0.4s \n" 637 "add v2.4s, v2.4s, v0.4s \n" 638 "subs %w2, %w2, #8 \n" // 8 processed per loop 639 "b.gt 1b \n" 640 : "+r"(dst_ptr), // %0 641 "+r"(src_ptr), // %1 642 "+r"(dst_width64), // %2 643 "+r"(x64), // %3 644 "+r"(dx64), // %4 645 "+r"(tmp), // %5 646 "+r"(src_tmp) // %6 647 : 648 : "memory", "cc", "v0", "v1", "v2", "v3", 649 "v4", "v5", "v6", "v7", "v16", "v17" 650 ); 651} 652 653#undef LOAD2_DATA8_LANE 654 655// 16x2 -> 16x1 656void ScaleFilterRows_NEON(uint8* dst_ptr, 657 const uint8* src_ptr, ptrdiff_t src_stride, 658 int dst_width, int source_y_fraction) { 659 int y_fraction = 256 - source_y_fraction; 660 asm volatile ( 661 "cmp %w4, #0 \n" 662 "b.eq 100f \n" 663 "add %2, %2, %1 \n" 664 "cmp %w4, #64 \n" 665 "b.eq 75f \n" 666 "cmp %w4, #128 \n" 667 "b.eq 50f \n" 668 "cmp %w4, #192 \n" 669 "b.eq 25f \n" 670 671 "dup v5.8b, %w4 \n" 672 "dup v4.8b, %w5 \n" 673 // General purpose row blend. 674 "1: \n" 675 MEMACCESS(1) 676 "ld1 {v0.16b}, [%1], #16 \n" 677 MEMACCESS(2) 678 "ld1 {v1.16b}, [%2], #16 \n" 679 "subs %w3, %w3, #16 \n" 680 "umull v6.8h, v0.8b, v4.8b \n" 681 "umull2 v7.8h, v0.16b, v4.16b \n" 682 "umlal v6.8h, v1.8b, v5.8b \n" 683 "umlal2 v7.8h, v1.16b, v5.16b \n" 684 "rshrn v0.8b, v6.8h, #8 \n" 685 "rshrn2 v0.16b, v7.8h, #8 \n" 686 MEMACCESS(0) 687 "st1 {v0.16b}, [%0], #16 \n" 688 "b.gt 1b \n" 689 "b 99f \n" 690 691 // Blend 25 / 75. 692 "25: \n" 693 MEMACCESS(1) 694 "ld1 {v0.16b}, [%1], #16 \n" 695 MEMACCESS(2) 696 "ld1 {v1.16b}, [%2], #16 \n" 697 "subs %w3, %w3, #16 \n" 698 "urhadd v0.16b, v0.16b, v1.16b \n" 699 "urhadd v0.16b, v0.16b, v1.16b \n" 700 MEMACCESS(0) 701 "st1 {v0.16b}, [%0], #16 \n" 702 "b.gt 25b \n" 703 "b 99f \n" 704 705 // Blend 50 / 50. 706 "50: \n" 707 MEMACCESS(1) 708 "ld1 {v0.16b}, [%1], #16 \n" 709 MEMACCESS(2) 710 "ld1 {v1.16b}, [%2], #16 \n" 711 "subs %w3, %w3, #16 \n" 712 "urhadd v0.16b, v0.16b, v1.16b \n" 713 MEMACCESS(0) 714 "st1 {v0.16b}, [%0], #16 \n" 715 "b.gt 50b \n" 716 "b 99f \n" 717 718 // Blend 75 / 25. 719 "75: \n" 720 MEMACCESS(1) 721 "ld1 {v1.16b}, [%1], #16 \n" 722 MEMACCESS(2) 723 "ld1 {v0.16b}, [%2], #16 \n" 724 "subs %w3, %w3, #16 \n" 725 "urhadd v0.16b, v0.16b, v1.16b \n" 726 "urhadd v0.16b, v0.16b, v1.16b \n" 727 MEMACCESS(0) 728 "st1 {v0.16b}, [%0], #16 \n" 729 "b.gt 75b \n" 730 "b 99f \n" 731 732 // Blend 100 / 0 - Copy row unchanged. 733 "100: \n" 734 MEMACCESS(1) 735 "ld1 {v0.16b}, [%1], #16 \n" 736 "subs %w3, %w3, #16 \n" 737 MEMACCESS(0) 738 "st1 {v0.16b}, [%0], #16 \n" 739 "b.gt 100b \n" 740 741 "99: \n" 742 MEMACCESS(0) 743 "st1 {v0.b}[15], [%0] \n" 744 : "+r"(dst_ptr), // %0 745 "+r"(src_ptr), // %1 746 "+r"(src_stride), // %2 747 "+r"(dst_width), // %3 748 "+r"(source_y_fraction),// %4 749 "+r"(y_fraction) // %5 750 : 751 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" 752 ); 753} 754 755void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 756 uint8* dst, int dst_width) { 757 asm volatile ( 758 "1: \n" 759 // load even pixels into q0, odd into q1 760 MEMACCESS (0) 761 "ld2 {v0.4s, v1.4s}, [%0], #32 \n" 762 MEMACCESS (0) 763 "ld2 {v2.4s, v3.4s}, [%0], #32 \n" 764 "subs %w2, %w2, #8 \n" // 8 processed per loop 765 MEMACCESS (1) 766 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels 767 MEMACCESS (1) 768 "st1 {v3.16b}, [%1], #16 \n" 769 "b.gt 1b \n" 770 : "+r" (src_ptr), // %0 771 "+r" (dst), // %1 772 "+r" (dst_width) // %2 773 : 774 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 775 ); 776} 777 778void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, 779 uint8* dst_argb, int dst_width) { 780 asm volatile ( 781 "1: \n" 782 MEMACCESS (0) 783 // load 8 ARGB pixels. 784 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" 785 "subs %w2, %w2, #8 \n" // 8 processed per loop. 786 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 787 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 788 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 789 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 790 "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack 791 "rshrn v1.8b, v1.8h, #1 \n" 792 "rshrn v2.8b, v2.8h, #1 \n" 793 "rshrn v3.8b, v3.8h, #1 \n" 794 MEMACCESS (1) 795 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" 796 "b.gt 1b \n" 797 : "+r"(src_argb), // %0 798 "+r"(dst_argb), // %1 799 "+r"(dst_width) // %2 800 : 801 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List 802 ); 803} 804 805void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 806 uint8* dst, int dst_width) { 807 asm volatile ( 808 // change the stride to row 2 pointer 809 "add %1, %1, %0 \n" 810 "1: \n" 811 MEMACCESS (0) 812 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. 813 "subs %w3, %w3, #8 \n" // 8 processed per loop. 814 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 815 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 816 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 817 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 818 MEMACCESS (1) 819 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. 820 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. 821 "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. 822 "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. 823 "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. 824 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack 825 "rshrn v1.8b, v1.8h, #2 \n" 826 "rshrn v2.8b, v2.8h, #2 \n" 827 "rshrn v3.8b, v3.8h, #2 \n" 828 MEMACCESS (2) 829 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" 830 "b.gt 1b \n" 831 : "+r" (src_ptr), // %0 832 "+r" (src_stride), // %1 833 "+r" (dst), // %2 834 "+r" (dst_width) // %3 835 : 836 : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" 837 ); 838} 839 840// Reads 4 pixels at a time. 841// Alignment requirement: src_argb 4 byte aligned. 842void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, 843 int src_stepx, uint8* dst_argb, int dst_width) { 844 asm volatile ( 845 "1: \n" 846 MEMACCESS(0) 847 "ld1 {v0.s}[0], [%0], %3 \n" 848 MEMACCESS(0) 849 "ld1 {v0.s}[1], [%0], %3 \n" 850 MEMACCESS(0) 851 "ld1 {v0.s}[2], [%0], %3 \n" 852 MEMACCESS(0) 853 "ld1 {v0.s}[3], [%0], %3 \n" 854 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 855 MEMACCESS(1) 856 "st1 {v0.16b}, [%1], #16 \n" 857 "b.gt 1b \n" 858 : "+r"(src_argb), // %0 859 "+r"(dst_argb), // %1 860 "+r"(dst_width) // %2 861 : "r"((int64)(src_stepx * 4)) // %3 862 : "memory", "cc", "v0" 863 ); 864} 865 866// Reads 4 pixels at a time. 867// Alignment requirement: src_argb 4 byte aligned. 868// TODO(Yang Zhang): Might be worth another optimization pass in future. 869// It could be upgraded to 8 pixels at a time to start with. 870void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, 871 int src_stepx, 872 uint8* dst_argb, int dst_width) { 873 asm volatile ( 874 "add %1, %1, %0 \n" 875 "1: \n" 876 MEMACCESS(0) 877 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 878 MEMACCESS(1) 879 "ld1 {v1.8b}, [%1], %4 \n" 880 MEMACCESS(0) 881 "ld1 {v2.8b}, [%0], %4 \n" 882 MEMACCESS(1) 883 "ld1 {v3.8b}, [%1], %4 \n" 884 MEMACCESS(0) 885 "ld1 {v4.8b}, [%0], %4 \n" 886 MEMACCESS(1) 887 "ld1 {v5.8b}, [%1], %4 \n" 888 MEMACCESS(0) 889 "ld1 {v6.8b}, [%0], %4 \n" 890 MEMACCESS(1) 891 "ld1 {v7.8b}, [%1], %4 \n" 892 "uaddl v0.8h, v0.8b, v1.8b \n" 893 "uaddl v2.8h, v2.8b, v3.8b \n" 894 "uaddl v4.8h, v4.8b, v5.8b \n" 895 "uaddl v6.8h, v6.8b, v7.8b \n" 896 "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd 897 "mov v0.d[1], v2.d[0] \n" 898 "mov v2.d[0], v16.d[1] \n" 899 "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh 900 "mov v4.d[1], v6.d[0] \n" 901 "mov v6.d[0], v16.d[1] \n" 902 "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) 903 "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) 904 "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. 905 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. 906 "subs %w3, %w3, #4 \n" // 4 pixels per loop. 907 MEMACCESS(2) 908 "st1 {v0.16b}, [%2], #16 \n" 909 "b.gt 1b \n" 910 : "+r"(src_argb), // %0 911 "+r"(src_stride), // %1 912 "+r"(dst_argb), // %2 913 "+r"(dst_width) // %3 914 : "r"((int64)(src_stepx * 4)) // %4 915 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 916 ); 917} 918 919// TODO(Yang Zhang): Investigate less load instructions for 920// the x/dx stepping 921#define LOAD1_DATA32_LANE(vn, n) \ 922 "lsr %5, %3, #16 \n" \ 923 "add %6, %1, %5, lsl #2 \n" \ 924 "add %3, %3, %4 \n" \ 925 MEMACCESS(6) \ 926 "ld1 {"#vn".s}["#n"], [%6] \n" 927 928void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, 929 int dst_width, int x, int dx) { 930 const uint8* src_tmp = src_argb; 931 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 932 int64 x64 = (int64) x; 933 int64 dx64 = (int64) dx; 934 int64 tmp64; 935 asm volatile ( 936 "1: \n" 937 LOAD1_DATA32_LANE(v0, 0) 938 LOAD1_DATA32_LANE(v0, 1) 939 LOAD1_DATA32_LANE(v0, 2) 940 LOAD1_DATA32_LANE(v0, 3) 941 LOAD1_DATA32_LANE(v1, 0) 942 LOAD1_DATA32_LANE(v1, 1) 943 LOAD1_DATA32_LANE(v1, 2) 944 LOAD1_DATA32_LANE(v1, 3) 945 946 MEMACCESS(0) 947 "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels 948 "subs %w2, %w2, #8 \n" // 8 processed per loop 949 "b.gt 1b \n" 950 : "+r"(dst_argb), // %0 951 "+r"(src_argb), // %1 952 "+r"(dst_width64), // %2 953 "+r"(x64), // %3 954 "+r"(dx64), // %4 955 "=&r"(tmp64), // %5 956 "+r"(src_tmp) // %6 957 : 958 : "memory", "cc", "v0", "v1" 959 ); 960} 961 962#undef LOAD1_DATA32_LANE 963 964// TODO(Yang Zhang): Investigate less load instructions for 965// the x/dx stepping 966#define LOAD2_DATA32_LANE(vn1, vn2, n) \ 967 "lsr %5, %3, #16 \n" \ 968 "add %6, %1, %5, lsl #2 \n" \ 969 "add %3, %3, %4 \n" \ 970 MEMACCESS(6) \ 971 "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" 972 973void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, 974 int dst_width, int x, int dx) { 975 int dx_offset[4] = {0, 1, 2, 3}; 976 int* tmp = dx_offset; 977 const uint8* src_tmp = src_argb; 978 int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 979 int64 x64 = (int64) x; 980 int64 dx64 = (int64) dx; 981 asm volatile ( 982 "dup v0.4s, %w3 \n" // x 983 "dup v1.4s, %w4 \n" // dx 984 "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 985 "shl v6.4s, v1.4s, #2 \n" // 4 * dx 986 "mul v1.4s, v1.4s, v2.4s \n" 987 "movi v3.16b, #0x7f \n" // 0x7F 988 "movi v4.8h, #0x7f \n" // 0x7F 989 // x , x + 1 * dx, x + 2 * dx, x + 3 * dx 990 "add v5.4s, v1.4s, v0.4s \n" 991 "1: \n" 992 // d0, d1: a 993 // d2, d3: b 994 LOAD2_DATA32_LANE(v0, v1, 0) 995 LOAD2_DATA32_LANE(v0, v1, 1) 996 LOAD2_DATA32_LANE(v0, v1, 2) 997 LOAD2_DATA32_LANE(v0, v1, 3) 998 "shrn v2.4h, v5.4s, #9 \n" 999 "and v2.8b, v2.8b, v4.8b \n" 1000 "dup v16.8b, v2.b[0] \n" 1001 "dup v17.8b, v2.b[2] \n" 1002 "dup v18.8b, v2.b[4] \n" 1003 "dup v19.8b, v2.b[6] \n" 1004 "ext v2.8b, v16.8b, v17.8b, #4 \n" 1005 "ext v17.8b, v18.8b, v19.8b, #4 \n" 1006 "ins v2.d[1], v17.d[0] \n" // f 1007 "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f 1008 "umull v16.8h, v0.8b, v7.8b \n" 1009 "umull2 v17.8h, v0.16b, v7.16b \n" 1010 "umull v18.8h, v1.8b, v2.8b \n" 1011 "umull2 v19.8h, v1.16b, v2.16b \n" 1012 "add v16.8h, v16.8h, v18.8h \n" 1013 "add v17.8h, v17.8h, v19.8h \n" 1014 "shrn v0.8b, v16.8h, #7 \n" 1015 "shrn2 v0.16b, v17.8h, #7 \n" 1016 1017 MEMACCESS(0) 1018 "st1 {v0.4s}, [%0], #16 \n" // store pixels 1019 "add v5.4s, v5.4s, v6.4s \n" 1020 "subs %w2, %w2, #4 \n" // 4 processed per loop 1021 "b.gt 1b \n" 1022 : "+r"(dst_argb), // %0 1023 "+r"(src_argb), // %1 1024 "+r"(dst_width64), // %2 1025 "+r"(x64), // %3 1026 "+r"(dx64), // %4 1027 "+r"(tmp), // %5 1028 "+r"(src_tmp) // %6 1029 : 1030 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", 1031 "v6", "v7", "v16", "v17", "v18", "v19" 1032 ); 1033} 1034 1035#undef LOAD2_DATA32_LANE 1036 1037#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 1038 1039#ifdef __cplusplus 1040} // extern "C" 1041} // namespace libyuv 1042#endif 1043