jsimd_arm_neon.S revision 8c60d22ff51486afacf772b6f6b8b44630ffbff8
1/* 2 * ARM NEON optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). 5 * All rights reserved. 6 * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> 7 * 8 * This software is provided 'as-is', without any express or implied 9 * warranty. In no event will the authors be held liable for any damages 10 * arising from the use of this software. 11 * 12 * Permission is granted to anyone to use this software for any purpose, 13 * including commercial applications, and to alter it and redistribute it 14 * freely, subject to the following restrictions: 15 * 16 * 1. The origin of this software must not be misrepresented; you must not 17 * claim that you wrote the original software. If you use this software 18 * in a product, an acknowledgment in the product documentation would be 19 * appreciated but is not required. 20 * 2. Altered source versions must be plainly marked as such, and must not be 21 * misrepresented as being the original software. 22 * 3. This notice may not be removed or altered from any source distribution. 
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


/* Non-zero: use byte-wise stores so unaligned output rows are safe even on
 * kernels that trap unaligned accesses. */
#define RESPECT_STRICT_ALIGNMENT 1

/*****************************************************************************/

/* Supplementary macro for setting function attributes.
 * Emits the platform-appropriate global label for \fname: Mach-O (Apple)
 * requires a leading underscore; ELF additionally gets hidden visibility
 * and a %function type annotation. */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 16-bit coefficients held in four 64-bit
 * registers, using the two-step vtrn.16 / vtrn.32 interleave pattern. */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16         \x0, \x1
    vtrn.16         \x2, \x3
    vtrn.32         \x0, \x2
    vtrn.32         \x1, \x3
.endm

/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_fast'
 * function from jidctfst.c
 *
 * TODO: a bit better instructions scheduling is needed.
 */

/* Fixed-point multipliers, kept in lanes of d0 (loaded from the consts
 * table below). The names record the real-valued constants they encode. */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

/* 1-D IDCT helper macro.
 *
 * Performs one 8-point inverse DCT pass over rows held in \x0..\x7,
 * writing results back into \x0..\x7. \t10..\t14 are scratch registers.
 * Mirrors the even-part/odd-part structure of jpeg_idct_ifast (jidctfst.c);
 * the vqdmulh.s16 by d0[N] lanes applies the XFIX_* multipliers above. */

.macro idct_helper x0, x1, x2, x3, x4, x5, x6, x7, \
                   t10, t11, t12, t13, t14

    /* Butterfly sums/differences; vswp keeps sums in the \xN slots. */
    vsub.s16        \t10, \x0, \x4
    vadd.s16        \x4, \x0, \x4
    vswp.s16        \t10, \x0
    vsub.s16        \t11, \x2, \x6
    vadd.s16        \x6, \x2, \x6
    vswp.s16        \t11, \x2
    vsub.s16        \t10, \x3, \x5
    vadd.s16        \x5, \x3, \x5
    vswp.s16        \t10, \x3
    vsub.s16        \t11, \x1, \x7
    vadd.s16        \x7, \x1, \x7
    vswp.s16        \t11, \x1

    /* Odd-part rotations via doubling-high multiplies by the XFIX lanes. */
    vqdmulh.s16     \t13, \x2, d0[1]
    vadd.s16        \t12, \x3, \x3
    vadd.s16        \x2, \x2, \t13
    vqdmulh.s16     \t13, \x3, d0[3]
    vsub.s16        \t10, \x1, \x3
    vadd.s16        \t12, \t12, \t13
    vqdmulh.s16     \t13, \t10, d0[2]
    vsub.s16        \t11, \x7, \x5
    vadd.s16        \t10, \t10, \t13
    vqdmulh.s16     \t13, \t11, d0[1]
    vadd.s16        \t11, \t11, \t13

    vqdmulh.s16     \t13, \x1, d0[0]
    vsub.s16        \x2, \x6, \x2
    vsub.s16        \t14, \x0, \x2
    vadd.s16        \x2, \x0, \x2
    vadd.s16        \x0, \x4, \x6
    vsub.s16        \x4, \x4, \x6
    vadd.s16        \x1, \x1, \t13
    vadd.s16        \t13, \x7, \x5
    vsub.s16        \t12, \t13, \t12
    vsub.s16        \t12, \t12, \t10
    vadd.s16        \t11, \t12, \t11
    vsub.s16        \t10, \x1, \t10
    vadd.s16        \t10, \t10, \t11

    /* Final butterflies producing the eight output rows. */
    vsub.s16        \x7, \x0, \t13
    vadd.s16        \x0, \x0, \t13
    vadd.s16        \x6, \t14, \t12
    vsub.s16        \x1, \t14, \t12
    vsub.s16        \x5, \x2, \t11
    vadd.s16        \x2, \x2, \t11
    vsub.s16        \x3, \x4, \t10
    vadd.s16        \x4, \x4, \t10
.endm

/*
 * void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
 *                            JSAMPARRAY output_buf, JDIMENSION output_col)
 * In:   r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col
 * Uses: q2-q9 data, q10-q15 scratch; d8-d15 (callee-saved) are preserved
 *       via vpush/vpop per AAPCS.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_idct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   |  d4    |  d5
     *   1   |  d6    |  d7
     *   2   |  d8    |  d9
     *   3   |  d10   |  d11
     *   4   |  d12   |  d13
     *   5   |  d14   |  d15
     *   6   |  d16   |  d17
     *   7   |  d18   |  d19
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK]!
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK]!
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK]!
    /* Dequantize (loads of the quant table are interleaved with the
     * multiplies to hide load latency; q10/q11 are reused for rows 6-7). */
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q2, q2, q10
    vld1.16         {d24, d25, d26, d27}, [DCT_TABLE]!
    vmul.s16        q3, q3, q11
    vmul.s16        q4, q4, q12
    vld1.16         {d28, d29, d30, d31}, [DCT_TABLE]!
    vmul.s16        q5, q5, q13
    vmul.s16        q6, q6, q14
    vld1.16         {d20, d21, d22, d23}, [DCT_TABLE]!
    vmul.s16        q7, q7, q15
    vmul.s16        q8, q8, q10
    vmul.s16        q9, q9, q11

    /* Pass 1 (columns) */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose the 8x8 block via four 4x4 transposes + d-register swaps */
    transpose_4x4   d4, d6, d8, d10
    transpose_4x4   d5, d7, d9, d11
    transpose_4x4   d12, d14, d16, d18
    transpose_4x4   d13, d15, d17, d19
    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Pass 2 (rows) */
    idct_helper     q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14
    /* Transpose back to row order for output */
    transpose_4x4   d4, d6, d8, d10
    transpose_4x4   d5, d7, d9, d11
    transpose_4x4   d12, d14, d16, d18
    transpose_4x4   d13, d15, d17, d19
    vswp            d12, d5
    vswp            d14, d7
    vswp            d16, d9
    vswp            d18, d11

    /* Descale and range limit: add CENTERJSAMPLE (0x80) pre-shifted by 5,
     * then narrow with saturating unsigned right shift by 5. */
    vmov.s16        q15, #(0x80 << 5)
    vqadd.s16       q2, q2, q15
    vqadd.s16       q3, q3, q15
    vqadd.s16       q4, q4, q15
    vqadd.s16       q5, q5, q15
    vqadd.s16       q6, q6, q15
    vqadd.s16       q7, q7, q15
    vqadd.s16       q8, q8, q15
    vqadd.s16       q9, q9, q15
    vqshrun.s16     d4, q2, #5
    vqshrun.s16     d6, q3, #5
    vqshrun.s16     d8, q4, #5
    vqshrun.s16     d10, q5, #5
    vqshrun.s16     d12, q6, #5
    vqshrun.s16     d14, q7, #5
    vqshrun.s16     d16, q8, #5
    vqshrun.s16     d18, q9, #5

    /* Store results to the output buffer: one 8-byte row per pointer
     * fetched from OUTPUT_BUF, offset by OUTPUT_COL. */
    .irp x, d4, d6, d8, d10, d12, d14, d16, d18
    ldr             TMP, [OUTPUT_BUF], #4
    add             TMP, TMP, OUTPUT_COL
    vst1.8          {\x}, [TMP]!
    .endr

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 *       requires much less arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 *
 * TODO: a bit better instructions scheduling can be achieved by expanding
 *       idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */

#define CONST_BITS 13

/* 13-bit fixed-point constants: FIX(x) = round(x * 2^13). */
#define FIX_0_211164243 (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* d0[0] */
    .short -FIX_0_765366865     /* d0[1] */
    .short -FIX_0_211164243     /* d0[2] */
    .short FIX_1_451774981      /* d0[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* d2[0] */
    .short FIX_2_562915447      /* d2[1] */
    .short 1 << (CONST_BITS+1)  /* d2[2] */
    .short 0                    /* d2[3] */

/* 1-D reduced IDCT helper: consumes seven input rows (\x4..\x16, row 4 of
 * the 8x8 block is not used by the 4x4 reduction), accumulates in 32-bit
 * q12-q15, and narrows with rounding by \shift into \y26..\y29.
 * For \shift > 16 the vrshrn immediate range is exceeded, so a separate
 * vrshr + vmovn sequence is used instead. */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4, d2[2]
    vmlal.s16       q14, \x8, d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6, d2[1]

    vmull.s16       q15, \x4, d2[2]
    vmlsl.s16       q15, \x8, d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6, d1[1]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
.endif

    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q15, q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
.else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
.endif

.endm

/*
 * void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col)
 * In:   r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col
 * Note: TMP1-TMP3 alias the argument registers, which is safe because the
 *       arguments are fully consumed before the output-store phase.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   |  d4    |  d5
     *   1   |  d6    |  d7
     *   2   |  d8    |  d9
     *   3   |  d10   |  d11
     *   4   |  -     |  -      (row 4 skipped — unused by 4x4 reduction)
     *   5   |  d12   |  d13
     *   6   |  d14   |  d15
     *   7   |  d16   |  d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize (quant-table row 4 is likewise skipped) */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1: columns, descale by CONST_BITS - 1 = 12 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2: rows, descale by CONST_BITS + PASS1_BITS + 3 = 19 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit: recenter by 0x80, then saturate to unsigned bytes */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    /* Byte-wise stores: safe for any output alignment */
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
 *       requires much less arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit exact compatibility with jpeg-6b.
 */

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822 /* d0[0] */
    .short FIX_0_850430095  /* d0[1] */
    .short -FIX_1_272758580 /* d0[2] */
    .short FIX_3_624509785  /* d0[3] */

/* 1-D reduced IDCT helper for the 2x2 case: five input rows (rows 2, 4
 * and 6 of the 8x8 block are unused), results narrowed by \shift into
 * \y26/\y27. Same \shift > 16 workaround as in the 4x4 helper. */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16       q14, \x4, #15
    vmull.s16       q13, \x6, d0[3]
    vmlal.s16       q13, \x10, d0[2]
    vmlal.s16       q13, \x12, d0[1]
    vmlal.s16       q13, \x16, d0[0]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y27, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y27, q14, #\shift
.endif

.endm

/*
 * void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col)
 * In:   r0 = dct_table, r1 = coef_block, r2 = output_buf, r3 = output_col
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   |  d4    |  d5
     *   1   |  d6    |  d7
     *   2   |  -     |  -      (rows 2, 4, 6 unused by 2x2 reduction)
     *   3   |  d10   |  d11
     *   4   |  -     |  -
     *   5   |  d12   |  d13
     *   6   |  -     |  -
     *   7   |  d16   |  d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1 */
#if 0
    /* Reference form, kept for readability; the live code below is the
     * same computation manually interleaved for better scheduling. */
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11
#else
    vmull.s16       q13, d6, d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7, d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4, #15
    vshll.s16       q15, d5, #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4, q10, #13
    vrshrn.s32      d6, q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5, q10, #13
    vrshrn.s32      d7, q14, #13
    vtrn.16         q2, q3
    vtrn.32         q3, q5
#endif

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit: recenter by 0x80 and saturate to unsigned bytes
     * (both halves narrowed from q13; only lanes 0,1,4,5 are stored). */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
.endfunc

.purgem idct_helper

/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */


/* Load \size pixels of Y/U/V into d0/d4/d5. Lane positions are chosen so
 * that the 4/2/1 cases can be combined to fill up to 7 trailing pixels
 * (lanes 0-3, 4-5 and 6 respectively). */
.macro do_load size
    .if \size == 8
    vld1.8          {d4}, [U]!
    vld1.8          {d5}, [V]!
    vld1.8          {d0}, [Y]!
    pld             [Y, #64]
    pld             [U, #64]
    pld             [V, #64]
    .elseif \size == 4
    vld1.8          {d4[0]}, [U]!
    vld1.8          {d4[1]}, [U]!
    vld1.8          {d4[2]}, [U]!
    vld1.8          {d4[3]}, [U]!
    vld1.8          {d5[0]}, [V]!
    vld1.8          {d5[1]}, [V]!
    vld1.8          {d5[2]}, [V]!
    vld1.8          {d5[3]}, [V]!
    vld1.8          {d0[0]}, [Y]!
    vld1.8          {d0[1]}, [Y]!
    vld1.8          {d0[2]}, [Y]!
    vld1.8          {d0[3]}, [Y]!
    .elseif \size == 2
    vld1.8          {d4[4]}, [U]!
    vld1.8          {d4[5]}, [U]!
    vld1.8          {d5[4]}, [V]!
    vld1.8          {d5[5]}, [V]!
    vld1.8          {d0[4]}, [Y]!
    vld1.8          {d0[5]}, [Y]!
    .elseif \size == 1
    vld1.8          {d4[6]}, [U]!
    vld1.8          {d5[6]}, [V]!
    vld1.8          {d0[6]}, [Y]!
    .else
    .error unsupported macroblock size
    .endif
.endm

/* Store \size converted pixels from d10-d13 as interleaved \bpp-bit RGB
 * (vst3 for 24 bpp, vst4 for 32 bpp), advancing RGB. */
.macro do_store bpp, size
    .if \bpp == 24
    .if \size == 8
    vst3.8          {d10, d11, d12}, [RGB]!
    .elseif \size == 4
    vst3.8          {d10[0], d11[0], d12[0]}, [RGB]!
    vst3.8          {d10[1], d11[1], d12[1]}, [RGB]!
    vst3.8          {d10[2], d11[2], d12[2]}, [RGB]!
    vst3.8          {d10[3], d11[3], d12[3]}, [RGB]!
    .elseif \size == 2
    vst3.8          {d10[4], d11[4], d12[4]}, [RGB]!
    vst3.8          {d10[5], d11[5], d12[5]}, [RGB]!
    .elseif \size == 1
    vst3.8          {d10[6], d11[6], d12[6]}, [RGB]!
    .else
    .error unsupported macroblock size
    .endif
    .elseif \bpp == 32
    .if \size == 8
    vst4.8          {d10, d11, d12, d13}, [RGB]!
    .elseif \size == 4
    vst4.8          {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
    vst4.8          {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
    vst4.8          {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
    vst4.8          {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
    .elseif \size == 2
    vst4.8          {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
    vst4.8          {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
    .elseif \size == 1
    vst4.8          {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
    .else
    .error unsupported macroblock size
    .endif
    .else
    .error unsupported bpp
    .endif
.endm

/* Generates one YCbCr -> extended-RGB conversion function.
 * \colorid names the output format, \bpp selects 24/32-bit stores, and
 * \r_offs/\g_offs/\b_offs pick which of d10-d13 receives each channel
 * (the unused alpha lane stays at the 0xFF preset for 32 bpp). */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/* Convert 8 pixels: Y in d0, U in d4, V in d5 -> RGB in d10-d13.
 * Uses 15/14-bit fixed-point coefficients preloaded in d1 (see consts). */
.macro do_yuv_to_rgb
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    /* Saturate each channel into the d-register selected by its offset */
    vqmovun.s16     d1\g_offs, q10
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0                       /* d0: padding */
    .short 22971, -11277, -23401, 29033     /* d1: conversion coefficients */
    .short -128, -128, -128, -128           /* d2: Cb/Cr recenter bias */
    .short -128, -128, -128, -128           /* d3: Cb/Cr recenter bias */

/*
 * void jsimd_ycc_<colorid>_convert_neon(JDIMENSION output_width,
 *                                       JSAMPIMAGE input_buf,
 *                                       JDIMENSION input_row,
 *                                       JSAMPARRAY output_buf,
 *                                       int num_rows)
 * In:   r0 = output_width, r1 = input_buf (Y/Cb/Cr plane pointers),
 *       r2 = input_row, r3 = output_buf, [sp + 32] = num_rows
 */
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments
     * (num_rows sits just above the 8 saved registers). */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers (d8-d15 are callee-saved per AAPCS) */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF (alpha fill for 32 bpp) */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels: full 8-pixel batches first, then the
     * remaining 0-7 pixels are assembled from 4/2/1-pixel loads into one
     * final do_yuv_to_rgb and stored piecewise. */
    subs            N, N, #8
    blt             2f
1:
    do_load         8
    do_yuv_to_rgb
    do_store        \bpp, 8
    subs            N, N, #8
    bge             1b
    tst             N, #7
    beq             8f
2:
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

.purgem do_yuv_to_rgb

.endm

/*--------------------------------- id ----- bpp R G B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store

/*****************************************************************************/