jsimd_arm_neon.S revision 3e00f03aea551192233e143bbc63c435e09a3afe
/*
 * ARMv7 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .func _\fname
    .globl _\fname
_\fname:
#else
    .func \fname
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16         \x0, \x1
    vtrn.16         \x2, \x3
    vtrn.32         \x0, \x2
    vtrn.32         \x1, \x3
.endm


#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
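
/*
 * For reference: the FIX_* values above are the ISLOW cosine terms in
 * 13-bit fixed point, following the usual libjpeg convention (a sketch,
 * not part of the original file):
 *
 *   #define FIX(x)  ((INT32) ((x) * (1 << 13) + 0.5))
 *
 * e.g. FIX(0.298631336) = 2446 and FIX(1.175875602) = 9633; the <<13 and
 * descale shifts in the code below remove this scaling again.
 */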

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
    INT32 q1, q2, q3, q4, q5, q6, q7; \
    INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
\
    /* 1-D iDCT input data */ \
    row0 = xrow0; \
    row1 = xrow1; \
    row2 = xrow2; \
    row3 = xrow3; \
    row4 = xrow4; \
    row5 = xrow5; \
    row6 = xrow6; \
    row7 = xrow7; \
\
    q5 = row7 + row3; \
    q4 = row5 + row1; \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
         MULTIPLY(q4, FIX_1_175875602); \
    q7 = MULTIPLY(q5, FIX_1_175875602) + \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
    q2 = MULTIPLY(row2, FIX_0_541196100) + \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
    q4 = q6; \
    q3 = ((INT32) row0 - (INT32) row4) << 13; \
    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
    /* now we can use q1 (reloadable constants have been used up) */ \
    q1 = q3 + q2; \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
          MULTIPLY(row1, -FIX_0_899976223); \
    q5 = q7; \
    q1 = q1 + q6; \
    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
\
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
    tmp11_plus_tmp2 = q1; \
    row1 = 0; \
\
    q1 = q1 - q6; \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
          MULTIPLY(row3, -FIX_2_562915447); \
    q1 = q1 - q6; \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
         MULTIPLY(row6, FIX_0_541196100); \
    q3 = q3 - q2; \
\
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
    tmp11_minus_tmp2 = q1; \
\
    q1 = ((INT32) row0 + (INT32) row4) << 13; \
    q2 = q1 + q6; \
    q1 = q1 - q6; \
\
    /* pick up the results */ \
    tmp0  = q4; \
    tmp1  = q5; \
    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
    tmp3  = q7; \
    tmp10 = q2; \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
    tmp12 = q3; \
    tmp13 = q1; \
}

#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    ROW0L       .req d16
    ROW0R       .req d17
    ROW1L       .req d18
    ROW1R       .req d19
    ROW2L       .req d20
    ROW2R       .req d21
    ROW3L       .req d22
    ROW3R       .req d23
    ROW4L       .req d24
    ROW4R       .req d25
    ROW5L       .req d26
    ROW5R       .req d27
    ROW6L       .req d28
    ROW6R       .req d29
    ROW7L       .req d30
    ROW7R       .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
    add             ip, ip, #16
    vmul.s16        q15, q15, q3
    vpush           {d8-d15} /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4, ROW7L, ROW3L
    vadd.s16        d5, ROW5L, ROW1L
    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d5, XFIX_1_175875602
    vmull.s16       q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    push            {r4, r5}
    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW4L
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr             r0, r4, r5
    vmov            q4, q6
    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    orr             r0, r0, r4
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    orr             r0, r0, r5
    vadd.s32        q1, q3, q2
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    orr             r0, r0, r4
    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
    orr             r0, r0, r5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr             r0, r0, r4
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    orr             r0, r0, r5
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6, ROW6L, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    orr             r0, r0, r4
    vrshrn.s32      ROW6L, q1, #11
    orr             r0, r0, r5
    vadd.s32        q1, q3, q5
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW4L
    orr             r0, r0, r4
    vrshrn.s32      ROW2L, q1, #11
    orr             r0, r0, r5
    vrshrn.s32      ROW5L, q3, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr             r0, r0, r4
    vadd.s32        q2, q5, q6
    orrs            r0, r0, r5
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    orr             r0, r4, r5
    vsub.s32        q3, q1, q4
    pop             {r4, r5}
    vrshrn.s32      ROW7L, q2, #11
    vrshrn.s32      ROW3L, q5, #11
    vrshrn.s32      ROW0L, q6, #11
    vrshrn.s32      ROW4L, q3, #11

    beq             3f /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2}, [ip, :64] /* reload constants */
    vadd.s16        d10, ROW7R, ROW3R
    vadd.s16        d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d8, XFIX_1_175875602
    vtrn.16         ROW2L, ROW3L
    vmull.s16       q7, d10, XFIX_1_175875602
    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16         ROW0L, ROW1L
    vsubl.s16       q3, ROW0R, ROW4R
    vmull.s16       q2, ROW2R, XFIX_0_541196100
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16         ROW4L, ROW5L
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32         ROW1L, ROW3L
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
    vtrn.32         ROW4L, ROW6L
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vtrn.32         ROW0L, ROW2L
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1, #11
    vtrn.32         ROW5L, ROW7L
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vrshrn.s32      ROW6R, q1, #11
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1, #11
    vrshrn.s32      ROW5R, q3, #11
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vrshrn.s32      ROW7R, q2, #11
    vrshrn.s32      ROW3R, q5, #11
    vrshrn.s32      ROW0R, q6, #11
    vrshrn.s32      ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16         ROW6R, ROW7R
    vtrn.16         ROW2R, ROW3R
    vtrn.16         ROW0R, ROW1R
    vtrn.16         ROW4R, ROW5R
    vtrn.32         ROW1R, ROW3R
    vtrn.32         ROW4R, ROW6R
    vtrn.32         ROW0R, ROW2R
    vtrn.32         ROW5R, ROW7R

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]             /* reload constants */
    vmull.s16       q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW0R            /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
    vmov            q4, q6
    vmlsl.s16       q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16              /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW0R            /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16              /* ROW5L <-> ROW1R */
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16              /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16              /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]             /* reload constants */
    vmull.s16       q6, ROW5R, XFIX_1_175875602
    vmlal.s16       q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    vmull.s16       q7, ROW7R, XFIX_1_175875602
    vmlal.s16       q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    vsubl.s16       q3, ROW4L, ROW4R            /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1, #16              /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW4L, ROW4R            /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1, #16              /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16              /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16              /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16, q8, #2
    vqrshrn.s16     d17, q9, #2
    vqrshrn.s16     d18, q10, #2
    vqrshrn.s16     d19, q11, #2
    vpop            {d8-d15} /* restore NEON registers */
    vqrshrn.s16     d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16         q8, q9
    vqrshrn.s16     d21, q13, #2
    vqrshrn.s16     d22, q14, #2
    vmov.u8         q0, #(CENTERJSAMPLE)
    vqrshrn.s16     d23, q15, #2
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vtrn.16         q10, q11
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vadd.u8         q10, q10, q0
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vtrn.8          d22, d23
    vst1.8          {d20}, [TMP1]
    vadd.u8         q11, q11, q0
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vtrn.16         ROW2L, ROW3L
    vtrn.16         ROW0L, ROW1L
    vtrn.16         ROW4L, ROW5L
    vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
    vtrn.32         ROW1L, ROW3L
    vtrn.32         ROW4L, ROW6L
    vtrn.32         ROW0L, ROW2L
    vtrn.32         ROW5L, ROW7L

    cmp             r0, #0
    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2}, [ip, :64] /* reload constants */
    vmull.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vshll.s16       q3, ROW0L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16 /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW0L, #13
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16 /* ROW5L <-> ROW1R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16 /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2}, [ip, :64] /* reload constants */
    vmull.s16       q6, ROW5L, XFIX_1_175875602
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW7L, XFIX_1_175875602
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW6L, XFIX_0_541196100
    vshll.s16       q3, ROW4L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1, #16 /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW4L, #13
    vshrn.s32       ROW6L, q1, #16 /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16 /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16 /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
    b               2b /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R
.endfunc


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
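
/*
 * A scalar C sketch of the trick described above (hypothetical helper,
 * ignoring the saturation that the real instruction performs):
 *
 *   static inline int16_t vqdmulh_s16(int16_t a, int16_t b)
 *   {
 *       return (int16_t) (((int32_t) a * b * 2) >> 16);  // multiply by b/2^15
 *   }
 *
 *   // x * 1.414213562 is computed as x * 0.414213562 + x, where the
 *   // fractional part is encoded as (362 * 128 - 256 * 128) = 13568,
 *   // i.e. roughly 0.414 * 2^15:
 *   int16_t r = vqdmulh_s16(x, 362 * 128 - 256 * 128) + x;
 */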

#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( q8  )
     *   1  | d18    | d19    ( q9  )
     *   2  | d20    | d21    ( q10 )
     *   3  | d22    | d23    ( q11 )
     *   4  | d24    | d25    ( q12 )
     *   5  | d26    | d27    ( q13 )
     *   6  | d28    | d29    ( q14 )
     *   7  | d30    | d31    ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64] /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8-d13} /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
    /* Transpose */
    vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
    vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q8, q10
    vtrn.32         q13, q15
    vswp            d28, d21
    vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2, q10, q14
    vswp            d30, d23
    vadd.s16        q14, q10, q14
    vswp            d24, d17
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vpop            {d8-d13} /* restore NEON registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit */
    vmov.u8         q0, #0x80
    vqshrn.s16      d16, q8, #5
    vqshrn.s16      d17, q9, #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
 *       requires many fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability would suffer somewhat.
 */
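
/*
 * The shift amounts passed to idct_helper below follow the jpeg-6b
 * descaling scheme (a sketch; DESCALE is the standard libjpeg rounding
 * right shift):
 *
 *   #define DESCALE(x, n)  (((x) + ((INT32) 1 << ((n) - 1))) >> (n))
 *
 *   pass 1: DESCALE(v, CONST_BITS - PASS1_BITS + 1)      i.e. shift 12
 *   pass 2: DESCALE(v, CONST_BITS + PASS1_BITS + 3 + 1)  i.e. shift 19
 *
 * with CONST_BITS = 13 and PASS1_BITS = 2.
 */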

#define CONST_BITS 13

#define FIX_0_211164243 (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
#define FIX_3_624509785 (29692) /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065     /* d0[0] */
    .short -FIX_0_765366865    /* d0[1] */
    .short -FIX_0_211164243    /* d0[2] */
    .short FIX_1_451774981     /* d0[3] */
    .short -FIX_2_172734803    /* d1[0] */
    .short FIX_1_061594337     /* d1[1] */
    .short -FIX_0_509795579    /* d1[2] */
    .short -FIX_0_601344887    /* d1[3] */
    .short FIX_0_899976223     /* d2[0] */
    .short FIX_2_562915447     /* d2[1] */
    .short 1 << (CONST_BITS+1) /* d2[2] */
    .short 0                   /* d2[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4, d2[2]
    vmlal.s16       q14, \x8, d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6, d2[1]

    vmull.s16       q15, \x4, d2[2]
    vmlsl.s16       q15, \x8, d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6, d1[1]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
.endif

    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q15, q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
.else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
.endif

.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req r1
    TMP3        .req r2
    TMP4        .req ip

    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | d8     | d9
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | d14    | d15
     *   7  | d16    | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
 *       requires many fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */
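
/*
 * Each 1-D pass of the 2x2 iDCT computes only two outputs per column.
 * A scalar C sketch of what idct_helper below evaluates (x0 is the DC
 * row, x1/x3/x5/x7 the odd rows; shift is 13 in pass 1 and 20 in pass 2):
 *
 *   INT32 odd  = FIX_3_624509785 * x1 - FIX_1_272758580 * x3 +
 *                FIX_0_850430095 * x5 - FIX_0_720959822 * x7;
 *   INT32 even = (INT32) x0 << 15;
 *   out0 = (even + odd + (1 << (shift - 1))) >> shift;
 *   out1 = (even - odd + (1 << (shift - 1))) >> shift;
 */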

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822 /* d0[0] */
    .short FIX_0_850430095  /* d0[1] */
    .short -FIX_1_272758580 /* d0[2] */
    .short FIX_3_624509785  /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16       q14, \x4, #15
    vmull.s16       q13, \x6, d0[3]
    vmlal.s16       q13, \x10, d0[2]
    vmlal.s16       q13, \x12, d0[1]
    vmlal.s16       q13, \x16, d0[0]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

.if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y27, q14
.else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y27, q14, #\shift
.endif

.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE   .req r0
    COEF_BLOCK  .req r1
    OUTPUT_BUF  .req r2
    OUTPUT_COL  .req r3
    TMP1        .req r0
    TMP2        .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d4     | d5
     *   1  | d6     | d7
     *   2  | -      | -
     *   3  | d10    | d11
     *   4  | -      | -
     *   5  | d12    | d13
     *   6  | -      | -
     *   7  | d16    | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

    /* Pass 1 */
#if 0
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11
#else
    vmull.s16       q13, d6, d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7, d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4, #15
    vshll.s16       q15, d5, #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4, q10, #13
    vrshrn.s32      d6, q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5, q10, #13
    vrshrn.s32      d7, q14, #13
    vtrn.16         q2, q3
    vtrn.32         q3, q5
#endif

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
.endfunc

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */


.macro do_load size
    .if \size == 8
        vld1.8      {d4}, [U, :64]!
        vld1.8      {d5}, [V, :64]!
        vld1.8      {d0}, [Y, :64]!
        pld         [U, #64]
        pld         [V, #64]
        pld         [Y, #64]
    .elseif \size == 4
        vld1.8      {d4[0]}, [U]!
        vld1.8      {d4[1]}, [U]!
        vld1.8      {d4[2]}, [U]!
        vld1.8      {d4[3]}, [U]!
        vld1.8      {d5[0]}, [V]!
        vld1.8      {d5[1]}, [V]!
        vld1.8      {d5[2]}, [V]!
        vld1.8      {d5[3]}, [V]!
        vld1.8      {d0[0]}, [Y]!
        vld1.8      {d0[1]}, [Y]!
        vld1.8      {d0[2]}, [Y]!
        vld1.8      {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8      {d4[4]}, [U]!
        vld1.8      {d4[5]}, [U]!
        vld1.8      {d5[4]}, [V]!
        vld1.8      {d5[5]}, [V]!
        vld1.8      {d0[4]}, [Y]!
        vld1.8      {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8      {d4[6]}, [U]!
        vld1.8      {d5[6]}, [V]!
        vld1.8      {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8  {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2 stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1\g_offs, q10
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1
    vld1.8          {d4}, [U, :64]!
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vld1.8          {d5}, [V, :64]!
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q10, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    vqmovun.s16     d1\g_offs, q10
    vld1.8          {d0}, [Y, :64]!
    vqmovun.s16     d1\r_offs, q12
    pld             [U, #64]
    pld             [V, #64]
    pld             [Y, #64]
    vqmovun.s16     d1\b_offs, q14
    vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
    do_store        \bpp, 8
    vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */
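
/*
 * The constant table below encodes the standard JFIF YCbCr->RGB equations
 * (shown here for reference; Cb/Cr are the loaded u/v samples minus 128):
 *
 *   R = Y + 1.40200 * Cr                    1.40200 * 2^14 =  22971
 *   G = Y - 0.34414 * Cb - 0.71414 * Cr    -0.34414 * 2^15 = -11277
 *                                          -0.71414 * 2^15 = -23401
 *   B = Y + 1.77200 * Cb                    1.77200 * 2^14 =  29033
 *
 * which matches the #14/#15 narrowing shifts in the stage 2 macros above.
 */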

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0,     0,      0,      0
    .short 22971, -11277, -23401, 29033
    .short -128,  -128,   -128,   -128
    .short -128,  -128,   -128,   -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH  .req r0
    INPUT_BUF     .req r1
    INPUT_ROW     .req r2
    OUTPUT_BUF    .req r3
    NUM_ROWS      .req r4

    INPUT_BUF0    .req r5
    INPUT_BUF1    .req r6
    INPUT_BUF2    .req INPUT_BUF

    RGB           .req r7
    Y             .req r8
    U             .req r9
    V             .req r10
    N             .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    blt             2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */

.macro do_store size
    .if \size == 8
        vst1.8      {d20}, [Y]!
        vst1.8      {d21}, [U]!
        vst1.8      {d22}, [V]!
    .elseif \size == 4
        vst1.8      {d20[0]}, [Y]!
        vst1.8      {d20[1]}, [Y]!
        vst1.8      {d20[2]}, [Y]!
        vst1.8      {d20[3]}, [Y]!
        vst1.8      {d21[0]}, [U]!
        vst1.8      {d21[1]}, [U]!
        vst1.8      {d21[2]}, [U]!
        vst1.8      {d21[3]}, [U]!
        vst1.8      {d22[0]}, [V]!
        vst1.8      {d22[1]}, [V]!
        vst1.8      {d22[2]}, [V]!
        vst1.8      {d22[3]}, [V]!
    .elseif \size == 2
        vst1.8      {d20[4]}, [Y]!
        vst1.8      {d20[5]}, [Y]!
        vst1.8      {d21[4]}, [U]!
        vst1.8      {d21[5]}, [U]!
        vst1.8      {d22[4]}, [V]!
        vst1.8      {d22[5]}, [V]!
    .elseif \size == 1
        vst1.8      {d20[6]}, [Y]!
        vst1.8      {d21[6]}, [U]!
        vst1.8      {d22[6]}, [V]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_load bpp, size
    .if \bpp == 24
        .if \size == 8
            vld3.8  {d10, d11, d12}, [RGB]!
            pld     [RGB, #128]
        .elseif \size == 4
            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vld4.8  {d10, d11, d12, d13}, [RGB]!
            pld     [RGB, #128]
        .elseif \size == 4
            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2 stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vrev64.32       q9, q1
    vrev64.32       q13, q1
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10 /* d20 = y */
    vmovn.u16       d21, q11 /* d21 = u */
    vmovn.u16       d22, q12 /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vrev64.32       q9, q1
    vshrn.u32       d23, q13, #16
    vrev64.32       q13, q1
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
    vmovn.u16       d20, q10      /* d20 = y */
    vmovl.u8        q2, d1\r_offs /* r = { d4, d5 } */
    vmovn.u16       d21, q11      /* d21 = u */
    vmovl.u8        q3, d1\g_offs /* g = { d6, d7 } */
    vmovn.u16       d22, q12      /* d22 = v */
    vmovl.u8        q4, d1\b_offs /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vst1.8          {d20}, [Y]!
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vst1.8          {d21}, [U]!
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vst1.8          {d22}, [V]!
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm
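
/*
 * The constant table below encodes the standard JFIF RGB->YCbCr equations
 * scaled by 2^16 (shown here for reference):
 *
 *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B        (19595, 38470,  7471)
 *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128  (11059, 21709, 32768)
 *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128  (32768, 27439,  5329)
 *
 * The 32767/128 rows appear to seed the Cb/Cr accumulators (via the
 * 'vrev64.32 qN, q1' instructions above) with (128 << 16) + 32767, i.e.
 * the +128 offset plus a rounding bias for the truncating VSHRN #16.
 */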

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    .short 19595, 38470, 7471,  11059
    .short 21709, 32768, 27439, 5329
    .short 32767, 128,   32767, 128
    .short 32767, 128,   32767, 128

asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH  .req r0
    INPUT_BUF     .req r1
    OUTPUT_BUF    .req r2
    OUTPUT_ROW    .req r3
    NUM_ROWS      .req r4

    OUTPUT_BUF0   .req r5
    OUTPUT_BUF1   .req r6
    OUTPUT_BUF2   .req OUTPUT_BUF

    RGB           .req r7
    Y             .req r8
    U             .req r9
    V             .req r10
    N             .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq          OUTPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N
.endfunc

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of VST1.16 instructions
 */
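
/*
 * Per sample this is simply (a scalar C sketch of what the VSUBL.U8
 * instructions below do 8 samples at a time):
 *
 *   workspace[i] = (DCTELEM) sample_row[start_col + i] - CENTERJSAMPLE;
 */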

asm_function jsimd_convsamp_neon
    SAMPLE_DATA .req r0
    START_COL   .req r1
    WORKSPACE   .req r2
    TMP1        .req r3
    TMP2        .req r4
    TMP3        .req r5
    TMP4        .req ip

    push            {r4, r5}
    vmov.u8         d0, #128

    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop             {r4, r5}
    bx              lr

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
.endfunc


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */

#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)              /* XFIX_0_382683433 */
    .short (139 * 128)             /* XFIX_0_541196100 */
    .short (181 * 128)             /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */

asm_function jsimd_fdct_ifast_neon

    DATA .req r0
    TMP  .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | q8
     *   1  | d18    | d19    | q9
     *   2  | d20    | d21    | q10
     *   3  | d22    | d23    | q11
     *   4  | d24    | d25    | q12
     *   5  | d26    | d27    | q13
     *   6  | d28    | d29    | q14
     *   7  | d30    | d31    | q15
     */

    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
asm_function jsimd_fdct_ifast_neon

    DATA            .req r0
    TMP             .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  |  d16   |  d17   | q8
     *   1  |  d18   |  d19   | q9
     *   2  |  d20   |  d21   | q10
     *   3  |  d22   |  d23   | q11
     *   4  |  d24   |  d25   | q12
     *   5  |  d26   |  d27   | q13
     *   6  |  d28   |  d29   | q14
     *   7  |  d30   |  d31   | q15
     */

    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)

    mov             TMP, #2
1:
    /* Transpose */
    vtrn.16         q12, q13
    vtrn.16         q10, q11
    vtrn.16         q8, q9
    vtrn.16         q14, q15
    vtrn.32         q9, q11
    vtrn.32         q13, q15
    vtrn.32         q8, q10
    vtrn.32         q12, q14
    vswp            d30, d23
    vswp            d24, d17
    vswp            d26, d19
    /* 1-D FDCT */
    vadd.s16        q2, q11, q12
    vswp            d28, d21
    vsub.s16        q12, q11, q12
    vsub.s16        q6, q10, q13
    vadd.s16        q10, q10, q13
    vsub.s16        q7, q9, q14
    vadd.s16        q9, q9, q14
    vsub.s16        q1, q8, q15
    vadd.s16        q8, q8, q15
    vsub.s16        q4, q9, q10
    vsub.s16        q5, q8, q2
    vadd.s16        q3, q9, q10
    vadd.s16        q4, q4, q5
    vadd.s16        q2, q8, q2
    vqdmulh.s16     q4, q4, XFIX_0_707106781
    vadd.s16        q11, q12, q6
    vadd.s16        q8, q2, q3
    vsub.s16        q12, q2, q3
    vadd.s16        q3, q6, q7
    vadd.s16        q7, q7, q1
    vqdmulh.s16     q3, q3, XFIX_0_707106781
    vsub.s16        q6, q11, q7
    vadd.s16        q10, q5, q4
    vqdmulh.s16     q6, q6, XFIX_0_382683433
    vsub.s16        q14, q5, q4
    vqdmulh.s16     q11, q11, XFIX_0_541196100
    vqdmulh.s16     q5, q7, XFIX_1_306562965
    vadd.s16        q4, q1, q3
    vsub.s16        q3, q1, q3
    vadd.s16        q7, q7, q6
    vadd.s16        q11, q11, q6
    vadd.s16        q7, q7, q5
    vadd.s16        q13, q3, q11
    vsub.s16        q11, q3, q11
    vadd.s16        q9, q4, q7
    vsub.s16        q15, q4, q7
    subs            TMP, TMP, #1
    bne             1b

    /* store results */
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]

    vpop            {d8-d15}
    bx              lr

    .unreq          DATA
    .unreq          TMP
.endfunc


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
 *                      DCTELEM * workspace);
 *
 * Note: the code uses two-stage pipelining to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
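/*
 * Reference C sketch of the per-coefficient arithmetic implemented below
 * (hypothetical code for documentation purposes only; the reciprocal,
 * correction and shift tables live in the 'divisors' argument at offsets
 * 0, 64 and 192 DCTELEMs, respectively):
 *
 *   short quantize_one(short x, unsigned short recip,
 *                      unsigned short corr, unsigned short shift)
 *   {
 *       short sign = x >> 15;                    /* 0 or -1  (vshr.s16)  */
 *       unsigned short mag = (x ^ sign) - sign;  /* |x|      (vabs.s16)  */
 *       unsigned prod = (mag + corr) * (unsigned) recip;  /* vmull.u16   */
 *       unsigned short q = (prod >> 16) >> shift;  /* vshrn, vshl by -n  */
 *       return (q ^ sign) - sign;                  /* veor, vsub         */
 *   }
 */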
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16        q12, q0
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15  /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13

    push            {r4, r5}
    mov             LOOP_COUNT, #3
1:
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2  /* restore sign */
    vabs.s16        q12, q0
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15  /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1
    bne             1b
    pop             {r4, r5}

    veor.u16        q14, q14, q2  /* restore sign */
    veor.u16        q15, q15, q3
    vsub.u16        q14, q14, q2
    vsub.u16        q15, q15, q3
    vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx              lr  /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT
.endfunc


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
 *                                 JDIMENSION downsampled_width,
 *                                 JSAMPARRAY input_data,
 *                                 JSAMPARRAY * output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code; addressing it could potentially yield a performance
 *       improvement of up to tens of percent on Cortex-A8/Cortex-A9.
 */
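/*
 * Reference C sketch of one upsampled row, matching the 'fancy'
 * h2v1 upsampling rule from jpeglib's jdsample.c (hypothetical code,
 * for documentation purposes only; 'in' has 'width' pixels and 'out'
 * has 2 * width pixels):
 *
 *   out[0] = in[0];
 *   out[2 * width - 1] = in[width - 1];
 *   for (i = 1; i < width; i++)
 *       out[2 * i]     = (3 * in[i] + in[i - 1] + 1) >> 2;
 *   for (i = 0; i < width - 1; i++)
 *       out[2 * i + 1] = (3 * in[i] + in[i + 1] + 2) >> 2;
 *
 * The two different rounding biases (+1 vs. +2) are why the NEON code
 * below uses VSHRN with a +1 bias pre-added from q15 for one output and
 * VRSHRN (rounding, i.e. +2 before the shift) for the other.
 */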
/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded into q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3. Register q15 is used
 * for adding the +1 bias.
 */
.macro upsample16 OUTPTR, INPTR
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vmov            q1, q0  /* backup source pixels to q1 */
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are reversed
 * for even and odd groups of 16 pixels, which is why no 'vmov q1, q0'
 * instructions are needed. This unrolling also allows loads and stores
 * to be reordered to hide multiplication latency and reduce stalls.
 */
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    /* odd 16 pixels group */
    vld1.8          {q1}, [\INPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vmovl.u8        q8, d2
    vext.8          q2, q0, q1, #15
    vmovl.u8        q9, d3
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d2, d28
    vmlal.u8        q11, d3, d28
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
    /* special case for the first and last pixels */
    sub             \WIDTH, \WIDTH, #1
    add             \OUTPTR, \OUTPTR, #1
    ldrb            \TMP1, [\INPTR, \WIDTH]
    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]  /* out[2 * width - 1] = in[width - 1] */
    ldrb            \TMP1, [\INPTR], #1
    strb            \TMP1, [\OUTPTR, #-1]             /* out[0] = in[0] */
    vmov.8          d3[7], \TMP1  /* in[0] becomes the 'previous' pixel of the first group */

    subs            \WIDTH, \WIDTH, #32
    blt             5f
0:  /* process 32 pixels per iteration */
    upsample32      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #32
    bge             0b
5:
    adds            \WIDTH, \WIDTH, #16
    blt             1f
0:  /* process 16 pixels if needed */
    upsample16      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #16
1:
    adds            \WIDTH, \WIDTH, #16
    beq             9f

    /* load the remaining 1-15 pixels, gathered right-aligned into d0/d1 */
    add             \INPTR, \INPTR, \WIDTH
    tst             \WIDTH, #1
    beq             2f
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #2
    beq             2f
    vext.8          d0, d0, d0, #6
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #4
    beq             2f
    vrev64.32       d0, d0
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[3]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[2]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
2:
    tst             \WIDTH, #8
    beq             2f
    vmov            d1, d0
    sub             \INPTR, \INPTR, #8
    vld1.8          {d0}, [\INPTR]
2:  /* upsample the remaining pixels */
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vrshrn.u16      d10, q8, #2
    vrshrn.u16      d12, q9, #2
    vshrn.u16       d11, q10, #2
    vshrn.u16       d13, q11, #2
    vzip.8          d10, d11
    vzip.8          d12, d13
    /* store the remaining pixels */
    tst             \WIDTH, #8
    beq             2f
    vst1.8          {d10, d11}, [\OUTPTR]!
    vmov            q5, q6
2:
    tst             \WIDTH, #4
    beq             2f
    vst1.8          {d10}, [\OUTPTR]!
    vmov            d10, d11
2:
    tst             \WIDTH, #2
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
    vst1.8          {d10[2]}, [\OUTPTR]!
    vst1.8          {d10[3]}, [\OUTPTR]!
    vext.8          d10, d10, d10, #4
2:
    tst             \WIDTH, #1
    beq             2f
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
2:
9:
.endm
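/*
 * The function below is essentially (hypothetical C equivalent, for
 * documentation purposes only):
 *
 *   for (row = 0; row < max_v_samp_factor; row++)
 *       upsample_row((*output_data_ptr)[row], input_data[row],
 *                    downsampled_width);
 */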
asm_function jsimd_h2v1_fancy_upsample_neon

    MAX_V_SAMP_FACTOR .req r0
    DOWNSAMPLED_WIDTH .req r1
    INPUT_DATA        .req r2
    OUTPUT_DATA_PTR   .req r3
    OUTPUT_DATA       .req OUTPUT_DATA_PTR

    OUTPTR            .req r4
    INPTR             .req r5
    WIDTH             .req ip
    TMP               .req lr

    push            {r4, r5, r6, lr}
    vpush           {d8-d15}

    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp             MAX_V_SAMP_FACTOR, #0
    ble             99f

    /* initialize constants */
    vmov.u8         d28, #3
    vmov.u16        q15, #1
11:
    ldr             INPTR, [INPUT_DATA], #4
    ldr             OUTPTR, [OUTPUT_DATA], #4
    mov             WIDTH, DOWNSAMPLED_WIDTH
    upsample_row    OUTPTR, INPTR, WIDTH, TMP
    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
    bgt             11b

99:
    vpop            {d8-d15}
    pop             {r4, r5, r6, pc}

    .unreq          MAX_V_SAMP_FACTOR
    .unreq          DOWNSAMPLED_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA_PTR
    .unreq          OUTPUT_DATA

    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          WIDTH
    .unreq          TMP

.endfunc

.purgem upsample16
.purgem upsample32
.purgem upsample_row