pixman-arm-neon-asm-bilinear.S revision 1176bdada62cabc6ec4b0308a930e83b679d5d36
/*
 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author: Taekyun Kim (tkq.kim@samsung.com)
 */

/*
 * This file contains scaled bilinear scanline functions implemented
 * using older siarhei's bilinear macro template.
 *
 * << General scanline function procedures >>
 * 1. bilinear interpolate source pixels
 * 2. load mask pixels
 * 3. load destination pixels
 * 4. duplicate mask to fill whole register
 * 5. interleave source & destination pixels
 * 6. apply mask to source pixels
 * 7. combine source & destination pixels
 * 8. Deinterleave final result
 * 9. store destination pixels
 *
 * All registers with single number (i.e. src0, tmp0) are 64-bits registers.
 * Registers with double numbers(src01, dst01) are 128-bits registers.
 * All temp registers can be used freely outside the code block.
 * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
 *
 * Remarks
 * There can be lots of pipeline stalls inside code block and between code blocks.
 * Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.eabi_attribute 10, 0
.eabi_attribute 12, 0
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/*
 * Bilinear macros from pixman-arm-neon-asm.S
 */

/*
 * Supplementary macro for setting function attributes.
 * Declares 'fname' as a global, ELF-hidden function symbol and emits its
 * label; used instead of writing the boilerplate at every entry point.
 */
.macro pixman_asm_function fname
    .func fname
    .global fname
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
.endm

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */

/*
 * Fetch two vertically adjacent a8r8g8b8 pixels for one output pixel.
 * X is a 16.16 fixed-point horizontal coordinate (integer part selects the
 * column, X is then advanced by UX); STRIDE jumps from top to bottom row.
 */
.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {reg1}, [TMP1], STRIDE
    vld1.32   {reg2}, [TMP1]
.endm

/* Same as above for r5g6b5 source; pixels are widened to x888 after the load. */
.macro bilinear_load_0565 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    vld1.32   {reg2[0]}, [TMP1], STRIDE
    vld1.32   {reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm

/*
 * Load top/bottom pairs for two output pixels and do the vertical
 * interpolation step: acc = top * d28 (WT) + bottom * d29 (WB).
 */
.macro bilinear_load_and_vertical_interpolate_two_8888 \
                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

/* Four-pixel variant: simply two back-to-back two-pixel loads. */
.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.endm

/*
 * 0565 two-pixel variant: loads are done into the accumulator halves, the
 * pixels are widened to x888 and transposed with vzip before the weighted
 * vertical interpolation (d28 = WT, d29 = WB).
 */
.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {acc2lo[1]}, [TMP1]
    vld1.32   {acc2hi[1]}, [TMP2]
    convert_0565_to_x888 acc2, reg3, reg2, reg1
    vzip.u8   reg1, reg3
    vzip.u8   reg2, reg4
    vzip.u8   reg3, reg4
    vzip.u8   reg1, reg2
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

/*
 * 0565 four-pixel variant. The loads for the second pixel pair are
 * interleaved with the vzip/vmull work of the first pair to hide latency.
 */
.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {xacc2lo[1]}, [TMP1]
    vld1.32   {xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
    vzip.u8   xreg1, xreg3
    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
    vzip.u8   xreg2, xreg4
    vld1.32   {yacc2lo[1]}, [TMP1]
    vzip.u8   xreg3, xreg4
    vld1.32   {yacc2hi[1]}, [TMP2]
    vzip.u8   xreg1, xreg2
    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
    vmull.u8  xacc1, xreg1, d28
    vzip.u8   yreg1, yreg3
    vmlal.u8  xacc1, xreg2, d29
    vzip.u8   yreg2, yreg4
    vmull.u8  xacc2, xreg3, d28
    vzip.u8   yreg3, yreg4
    vmlal.u8  xacc2, xreg4, d29
    vzip.u8   yreg1, yreg2
    vmull.u8  yacc1, yreg1, d28
    vmlal.u8  yacc1, yreg2, d29
    vmull.u8  yacc2, yreg3, d28
    vmlal.u8  yacc2, yreg4, d29
.endm

/* Store numpix a8r8g8b8 result pixels from d0/d1 and advance OUT. */
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
    vst1.32   {d0, d1}, [OUT]!
.elseif numpix == 2
    vst1.32   {d0}, [OUT]!
.elseif numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    .error bilinear_store_8888 numpix is unsupported
.endif
.endm

/*
 * Store numpix r5g6b5 result pixels: de-interleave the planar d0-d3 layout
 * back to packed pixels, convert 8888 -> 0565, then store and advance OUT.
 */
.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp.u8   d0, d1
    vuzp.u8   d2, d3
    vuzp.u8   d1, d3
    vuzp.u8   d0, d2
    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
.if numpix == 4
    vst1.16   {d2}, [OUT]!
.elseif numpix == 2
    vst1.32   {d2[0]}, [OUT]!
.elseif numpix == 1
    vst1.16   {d2[0]}, [OUT]!
.else
    .error bilinear_store_0565 numpix is unsupported
.endif
.endm


/*
 * Macros for loading mask pixels into register 'mask'.
 * vdup must be done in somewhere else.
 */
.macro bilinear_load_mask_x numpix, mask
.endm

.macro bilinear_load_mask_8 numpix, mask
.if numpix == 4
    vld1.32   {mask[0]}, [MASK]!
.elseif numpix == 2
    vld1.16   {mask[0]}, [MASK]!
.elseif numpix == 1
    vld1.8    {mask[0]}, [MASK]!
.else
    .error bilinear_load_mask_8 numpix is unsupported
.endif
    pld       [MASK, #prefetch_offset]
.endm

/* Dispatch on mask_fmt ('x' = no mask, '8' = a8 mask) via token pasting. */
.macro bilinear_load_mask mask_fmt, numpix, mask
    bilinear_load_mask_&mask_fmt numpix, mask
.endm


/*
 * Macros for loading destination pixels into register 'dst0' and 'dst1'.
 * Interleave should be done somewhere else.
 */
.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.if numpix == 4
    vld1.32   {dst0, dst1}, [OUT]
.elseif numpix == 2
    vld1.32   {dst0}, [OUT]
.elseif numpix == 1
    vld1.32   {dst0[0]}, [OUT]
.else
    .error bilinear_load_dst_8888 numpix is unsupported
.endif
    pld       [OUT, #(prefetch_offset * 4)]
.endm

.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.endm

/* Dispatch on destination format and operator ('src' variants load nothing). */
.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
.endm

/*
 * Macros for duplicating partially loaded mask to fill entire register.
 * We will apply mask to interleaved source pixels, that is
 * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * So, we need to duplicate loaded mask into whole register.
 *
 * For two pixel case
 * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * We can do some optimizations for this including last pixel cases.
 */
.macro bilinear_duplicate_mask_x numpix, mask
.endm

.macro bilinear_duplicate_mask_8 numpix, mask
.if numpix == 4
    vdup.32   mask, mask[0]
.elseif numpix == 2
    vdup.16   mask, mask[0]
.elseif numpix == 1
    vdup.8    mask, mask[0]
.else
    .error bilinear_duplicate_mask_8 is unsupported
.endif
.endm

.macro bilinear_duplicate_mask mask_fmt, numpix, mask
    bilinear_duplicate_mask_&mask_fmt numpix, mask
.endm

/*
 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
 * Interleave should be done when mask is enabled or operator is 'over'.
 */
.macro bilinear_interleave src0, src1, dst0, dst1
    vuzp.8    src0, src1
    vuzp.8    dst0, dst1
    vuzp.8    src0, src1
    vuzp.8    dst0, dst1
.endm

/* No-op: 'src' with no mask needs no planar form. */
.macro bilinear_interleave_src_dst_x_src \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_x_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

/* No-op: 'add' with no mask works on packed pixels directly. */
.macro bilinear_interleave_src_dst_x_add \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_8_src \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst_8_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst_8_add \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst \
                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave_src_dst_&mask_fmt&_&op \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm


/*
 * Macros for applying masks to src pixels. (see combine_mask_u() function)
 * src, dst should be in interleaved form.
 * mask register should be in form (m0, m1, m2, m3).
 */
.macro bilinear_apply_mask_to_src_x \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67
.endm

/* src = (src * mask + 127) / 255, computed as x + ((x + (x >> 8)) >> 8). */
.macro bilinear_apply_mask_to_src_8 \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    vmull.u8  tmp01, src0, mask
    vmull.u8  tmp23, src1, mask
    /* bubbles */
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    /* bubbles */
    vraddhn.u16 src0, tmp45, tmp01
    vraddhn.u16 src1, tmp67, tmp23
.endm

.macro bilinear_apply_mask_to_src \
                mask_fmt, numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    bilinear_apply_mask_to_src_&mask_fmt \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67
.endm


/*
 * Macros for combining src and destination pixels.
 * Interleave or not is depending on operator 'op'.
 */
.macro bilinear_combine_src \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8
.endm

/* OVER: dst = src + dst * (255 - src.alpha); alpha taken from src1[1]. */
.macro bilinear_combine_over \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vdup.32   tmp8, src1[1]
    /* bubbles */
    vmvn.8    tmp8, tmp8
    /* bubbles */
    vmull.u8  tmp01, dst0, tmp8
    /* bubbles */
    vmull.u8  tmp23, dst1, tmp8
    /* bubbles */
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    /* bubbles */
    vraddhn.u16 dst0, tmp45, tmp01
    vraddhn.u16 dst1, tmp67, tmp23
    /* bubbles */
    vqadd.u8  src01, dst01, src01
.endm

/* ADD: saturating per-channel addition. */
.macro bilinear_combine_add \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vqadd.u8  src01, dst01, src01
.endm

.macro bilinear_combine \
                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    bilinear_combine_&op \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8
.endm

/*
 * Macros for final deinterleaving of
 * destination pixels if needed.
 */
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
    vuzp.8    dst0, dst1
    /* bubbles */
    vuzp.8    dst0, dst1
.endm

.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
.endm


/*
 * Interpolate a single pixel without updating the horizontal weight
 * registers (q15/d30-d31); used for the trailing odd pixel.
 */
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_&src_fmt d0, d1, d2
    bilinear_load_mask mask_fmt, 1, d4
    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    bilinear_duplicate_mask mask_fmt, 1, d4
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_interleave_src_dst \
                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                mask_fmt, 1, d0, d1, q0, d4, \
                q3, q8, q10, q11
    bilinear_combine \
                op, 1, d0, d1, q0, d18, d19, q9, \
                q3, q8, q10, q11, d5
    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
    bilinear_store_&dst_fmt 1, q2, q3
.endm

/* Interpolate two pixels and step the horizontal weights (q12 += q13). */
.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    bilinear_load_mask mask_fmt, 2, d4
    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 2, d4
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_interleave_src_dst \
                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                mask_fmt, 2, d0, d1, q0, d4, \
                q3, q8, q10, q11
    bilinear_combine \
                op, 2, d0, d1, q0, d18, d19, q9, \
                q3, q8, q10, q11, d5
    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
    bilinear_store_&dst_fmt 2, q2, q3
.endm

/* Interpolate four pixels; weights are stepped twice (two pixel pairs). */
.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9, d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    bilinear_load_mask mask_fmt, 4, d22
    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
    pld       [TMP1, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 4, d22
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_interleave_src_dst \
                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
    bilinear_apply_mask_to_src \
                mask_fmt, 4, d0, d1, q0, d22, \
                q3, q8, q9, q10
    bilinear_combine \
                op, 4, d0, d1, q0, d2, d3, q1, \
                q3, q8, q9, q10, d23
    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
    bilinear_store_&dst_fmt 4, q2, q3
.endm

.set BILINEAR_FLAG_USE_MASK,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*
 * Main template macro for generating NEON optimized bilinear scanline functions.
 *
 * Bilinear scanline generator macro takes the following arguments:
 *  fname             - name of the function to generate
 *  src_fmt           - source color format (8888 or 0565)
 *  dst_fmt           - destination color format (8888 or 0565)
 *  src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
 *  process_last_pixel - code block that interpolate one pixel and does not
 *                      update horizontal weight
 *  process_two_pixels - code block that interpolate two pixels and update
 *                      horizontal weight
 *  process_four_pixels - code block that interpolate four pixels and update
 *                      horizontal weight
 *  process_pixblock_head - head part of middle loop
 *  process_pixblock_tail - tail part of middle loop
 *  process_pixblock_tail_head - tail_head of middle loop
 *  pixblock_size     - number of pixels processed in a single middle loop
 *  prefetch_distance - prefetch in the source image by that many pixels ahead
 */

.macro generate_bilinear_scanline_func \
        fname, \
        src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
        bilinear_process_last_pixel, \
        bilinear_process_two_pixels, \
        bilinear_process_four_pixels, \
        bilinear_process_pixblock_head, \
        bilinear_process_pixblock_tail, \
        bilinear_process_pixblock_tail_head, \
        pixblock_size, \
        prefetch_distance, \
        flags

pixman_asm_function fname
.if pixblock_size == 8
.elseif pixblock_size == 4
.else
    .error unsupported pixblock size
.endif

/*
 * Register allocation differs depending on whether a mask argument is
 * present: without a mask the arguments are (out, top, bottom, wt, wb,
 * x, ux, width); with a mask, 'mask' is inserted as the second argument
 * and everything shifts by one register.
 */
.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    OUT       .req      r0
    TOP       .req      r1
    BOTTOM    .req      r2
    WT        .req      r3
    WB        .req      r4
    X         .req      r5
    UX        .req      r6
    WIDTH     .req      ip
    TMP1      .req      r3
    TMP2      .req      r4
    PF_OFFS   .req      r7
    TMP3      .req      r8
    TMP4      .req      r9
    STRIDE    .req      r2

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}
.else
    OUT       .req      r0
    MASK      .req      r1
    TOP       .req      r2
    BOTTOM    .req      r3
    WT        .req      r4
    WB        .req      r5
    X         .req      r6
    UX        .req      r7
    WIDTH     .req      ip
    TMP1      .req      r4
    TMP2      .req      r5
    PF_OFFS   .req      r8
    TMP3      .req      r9
    TMP4      .req      r10
    STRIDE    .req      r3

    .set prefetch_offset, prefetch_distance

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9, r10, ip}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WT, WB, X, UX, WIDTH}
.endif

    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpush     {d8-d15}
.endif

    /* STRIDE = byte distance between the two source rows */
    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f

    /*
     * q12 = per-lane horizontal coordinates, q13 = per-lane coordinate step,
     * d28/d29 = top/bottom vertical weights replicated to all lanes.
     */
    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment  */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_process_last_pixel
    sub       WIDTH, WIDTH, #1
0:
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    beq       0f
    bilinear_process_two_pixels
    sub       WIDTH, WIDTH, #2
0:
.if pixblock_size == 8
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    beq       0f
    bilinear_process_four_pixels
    sub       WIDTH, WIDTH, #4
0:
.endif
    /* software-pipelined middle loop: head, then tail_head, then tail */
    subs      WIDTH, WIDTH, #pixblock_size
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_process_pixblock_head
    subs      WIDTH, WIDTH, #pixblock_size
    blt       5f
0:
    bilinear_process_pixblock_tail_head
    subs      WIDTH, WIDTH, #pixblock_size
    bge       0b
5:
    bilinear_process_pixblock_tail
1:
.if pixblock_size == 8
    tst       WIDTH, #4
    beq       2f
    bilinear_process_four_pixels
2:
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       2f
    bilinear_process_two_pixels
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_process_last_pixel
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    pop       {r4, r5, r6, r7, r8, r9}
.else
    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
.endif
    bx        lr

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
    .unreq    MASK
.endif

.endfunc

.endm

/* src_8888_8_8888 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head
.endm

/* src_8888_8_0565 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head
.endm

/* src_0565_8_x888 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head
.endm

/* src_0565_8_0565 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
.endm

/* over_8888_8888 */
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over
.endm

/*
 * Hand-scheduled head: fetch four top/bottom pixel pairs and start vertical
 * interpolation (d28 = WT, d29 = WB), interleaving address generation with
 * the NEON multiplies to hide latency.
 */
.macro bilinear_over_8888_8888_process_pixblock_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2

    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29

    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29

    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29

    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29

    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
.endm

/*
 * Tail: finish horizontal interpolation of the in-flight block, read back
 * four destination pixels and apply the OVER operator, then store.
 */
.macro bilinear_over_8888_8888_process_pixblock_tail
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vmovn.u16 d7, q2
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vdup.32   d4, d7[1]
    vmvn.8    d4, d4
    vmull.u8  q11, d2, d4
    vmull.u8  q2, d3, d4
    vrshr.u16 q1, q11, #8
    vrshr.u16 q10, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q10, q2
    vqadd.u8  q3, q1, q3
    vuzp.8    d6, d7
    vuzp.8    d6, d7
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm

/*
 * Fused tail+head for the middle loop: finishes the previous four-pixel
 * block (horizontal interpolation + OVER + store) while simultaneously
 * fetching and vertically interpolating the next block.
 */
.macro bilinear_over_8888_8888_process_pixblock_tail_head
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vmlsl.u16 q2, d20, d30
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    pld       [OUT, PF_OFFS]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vmovn.u16 d7, q2
    vld1.32   {d22}, [TMP3], STRIDE
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vdup.32   d4, d7[1]
    vld1.32   {d23}, [TMP3]
    vmvn.8    d4, d4
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmull.u8  q11, d2, d4
    vmull.u8  q2, d3, d4
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vrshr.u16 q1, q11, #8
    vmlal.u16 q0, d17, d30
    vrshr.u16 q8, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q8, q2
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vqadd.u8  q3, q1, q3
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vuzp.8    d6, d7
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vuzp.8    d6, d7
    vmlsl.u16 q1, d18, d31
    vadd.u16  q12, q12, q13
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm

/* over_8888_8_8888 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, over
.endm

/*
 * Head for the masked OVER: fetch four top/bottom pixel pairs, start
 * vertical and horizontal interpolation, and load four a8 mask bytes
 * into d22[0] for the tail to expand.
 */
.macro bilinear_over_8888_8_8888_process_pixblock_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vld1.32   {d3}, [TMP2]
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2}, [TMP3], STRIDE
    vld1.32   {d3}, [TMP3]
    pld       [TMP4, PF_OFFS]
    vld1.32   {d4}, [TMP4], STRIDE
    vld1.32   {d5}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q3, d2, d28
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22[0]}, [MASK]!
    pld       [MASK, #prefetch_offset]
    vadd.u16  q12, q12, q13
    vmovn.u16 d16, q0
.endm

/*
 * Tail for the masked OVER: finish interpolation, multiply source by the
 * expanded mask, apply OVER against the loaded destination, and store.
 */
.macro bilinear_over_8888_8_8888_process_pixblock_tail
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vdup.32   d22, d22[0]
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d17, q9
    vld1.32   {d18, d19}, [OUT, :128]
    pld       [OUT, PF_OFFS]
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vdup.32   d22, d17[1]
    vmvn.8    d22, d22
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vrshr.u16 q9, q10, #8
    vrshr.u16 q0, q11, #8
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q0, q11
    vqadd.u8  q9, q8, q9
    vuzp.8    d18, d19
    vuzp.8    d18, d19
    vst1.32   {d18, d19}, [OUT, :128]!
1147.endm 1148 1149.macro bilinear_over_8888_8_8888_process_pixblock_tail_head 1150 vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS 1151 mov TMP1, X, asr #16 1152 add X, X, UX 1153 add TMP1, TOP, TMP1, asl #2 1154 vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS 1155 vld1.32 {d0}, [TMP1], STRIDE 1156 mov TMP2, X, asr #16 1157 add X, X, UX 1158 add TMP2, TOP, TMP2, asl #2 1159 vmlsl.u16 q9, d6, d30 1160 vmlsl.u16 q10, d2, d31 1161 vld1.32 {d1}, [TMP1] 1162 mov TMP3, X, asr #16 1163 add X, X, UX 1164 add TMP3, TOP, TMP3, asl #2 1165 vmlal.u16 q9, d7, d30 1166 vmlal.u16 q10, d3, d31 1167 vld1.32 {d2}, [TMP2], STRIDE 1168 mov TMP4, X, asr #16 1169 add X, X, UX 1170 add TMP4, TOP, TMP4, asl #2 1171 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 1172 vadd.u16 q12, q12, q13 1173 vld1.32 {d3}, [TMP2] 1174 vdup.32 d22, d22[0] 1175 vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS) 1176 vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 1177 vmull.u8 q2, d0, d28 1178 vmull.u8 q3, d2, d28 1179 vmovn.u16 d17, q9 1180 vld1.32 {d18, d19}, [OUT, :128] 1181 pld [OUT, #(prefetch_offset * 4)] 1182 vmlal.u8 q2, d1, d29 1183 vmlal.u8 q3, d3, d29 1184 vuzp.8 d16, d17 1185 vuzp.8 d18, d19 1186 vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS 1187 vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS 1188 vuzp.8 d16, d17 1189 vuzp.8 d18, d19 1190 vmlsl.u16 q0, d4, d30 1191 vmlsl.u16 q1, d6, d31 1192 vmull.u8 q10, d16, d22 1193 vmull.u8 q11, d17, d22 1194 vmlal.u16 q0, d5, d30 1195 vmlal.u16 q1, d7, d31 1196 vrsra.u16 q10, q10, #8 1197 vrsra.u16 q11, q11, #8 1198 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 1199 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 1200 vrshrn.u16 d16, q10, #8 1201 vrshrn.u16 d17, q11, #8 1202 vld1.32 {d2}, [TMP3], STRIDE 1203 vdup.32 d22, d17[1] 1204 vld1.32 {d3}, [TMP3] 1205 vmvn.8 d22, d22 1206 pld [TMP4, PF_OFFS] 1207 vld1.32 {d4}, [TMP4], STRIDE 1208 vmull.u8 q10, d18, d22 1209 vmull.u8 q11, d19, d22 1210 vld1.32 {d5}, [TMP4] 1211 pld 
[TMP4, PF_OFFS] 1212 vmull.u8 q3, d2, d28 1213 vrshr.u16 q9, q10, #8 1214 vrshr.u16 q15, q11, #8 1215 vmlal.u8 q3, d3, d29 1216 vmull.u8 q1, d4, d28 1217 vraddhn.u16 d18, q9, q10 1218 vraddhn.u16 d19, q15, q11 1219 vmlal.u8 q1, d5, d29 1220 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 1221 vqadd.u8 q9, q8, q9 1222 vld1.32 {d22[0]}, [MASK]! 1223 vuzp.8 d18, d19 1224 vadd.u16 q12, q12, q13 1225 vuzp.8 d18, d19 1226 vmovn.u16 d16, q0 1227 vst1.32 {d18, d19}, [OUT, :128]! 1228.endm 1229 1230/* add_8888_8888 */ 1231.macro bilinear_add_8888_8888_process_last_pixel 1232 bilinear_interpolate_last_pixel 8888, x, 8888, add 1233.endm 1234 1235.macro bilinear_add_8888_8888_process_two_pixels 1236 bilinear_interpolate_two_pixels 8888, x, 8888, add 1237.endm 1238 1239.macro bilinear_add_8888_8888_process_four_pixels 1240 bilinear_interpolate_four_pixels 8888, x, 8888, add 1241.endm 1242 1243.macro bilinear_add_8888_8888_process_pixblock_head 1244 bilinear_add_8888_8888_process_four_pixels 1245.endm 1246 1247.macro bilinear_add_8888_8888_process_pixblock_tail 1248.endm 1249 1250.macro bilinear_add_8888_8888_process_pixblock_tail_head 1251 bilinear_add_8888_8888_process_pixblock_tail 1252 bilinear_add_8888_8888_process_pixblock_head 1253.endm 1254 1255/* add_8888_8_8888 */ 1256.macro bilinear_add_8888_8_8888_process_last_pixel 1257 bilinear_interpolate_last_pixel 8888, 8, 8888, add 1258.endm 1259 1260.macro bilinear_add_8888_8_8888_process_two_pixels 1261 bilinear_interpolate_two_pixels 8888, 8, 8888, add 1262.endm 1263 1264.macro bilinear_add_8888_8_8888_process_four_pixels 1265 bilinear_interpolate_four_pixels 8888, 8, 8888, add 1266.endm 1267 1268.macro bilinear_add_8888_8_8888_process_pixblock_head 1269 bilinear_add_8888_8_8888_process_four_pixels 1270.endm 1271 1272.macro bilinear_add_8888_8_8888_process_pixblock_tail 1273.endm 1274 1275.macro bilinear_add_8888_8_8888_process_pixblock_tail_head 1276 bilinear_add_8888_8_8888_process_pixblock_tail 1277 
bilinear_add_8888_8_8888_process_pixblock_head 1278.endm 1279 1280 1281/* Bilinear scanline functions */ 1282generate_bilinear_scanline_func \ 1283 pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \ 1284 8888, 8888, 2, 2, \ 1285 bilinear_src_8888_8_8888_process_last_pixel, \ 1286 bilinear_src_8888_8_8888_process_two_pixels, \ 1287 bilinear_src_8888_8_8888_process_four_pixels, \ 1288 bilinear_src_8888_8_8888_process_pixblock_head, \ 1289 bilinear_src_8888_8_8888_process_pixblock_tail, \ 1290 bilinear_src_8888_8_8888_process_pixblock_tail_head, \ 1291 4, 28, BILINEAR_FLAG_USE_MASK 1292 1293generate_bilinear_scanline_func \ 1294 pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \ 1295 8888, 0565, 2, 1, \ 1296 bilinear_src_8888_8_0565_process_last_pixel, \ 1297 bilinear_src_8888_8_0565_process_two_pixels, \ 1298 bilinear_src_8888_8_0565_process_four_pixels, \ 1299 bilinear_src_8888_8_0565_process_pixblock_head, \ 1300 bilinear_src_8888_8_0565_process_pixblock_tail, \ 1301 bilinear_src_8888_8_0565_process_pixblock_tail_head, \ 1302 4, 28, BILINEAR_FLAG_USE_MASK 1303 1304generate_bilinear_scanline_func \ 1305 pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \ 1306 0565, 8888, 1, 2, \ 1307 bilinear_src_0565_8_x888_process_last_pixel, \ 1308 bilinear_src_0565_8_x888_process_two_pixels, \ 1309 bilinear_src_0565_8_x888_process_four_pixels, \ 1310 bilinear_src_0565_8_x888_process_pixblock_head, \ 1311 bilinear_src_0565_8_x888_process_pixblock_tail, \ 1312 bilinear_src_0565_8_x888_process_pixblock_tail_head, \ 1313 4, 28, BILINEAR_FLAG_USE_MASK 1314 1315generate_bilinear_scanline_func \ 1316 pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \ 1317 0565, 0565, 1, 1, \ 1318 bilinear_src_0565_8_0565_process_last_pixel, \ 1319 bilinear_src_0565_8_0565_process_two_pixels, \ 1320 bilinear_src_0565_8_0565_process_four_pixels, \ 1321 bilinear_src_0565_8_0565_process_pixblock_head, \ 1322 bilinear_src_0565_8_0565_process_pixblock_tail, \ 1323 
bilinear_src_0565_8_0565_process_pixblock_tail_head, \ 1324 4, 28, BILINEAR_FLAG_USE_MASK 1325 1326generate_bilinear_scanline_func \ 1327 pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \ 1328 8888, 8888, 2, 2, \ 1329 bilinear_over_8888_8888_process_last_pixel, \ 1330 bilinear_over_8888_8888_process_two_pixels, \ 1331 bilinear_over_8888_8888_process_four_pixels, \ 1332 bilinear_over_8888_8888_process_pixblock_head, \ 1333 bilinear_over_8888_8888_process_pixblock_tail, \ 1334 bilinear_over_8888_8888_process_pixblock_tail_head, \ 1335 4, 28, 0 1336 1337generate_bilinear_scanline_func \ 1338 pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \ 1339 8888, 8888, 2, 2, \ 1340 bilinear_over_8888_8_8888_process_last_pixel, \ 1341 bilinear_over_8888_8_8888_process_two_pixels, \ 1342 bilinear_over_8888_8_8888_process_four_pixels, \ 1343 bilinear_over_8888_8_8888_process_pixblock_head, \ 1344 bilinear_over_8888_8_8888_process_pixblock_tail, \ 1345 bilinear_over_8888_8_8888_process_pixblock_tail_head, \ 1346 4, 28, BILINEAR_FLAG_USE_MASK 1347 1348generate_bilinear_scanline_func \ 1349 pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \ 1350 8888, 8888, 2, 2, \ 1351 bilinear_add_8888_8888_process_last_pixel, \ 1352 bilinear_add_8888_8888_process_two_pixels, \ 1353 bilinear_add_8888_8888_process_four_pixels, \ 1354 bilinear_add_8888_8888_process_pixblock_head, \ 1355 bilinear_add_8888_8888_process_pixblock_tail, \ 1356 bilinear_add_8888_8888_process_pixblock_tail_head, \ 1357 4, 28, 0 1358 1359generate_bilinear_scanline_func \ 1360 pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \ 1361 8888, 8888, 2, 2, \ 1362 bilinear_add_8888_8_8888_process_last_pixel, \ 1363 bilinear_add_8888_8_8888_process_two_pixels, \ 1364 bilinear_add_8888_8_8888_process_four_pixels, \ 1365 bilinear_add_8888_8_8888_process_pixblock_head, \ 1366 bilinear_add_8888_8_8888_process_pixblock_tail, \ 1367 bilinear_add_8888_8_8888_process_pixblock_tail_head, \ 1368 4, 28, 
BILINEAR_FLAG_USE_MASK 1369