/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ben Avison (bavison@riscosopen.org)
 *
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

	.text
	.arch armv6
	.object_arch armv4
	.arm
	.altmacro
	.p2align 2

#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 * cond           ARM condition code for code block
 * numbytes       Number of output bytes that should be generated this time
 * firstreg       First WK register in which to place output
 * unaligned_src  Whether to use non-wordaligned loads of source image
 * unaligned_mask Whether to use non-wordaligned loads of mask image
 * preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */

.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm

.macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/

/* Solid-fill init macros: replicate the constant source colour across
 * SRC/STRIDE_S/MASK/STRIDE_M so a 16-byte store can be issued per iteration. */

.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro fill_process_tail cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

/******************************************************************************/

/* Force the alpha byte of one x888 pixel to 0xFF to make it an 8888 pixel. */
.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm

.macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
        src_x888_8888_pixel cond, %(firstreg+0)
 .if numbytes >= 8
        src_x888_8888_pixel cond, %(firstreg+1)
  .if numbytes == 16
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
  .endif
 .endif
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/

.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm

.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
*/

.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

.macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .elseif numbytes == 8
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 4
        pixld   , 2, firstreg, SRC, unaligned_src
 .endif
.endm

.macro src_0565_8888_process_tail cond, numbytes, firstreg
 .if numbytes == 16
        src_0565_8888_2pixels firstreg, %(firstreg+1)
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        src_0565_8888_2pixels firstreg, %(firstreg+1)
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/

.macro add_8_8_8pixels cond, dst1, dst2
        uqadd8&cond WK&dst1, WK&dst1, MASK
        uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
.endm

.macro add_8_8_4pixels cond, dst
        uqadd8&cond WK&dst, WK&dst, MASK
.endm

.macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm

.macro add_8_8_process_tail cond, numbytes, firstreg
 .if numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/

.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
  .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
  .endif
 .endif
.endm

.macro over_8888_8888_prepare next
        mov     WK&next, WK&next, lsr #24
.endm

.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm

.macro over_8888_8888_process_tail cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f
        over_8888_8888_prepare %(4+firstreg)
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/

/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 */
.macro mul_8888_8 word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm

/******************************************************************************/

.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK&src, lsr #24
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK&dst, WK&dst, WK&src
.endm

.macro over_8888_n_8888_process_tail cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
  .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/

.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm

.macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
.endm

.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK&dst, WK&dst, Y
.endm

.macro over_n_8_8888_process_tail cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/