/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number
 * of cachelines per row, can vary from row to row, and because of the
 * need to preload each scanline once and only once, this prefetch
 * strategy treats each row of pixels independently. When a pixel row is
 * long enough, there are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead
 *    is determined empirically using profiling results from
 *    lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of
 *    the last iteration of the inner loop, plus any trailing bytes that
 *    were not enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
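
/*
 * Worked example (illustrative, not part of the upstream commentary):
 * assume a 32bpp source, a 32bpp destination and prefetch_distance == 3.
 * pix_per_block is the largest per-channel requirement, so the source's
 * 32*8/32 = 8 pixels per 32-byte block wins over the destination's
 * 16*8/32 = 4. The two comparisons in generate_composite_function below
 * (against 2*16*8/dst_w_bpp - 1 and (prefetch_distance+3)*pix_per_block - 1)
 * then classify a row of X pixels as:
 *     X <  2*16*8/32 - 1 = 7      -> "narrow" case
 *     7 <= X < (3+3)*8 - 1 = 47   -> "medium" case
 *     X >= 47                     -> "wide" case
 */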

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro, which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,             0
.set FLAG_DST_READWRITE,             1
.set FLAG_COND_EXEC,                 0
.set FLAG_BRANCH_OVER,               2
.set FLAG_PROCESS_PRESERVES_PSR,     0
.set FLAG_PROCESS_CORRUPTS_PSR,      4
.set FLAG_PROCESS_DOESNT_STORE,      0
.set FLAG_PROCESS_DOES_STORE,        8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET,        (9*4+9*4)
#else
.set ARGS_STACK_OFFSET,        (9*4)
#endif

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0
.set PREFETCH_TYPE_STANDARD,   1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
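
/*
 * Worked example (illustrative, not part of the upstream header): the
 * invocation
 *      pixld   , 16, 0, SRC, 0
 * selects the numbytes == 16 aligned path of pixldst and emits a single
 * block load with writeback,
 *      ldmia   SRC!, {WK0,WK1,WK2,WK3}
 * whereas unaligned == 1 would emit four post-indexed ldr instructions
 * instead, because ldm requires a word-aligned base address.
 */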

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm


.macro preload_leading_step1  bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm

.macro preload_leading_step2  bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
.macro preload_middle   bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm

.macro preload_trailing  bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and it clears C if Z was set (so C indicates that the
         * sum of the shifted quantities was greater than, and not equal
         * to, 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm


.macro preload_line    narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 * code path rather than the "medium" one - because in the narrow case,
 * the row of pixels is known to output no more than 30 bytes, so
 * (assuming the source pixels are no wider than the destination pixels)
 * it cannot possibly straddle more than 2 32-byte cachelines, meaning
 * there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 * destination) that's being preloaded, or 0 if this channel is not used
 * for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm
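
/*
 * Worked example (illustrative, not part of the upstream header): for a
 * narrow 32bpp source row with X = 6 pixels starting at address 0x1010,
 * the first branch of preload_line computes
 *      WK0 = 0x1010 & ~31             = 0x1000  (first cacheline)
 *      WK1 = (0x1010 + 6*4 - 1) & ~31 = 0x1020  (last cacheline)
 * and, since the two differ, issues exactly two plds - one per cacheline
 * touched by the row - with no loop required.
 */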


.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm

.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm
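
/*
 * Worked example (illustrative, not part of the upstream header): in
 * leading_15bytes below, for an 8bpp destination,
 *      conditional_process2 test_bits_1_0_ptr, mi, cs, ..., 1, 2, ...
 * first executes "movs SCRATCH, WK0, lsl #32-1", which shifts bit 0 of
 * WK0 into the N flag and bit 1 into the C flag; one byte is then
 * processed under condition mi (bit 0 set) and two bytes under condition
 * cs (bit 1 set), consuming exactly the 0-3 misaligned leading bytes.
 */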


.macro test_bits_1_0_ptr
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
.endm

.macro test_bits_3_2_ptr
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3,2 of DST */
.endm

.macro leading_15bytes  process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm

.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm


.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
 .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
  .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm
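
/*
 * Worked example (illustrative, not part of the upstream header): with a
 * 32bpp source and destination, pix_per_block = 8, so the .rept in
 * wide_case_inner_loop expands to 8*32/128 = 2 head/tail/store groups per
 * 32-byte block. SUBBLOCK counts these groups, and when the destination
 * is also read (dst_r_bpp > 0) the destination pld is attached to the
 * even-numbered groups, giving one 32-byte-aligned destination preload
 * per block.
 */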


.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
.endm


.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

 .func fname
 .global fname
 /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
 .hidden fname
 .type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations:
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
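
/*
 * For reference (an illustrative sketch, not part of the upstream header):
 * under those entry conditions a generated function behaves like the C
 * prototype
 *
 *     void fname(int32_t width, int32_t height,
 *                uint32_t *dst, int32_t dst_stride,
 *                uint32_t *src_or_value, int32_t src_stride,
 *                uint32_t *mask_or_value, int32_t mask_stride);
 *
 * where the pointer types vary with the bpp of each channel, and each
 * source/mask pair is either pointer+stride or value+0 as described above.
 */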

/*
 * Assign symbolic names to registers
 */
 X           .req    r0  /* pixels to go on this line */
 Y           .req    r1  /* lines to go */
 DST         .req    r2  /* destination pixel pointer */
 STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
 SRC         .req    r4  /* source pixel pointer */
 STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
 MASK        .req    r6  /* mask pixel pointer (if applicable) */
 STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
 WK0         .req    r8  /* pixel data registers */
 WK1         .req    r9
 WK2         .req    r10
 WK3         .req    r11
 SCRATCH     .req    r12
 ORIG_W      .req    r14 /* width (pixels) */

fname:
        push    {r4-r11, lr}        /* save all registers */

        subs    Y, Y, #1
        blo     199f

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
        preload_leading_step1  dst_r_bpp, WK3, DST

        tst     DST, #15
        beq     154f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
 .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
        PF  and,    WK0, WK0, #15
 .endif

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .endif
 .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
 .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
 .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .endif

 .ltorg

160:    /* Medium case */
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
        tst     DST, #15
        beq     164f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .ltorg

 .unreq  X
 .unreq  Y
 .unreq  DST
 .unreq  STRIDE_D
 .unreq  SRC
 .unreq  STRIDE_S
 .unreq  MASK
 .unreq  STRIDE_M
 .unreq  WK0
 .unreq  WK1
 .unreq  WK2
 .unreq  WK3
 .unreq  SCRATCH
 .unreq  ORIG_W
 .endfunc
.endm

.macro line_saved_regs  x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm

.macro nop_macro x:vararg
.endm
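
/*
 * Example usage (an illustrative sketch, not part of this header; real
 * invocations live in files such as pixman-arm-simd-asm.S). A simple
 * 32bpp source-copy fast path needs only a head macro that loads from
 * SRC, an empty tail macro, and no init/newline/cleanup hooks. The flag
 * set and prefetch distance below are plausible values, not
 * authoritative ones, and the "example_" names are hypothetical. The
 * block is guarded out so that including this header is unaffected.
 */
#if 0
.macro example_src_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro example_src_8888_8888_process_tail  cond, numbytes, firstreg
.endm

/* Declare which line variables must be spilled around each line (required
 * whenever a FLAG_SPILL_LINE_VARS_* flag is passed) */
line_saved_regs STRIDE_D, ORIG_W

generate_composite_function \
    example_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance (tuned empirically in real code) */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    example_src_8888_8888_process_head, \
    example_src_8888_8888_process_tail
#endif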