/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row, can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected according to
 * how long the pixel row is. If it is long enough that there is at least
 * one iteration of the inner loop (as described above) then this is
 * described as the "wide" case. If it is shorter than that, but there are
 * still enough bytes output that there is at least one 16-byte-long,
 * 16-byte-aligned write to the destination (the optimum type of write),
 * then this is the "medium" case. If it is not even this long, then this is
 * the "narrow" case, and there is no attempt to align writes to 16-byte
 * boundaries. In the "medium" and "narrow" cases, all the cachelines
 * containing data from the pixel row are prefetched up-front.
 */
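
/*
 * Concretely (illustrative, following the comparisons made at the start of
 * the generated function body): a row is "narrow" when
 *     X < 2*16*8/dst_w_bpp - 1
 * pixels (i.e. fewer than 31 output bytes); otherwise, if at least one
 * channel is read, it is "wide" when
 *     X >= (prefetch_distance+3)*pix_per_block - 1
 * pixels and "medium" in between. For example, with dst_w_bpp = 32,
 * src_bpp = 32 and prefetch_distance = 3 (so pix_per_block = 8), rows of up
 * to 6 pixels are narrow, 7 to 46 pixels are medium, and 47 or more are
 * wide. For fills (nothing read) the wide and medium cases coincide.
 */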

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro, used to tune the
 * behaviour of the generated functions.
 */
.set FLAG_DST_WRITEONLY,         0
.set FLAG_DST_READWRITE,         1
.set FLAG_COND_EXEC,             0
.set FLAG_BRANCH_OVER,           2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR,  4
.set FLAG_PROCESS_DOESNT_STORE,  0
.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64

/*
 * Offset into the stack at which the mask and source pointer/stride
 * arguments can be accessed: 9 words for the registers saved by
 * "push {r4-r11, lr}", plus a further 9 words for the debug copy of the
 * arguments when DEBUG_PARAMS is defined.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET,        (9*4+9*4)
#else
.set ARGS_STACK_OFFSET,        (9*4)
#endif

/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0
.set PREFETCH_TYPE_STANDARD,   1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
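
/*
 * Illustrative expansions (assuming the WK0-WK3 register assignments made
 * further down, i.e. r8-r11, and an empty condition code):
 *     pixld   , 16, 0, SRC, 0   ->  ldmia   SRC!, {r8,r9,r10,r11}
 *     pixld   , 16, 0, SRC, 1   ->  four "ldr WKn, [SRC], #4" loads
 *     pixst   , 4, 0, DST       ->  str     r8, [DST], #4   (write-only dst)
 * When FLAG_DST_READWRITE is set, the earlier destination load has already
 * advanced DST past the block, so pixst uses pixst_baseupdated, which stores
 * behind the already-updated base pointer without further writeback.
 */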

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm


.macro preload_leading_step1  bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm

.macro preload_leading_step2  bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1    /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets     NC   /  nc   /   Nc   */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm
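
/*
 * Worked example (illustrative): preloading a 32bpp source for an 8bpp
 * destination (4 src bytes per dst byte), with the destination 15 bytes
 * short of 16-byte alignment and (src + leading_bytes) & 31 == 12:
 *     leading_bytes     = 15 * 4 = 60
 *     inner_loop_offset = 12
 *     extra_needed      = 48   (> 32)
 * so two cache lines beyond those covered by preload_leading_step1 are
 * preloaded, at offsets 32*(prefetch_distance+1) and 32*(prefetch_distance+2).
 */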

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
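/* IS_END_OF_GROUP(INDEX,SIZE) is non-zero when INDEX is the last index in
 * its group of SIZE consecutive indices, i.e. when (INDEX % SIZE) == SIZE-1,
 * for power-of-two SIZE (and always when SIZE < 2): INDEX & ~(INDEX+1) is a
 * mask of the trailing 1 bits of INDEX, and ANDing with SIZE/2 tests whether
 * at least log2(SIZE) of them are set. For example, with SIZE == 4 it is
 * true for INDEX = 3, 7, 11, ...
 */
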
.macro preload_middle   bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm

.macro preload_trailing  bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was greater than, and not equal to, 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm
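
/*
 * Illustrative summary of the one-fetch-per-block case above:
 *     Z set            ->  both shifted quantities were 0: no extra preloads
 *     Z clear, C clear ->  their sum was 1..32: one extra preload
 *     Z clear, C set   ->  their sum was over 32: two extra preloads
 */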


.macro preload_line    narrow_case, bpp, bpp_shift, base
/* "narrow_case" - means that the macro was invoked from the "narrow" code
 *    path rather than the "medium" one. In the narrow case, the row of
 *    pixels is known to output no more than 30 bytes, so (assuming the
 *    source pixels are no wider than the destination pixels) it cannot
 *    possibly straddle more than two 32-byte cachelines, meaning there's
 *    no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 *    destination) that's being preloaded, or 0 if this channel is not used
 *    for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm


.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm

.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm


.macro test_bits_1_0_ptr
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
.endm

.macro test_bits_3_2_ptr
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3,2 of DST */
.endm

.macro leading_15bytes  process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
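
/*
 * Worked example (illustrative): with dst_w_bpp == 8 and 7 bytes needed to
 * reach 16-byte alignment (WK0 bits 0-3 = 0b0111), the tests above process
 * 1 byte (bit 0), then 2 bytes (bit 1), then 4 bytes (bit 2), and skip the
 * 8-byte step (bit 3 clear), decrementing X by the corresponding number of
 * pixels at each step.
 */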

.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm


.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In the narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
  .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm


.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
.endm


.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

 .func fname
 .global fname
 /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
 .hidden fname
 .type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif
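
/* pix_per_block is the number of pixels handled by one iteration of the
 * wide-case inner loop: the number of pixels covered by a 16-byte
 * destination write, raised if necessary to the number covered by 32 bytes
 * of any channel that is read. Illustrative values: 8 for a 32bpp source
 * with a 32bpp write-only destination, 32 for an 8bpp source and 8bpp
 * destination, 16 for a solid fill of an 8bpp destination.
 */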

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
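
/* Viewed from C, the generated function therefore takes arguments along
 * these lines (illustrative only; the actual prototypes are declared by the
 * callers in pixman-arm-common.h):
 *
 *     void fname(int32_t width, int32_t height,
 *                uint32_t *dst, int32_t dst_stride,
 *                uint32_t src_value_or_ptr, int32_t src_stride,
 *                uint32_t mask_value_or_ptr, int32_t mask_stride);
 *
 * with all strides measured in pixels.
 */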

/*
 * Assign symbolic names to registers
 */
    X           .req    r0  /* pixels to go on this line */
    Y           .req    r1  /* lines to go */
    DST         .req    r2  /* destination pixel pointer */
    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
    SRC         .req    r4  /* source pixel pointer */
    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
    MASK        .req    r6  /* mask pixel pointer (if applicable) */
    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
    WK0         .req    r8  /* pixel data registers */
    WK1         .req    r9
    WK2         .req    r10
    WK3         .req    r11
    SCRATCH     .req    r12
    ORIG_W      .req    r14 /* width (pixels) */

fname:
        push    {r4-r11, lr}        /* save all registers */

        subs    Y, Y, #1
        blo     199f

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
        mov     ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
        preload_leading_step1  dst_r_bpp, WK3, DST

        tst     DST, #15
        beq     154f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
        PF  and,    WK0, WK0, #15
  .endif

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .endif
 .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
 .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
 .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .endif

 .ltorg

160:    /* Medium case */
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
        tst     DST, #15
        beq     164f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .ltorg

    .unreq  X
    .unreq  Y
    .unreq  DST
    .unreq  STRIDE_D
    .unreq  SRC
    .unreq  STRIDE_S
    .unreq  MASK
    .unreq  STRIDE_M
    .unreq  WK0
    .unreq  WK1
    .unreq  WK2
    .unreq  WK3
    .unreq  SCRATCH
    .unreq  ORIG_W
    .endfunc
.endm

.macro line_saved_regs  x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm
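
/*
 * Example (illustrative): "line_saved_regs STRIDE_D, ORIG_W" sets
 * LINE_SAVED_REGS to (1<<3)|(1<<14) = 0x4008 and LINE_SAVED_REG_COUNT to 2,
 * so the hand-encoded spill/restore words above become 0xE92D4008
 * (stmdb sp!, {r3, lr}) and 0xE89D4008 (ldmia sp, {r3, lr}).
 */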

.macro nop_macro x:vararg
.endm

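/*
 * Illustrative example of how a caller (such as pixman-arm-simd-asm.S) can
 * use this framework. The macro and function names here are hypothetical,
 * and the block is disabled so that merely including this header generates
 * no code. The head macro loads up to 16 bytes of source pixels into the WK
 * registers; the tail macro has nothing to do because a plain copy needs no
 * per-pixel processing, and the framework itself performs the store (since
 * FLAG_PROCESS_DOES_STORE is not set).
 */
#if 0
.macro example_copy_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro example_copy_process_tail  cond, numbytes, firstreg
.endm

generate_composite_function \
    example_copy_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    example_copy_process_head, \
    example_copy_process_tail
#endif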