pixman-arm-simd-asm.h revision 1176bdada62cabc6ec4b0308a930e83b679d5d36
11176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
21176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Copyright © 2012 Raspberry Pi Foundation
31176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Copyright © 2012 RISC OS Open Ltd
41176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *
51176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Permission to use, copy, modify, distribute, and sell this software and its
61176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * documentation for any purpose is hereby granted without fee, provided that
71176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * the above copyright notice appear in all copies and that both that
81176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * copyright notice and this permission notice appear in supporting
91176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * documentation, and that the name of the copyright holders not be used in
101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * advertising or publicity pertaining to distribution of the software without
111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * specific, written prior permission.  The copyright holders make no
121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * representations about the suitability of this software for any purpose.  It
131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * is provided "as is" without express or implied warranty.
141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *
151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * SOFTWARE.
231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *
241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Author:  Ben Avison (bavison@riscosopen.org)
251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *
261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Because the alignment of pixel data to cachelines, and even the number of
301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * cachelines per row can vary from row to row, and because of the need to
311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * preload each scanline once and only once, this prefetch strategy treats
321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * each row of pixels independently. When a pixel row is long enough, there
331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * are three distinct phases of prefetch:
341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * * an inner loop section, where each time a cacheline of data is
351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    processed, another cacheline is preloaded (the exact distance ahead is
361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    determined empirically using profiling results from lowlevel-blt-bench)
371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * * a leading section, where enough cachelines are preloaded to ensure no
381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    cachelines escape being preloaded when the inner loop starts
391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * * a trailing section, where a limited number (0 or more) of cachelines
401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    are preloaded to deal with data (if any) that hangs off the end of the
411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    last iteration of the inner loop, plus any trailing bytes that were not
421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    enough to make up one whole iteration of the inner loop
431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *
441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * There are (in general) three distinct code paths, selected between
451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * depending upon how long the pixel row is. If it is long enough that there
461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * is at least one iteration of the inner loop (as described above) then
471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * this is described as the "wide" case. If it is shorter than that, but
481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * there are still enough bytes output that there is at least one 16-byte-
491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * long, 16-byte-aligned write to the destination (the optimum type of
501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * write), then this is the "medium" case. If it is not even this long, then
511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * this is the "narrow" case, and there is no attempt to align writes to
521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * 16-byte boundaries. In the "medium" and "narrow" cases, all the
531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * cachelines containing data from the pixel row are prefetched up-front.
541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Determine whether we put the arguments on the stack for debugging.
 * The symbol is forced off here; the ARGS_STACK_OFFSET definition further
 * down tests it and selects a larger offset when it is defined.
581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#undef DEBUG_PARAMS
601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Bit flags for 'generate_composite_function' macro which are used
631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * to tune generated functions behavior.
 *
 * The names come in groups. In each group one name has the value 0 (the
 * default) and the others own distinct bits, so a flags word is built by
 * OR-ing together one name from each group. Effects visible in the macros
 * of this file:
 *   FLAG_DST_READWRITE      - pixst stores through pixst_baseupdated
 *                             (negative offsets from the base, no
 *                             writeback) instead of a post-incrementing
 *                             store; conditional_process2 does not
 *                             interleave its two steps
 *   FLAG_BRANCH_OVER        - conditional_process1 branches around
 *                             unconditional code instead of emitting
 *                             conditionally-executed instructions
 *   FLAG_PROCESS_CORRUPTS_PSR - conditional_process2 repeats its
 *                             flag-setting test before the second step
 *   FLAG_PROCESS_DOES_STORE - conditional_process1_helper does not emit
 *                             the pixst itself
 *   FLAG_SPILL_LINE_VARS    - 48, the OR of FLAG_SPILL_LINE_VARS_WIDE (16)
 *                             and FLAG_SPILL_LINE_VARS_NON_WIDE (32)
 * The SPILL_LINE_VARS and *_SCRATCH flags are not consumed by any macro in
 * this part of the file.
641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_DST_WRITEONLY,         0
661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_DST_READWRITE,         1
671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_COND_EXEC,             0
681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_BRANCH_OVER,           2
691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_PROCESS_PRESERVES_PSR, 0
701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_PROCESS_CORRUPTS_PSR,  4
711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_PROCESS_DOESNT_STORE,  0
721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_NO_SPILL_LINE_VARS,        0
741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_SPILL_LINE_VARS_WIDE,      16
751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_SPILL_LINE_VARS,           48
771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Offset into stack where mask and source pointer/stride can be accessed.
 * The base value is 9 words (9*4 bytes); when DEBUG_PARAMS is defined a
 * further 9 words are added.
 * NOTE(review): presumably each group of 9 words corresponds to registers
 * pushed by the generated function's prologue - confirm against the code
 * that uses this symbol.
821176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#ifdef DEBUG_PARAMS
841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set ARGS_STACK_OFFSET,        (9*4+9*4)
851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#else
861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set ARGS_STACK_OFFSET,        (9*4)
871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#endif
881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Constants for selecting preferable prefetch type.
 * The PF macro compares the symbol PREFETCH_TYPE_CURRENT (set elsewhere)
 * against PREFETCH_TYPE_STANDARD and emits its instruction only on a match,
 * so with PREFETCH_TYPE_NONE every PF line assembles to nothing.
911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set PREFETCH_TYPE_NONE,       0
931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.set PREFETCH_TYPE_STANDARD,   1
941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Definitions of macros for load/store of pixel data.
971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * pixldst - emit a load or store of 1, 2, 4, 8 or 16 bytes of pixel data,
 * advancing the base register past the data transferred.
 *
 * "op"        - "ld" or "st"; "r" or "m" is appended to form
 *               ldr/str or ldm/stm
 * "cond"      - condition code inserted into the mnemonic (default "al")
 * "numbytes"  - transfer size; any value other than 16, 8, 4, 2 or 1 is
 *               rejected with .error
 * "reg0".."reg3" - suffixes appended to "WK" to name the data registers;
 *               only as many as the size requires are used
 * "base"      - address register, written back in every case
 * "unaligned" - when 1, the 8- and 16-byte cases use a sequence of
 *               single-word transfers instead of one ldm/stm
 */
991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
1001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if numbytes == 16
1011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if unaligned == 1
        /* four word transfers, each post-incrementing base by 4 */
1021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond    WK&reg0, [base], #4
1031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond    WK&reg1, [base], #4
1041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond    WK&reg2, [base], #4
1051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond    WK&reg3, [base], #4
1061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .else
        /* one multiple transfer with writeback */
1071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
1081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
1091176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif numbytes == 8
1101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if unaligned == 1
1111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond    WK&reg0, [base], #4
1121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond    WK&reg1, [base], #4
1131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .else
1141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&m&cond&ia base!, {WK&reg0,WK&reg1}
1151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
1161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif numbytes == 4
        /* sizes of 4 bytes and below ignore "unaligned" */
1171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond    WK&reg0, [base], #4
1181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif numbytes == 2
        /* halfword: the condition code is placed before the "h" suffix */
1191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond&h  WK&reg0, [base], #2
1201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif numbytes == 1
        /* byte: the condition code is placed before the "b" suffix */
1211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        op&r&cond&b  WK&reg0, [base], #1
1221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
1231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .error "unsupported size: numbytes"
1241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
1251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
1261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * pixst_baseupdated - store 1, 2, 4, 8 or 16 bytes into the memory
 * immediately below "base", leaving "base" unchanged (stmdb without
 * writeback, or str with a negative immediate offset).
 * NOTE(review): as the name suggests, this is presumably for the case
 * where the base register has already been advanced past the pixels by an
 * earlier load - confirm against the callers.
 *
 * "cond"      - condition code inserted into the mnemonic
 * "numbytes"  - transfer size; any other value is rejected with .error
 * "reg0".."reg3" - suffixes appended to "WK" to name the data registers
 * "base"      - address register pointing just past the store target
 */
1271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
1281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if numbytes == 16
1291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
1301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif numbytes == 8
1311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        stm&cond&db base, {WK&reg0,WK&reg1}
1321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif numbytes == 4
1331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        str&cond    WK&reg0, [base, #-4]
1341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif numbytes == 2
1351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        str&cond&h  WK&reg0, [base, #-2]
1361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif numbytes == 1
1371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        str&cond&b  WK&reg0, [base, #-1]
1381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
1391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .error "unsupported size: numbytes"
1401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
1411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
1421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * pixld - load "numbytes" bytes from [base] into consecutive WK registers
 * starting at WK<firstreg>, by forwarding to pixldst with op "ld".
 * The four register suffixes are passed as %(firstreg+N) so that pixldst
 * receives numbers rather than expressions.
 * NOTE(review): %(...) is the assembler's alternate-macro expression
 * syntax; presumably the including file enables it - confirm.
 */
1431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro pixld cond, numbytes, firstreg, base, unaligned
1441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
1451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
1461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * pixst - store "numbytes" bytes from consecutive WK registers starting at
 * WK<firstreg>. The symbol "flags" (set outside this macro) selects how:
 * with FLAG_DST_READWRITE the data goes just below "base" and "base" is
 * left alone (pixst_baseupdated); otherwise a post-incrementing store is
 * emitted through pixldst, always in its aligned form because no
 * "unaligned" argument is passed.
 */
1471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro pixst cond, numbytes, firstreg, base
1481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (flags) & FLAG_DST_READWRITE
1491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
1501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
1511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
1521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
1531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
1541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * PF - wrapper for instructions that exist only to support prefetching.
 * "a" is the mnemonic and "x" the rest of the line (callers write
 * "PF  mnemonic, operands"). The instruction is emitted only when
 * PREFETCH_TYPE_CURRENT equals PREFETCH_TYPE_STANDARD; otherwise the line
 * assembles to nothing.
 */
1551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro PF a, x:vararg
1561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
1571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        a x
1581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
1591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
1601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
1611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * preload_leading_step1 - first part of the leading preload for one
 * channel.
 *
 * "bpp"  - bits per pixel of the channel; nothing is emitted when it is 0
 * "ptr"  - register that receives "base" rounded down to a 32-byte boundary
 * "base" - register holding the channel's current address
 *
 * Emits prefetch_distance+1 preloads, at ptr+0, ptr+32, ... Redefines the
 * assembler symbol OFFSET.
 */
1621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro preload_leading_step1  bpp, ptr, base
1631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/* If the destination is already 16-byte aligned, then we need to preload
1641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * between 0 and prefetch_distance (inclusive) cache lines ahead so there
1651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * are no gaps when the inner loop starts.
1661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
1671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if bpp > 0
1681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bic,    ptr, base, #31
1691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set OFFSET, 0
1701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .rept prefetch_distance+1
1711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [ptr, #OFFSET]
1721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set OFFSET, OFFSET+32
1731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endr
1741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
1751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
1761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * preload_leading_step2 - second part of the leading preload for one
 * channel: issue 0, 1 or 2 preloads beyond those of
 * preload_leading_step1.
 *
 * "bpp"       - bits per pixel of the channel; nothing is emitted when 0
 * "bpp_shift" - shift that scales a pixel count to bytes for this channel
 * "ptr"       - register holding the 32-byte-aligned address computed by
 *               preload_leading_step1
 * "base"      - register holding the channel's current address; the
 *               literal name DST selects the simplified destination test
 *
 * Reads WK0 as the leading count, rescaled to this channel with
 * "lsl #bpp_shift-dst_bpp_shift". Overwrites SCRATCH and the condition
 * flags (except in the DST case, which only alters the flags). Uses local
 * labels 60 and 61.
 */
1771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro preload_leading_step2  bpp, bpp_shift, ptr, base
1781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/* However, if the destination is not 16-byte aligned, we may need to
1791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * preload more cache lines than that. The question we need to ask is:
1801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * are the bytes corresponding to the leading pixels more than the amount
1811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * by which the source pointer will be rounded down for preloading, and if
1821176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * so, by how many cache lines? Effectively, we want to calculate
1831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
1841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *     inner_loop_offset = (src+leading_bytes)&31
1851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *     extra_needed = leading_bytes - inner_loop_offset
1861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
1871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * possible when there are 4 src bytes for every 1 dst byte).
1881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
1891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if bpp > 0
1901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc base,DST
1911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* The test can be simplified further when preloading the destination */
        /* bit 4 of the address clear: no extra preload */
1921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  tst,    base, #16
1931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  beq,    61f
1941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .else
1951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .if bpp/dst_w_bpp == 4
        /* SCRATCH = leading_bytes - ((base + leading_bytes) & 31) - 1 */
1961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
1971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  and,    SCRATCH, SCRATCH, #31
1981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
1991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  sub,    SCRATCH, SCRATCH, #1    /* so now ranges are -16..-1 / 0..31 / 32..63 */
        /* The shift has to be an explicit lsl: shifting left by 26 moves
         * bit 6 of SCRATCH into C and bit 5 into N, which is what
         * separates the three ranges above. */
2001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
        /* C set: nothing extra.  N clear: one extra.  Otherwise: two. */
2011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bcs,    61f
2021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bpl,    60f
2031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [ptr, #32*(prefetch_distance+2)]
2041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .else
        /* Work in the top 5 bits of SCRATCH so that the "& 31" is implicit:
         * SCRATCH = leading_bytes - ((base + leading_bytes) & 31), scaled;
         * skip the extra preload when that is unsigned lower-or-same. */
2051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  mov,    SCRATCH, base, lsl #32-5
2061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
2071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
2081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bls,    61f
2091176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .endif
2101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
2111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
2121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck61:
2131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
2141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
2151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * IS_END_OF_GROUP(INDEX,SIZE) - non-zero when SIZE is less than 2, or when
 * the run of trailing 1 bits in INDEX (INDEX & ~(INDEX+1)) includes the
 * bit SIZE/2. For a power-of-two SIZE that means INDEX is the last index
 * within its SIZE-aligned group.
 */
2161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
/*
 * preload_middle - inner-loop preload for one channel. A single pld is
 * emitted, and only when the assembler symbol SUBBLOCK (set outside this
 * macro) is at the end of a group of 256/128*dst_w_bpp/bpp sub-blocks.
 *
 * "bpp"  - bits per pixel of the channel; nothing is emitted when it is 0
 * "base" - register holding the channel's current address
 * "scratch_holds_offset" - when non-zero, SCRATCH is used as-is as the
 *          offset from "base"; when zero, SCRATCH is overwritten with
 *          "base" rounded down to 32 bytes and the preload is made
 *          prefetch_distance cache lines beyond that
 */
2171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro preload_middle   bpp, base, scratch_holds_offset
2181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if bpp > 0
2191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
2201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
2211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .if scratch_holds_offset
2221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [base, SCRATCH]
2231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .else
2241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bic,    SCRATCH, base, #31
2251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [SCRATCH, #32*prefetch_distance]
2261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .endif
2271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
2281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
2291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
2301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * preload_trailing - trailing preload for one channel: fetch the cache
 * lines that the inner loop's preloads did not reach.
 *
 * "bpp"       - bits per pixel of the channel; nothing is emitted when 0
 * "bpp_shift" - shift that scales a pixel count to bytes for this channel
 * "base"      - register holding the channel's current address
 *
 * Reads X as the pixel count, scaled to bytes with "lsl #bpp_shift" in
 * both branches. Overwrites SCRATCH and the condition flags; the
 * multi-fetch branch also overwrites WK1. Uses local labels 80, 81, 82.
 */
2311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro preload_trailing  bpp, bpp_shift, base
2321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if bpp > 0
2331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if bpp*pix_per_block > 256
2341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Calculations are more complex if more than one fetch per block */
        /* WK1 = byte count still to cover: offset of base within its
         * cache line, plus the bytes for the X pixels, plus a
         * constant term for the additional fetches per block. WK0 is
         * not used here: it carries the leading count, not the
         * trailing one. */
2351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  and,    WK1, base, #31
2361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  add,    WK1, WK1, X, lsl #bpp_shift
2371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
2381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bic,    SCRATCH, base, #31
        /* one preload per 32 bytes of WK1, at least once */
2391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
2401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  add,    SCRATCH, SCRATCH, #32
2411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  subs,   WK1, WK1, #32
2421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bhi,    80b
2431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .else
2441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        /* Add (base & 31) and (byte count & 31) in the top 5 bits of
         * SCRATCH, so that a sum of 32 or more shows up as carry-out. */
2451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  mov,    SCRATCH, base, lsl #32-5
2461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
2471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  adceqs, SCRATCH, SCRATCH, #0
2481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* The instruction above has two effects: ensures Z is only
2491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * set if C was clear (so Z indicates that both shifted quantities
2501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * were 0), and clears C if Z was set (so C indicates that the sum
2511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * of the shifted quantities was greater and not equal to 32) */
        /* Z: no preload.  C clear: one preload.  C set: two preloads. */
2521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  beq,    82f
2531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bic,    SCRATCH, base, #31
2541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bcc,    81f
2551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
2561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
2571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck82:
2581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
2591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
2601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
2611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
2621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * preload_line - preload every cache line touched by one row of one
 * channel, for rows too short to use the inner-loop preload scheme.
 * Overwrites WK0, WK1 and the condition flags. Uses local labels 90, 91
 * and 92.
 */
2631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro preload_line    narrow_case, bpp, bpp_shift, base
2641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/* "narrow_case" - just means that the macro was invoked from the "narrow"
2651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    code path rather than the "medium" one - because in the narrow case,
2661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    the row of pixels is known to output no more than 30 bytes, then
2671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    (assuming the source pixels are no wider than the destination
2681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
2691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    meaning there's no need for a loop.
2701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * "bpp" - number of bits per pixel in the channel (source, mask or
2711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    destination) that's being preloaded, or 0 if this channel is not used
2721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck *    for reading
2731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
2741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * "base" - base address register of channel to preload (SRC, MASK or DST)
2751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
2761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if bpp > 0
2771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if narrow_case && (bpp <= dst_w_bpp)
2781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        /* WK0 = cache line of the first byte, WK1 = cache line of the
         * last byte (base + X pixels - 1); preload WK1 only if it differs */
2791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bic,    WK0, base, #31
2801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [WK0]
2811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  add,    WK1, base, X, LSL #bpp_shift
2821176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  sub,    WK1, WK1, #1
2831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bic,    WK1, WK1, #31
2841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  cmp,    WK1, WK0
2851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  beq,    90f
2861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [WK1]
2871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck90:
2881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .else
        /* Same first/last cache line computation, but the row may span
         * more than two lines, so step WK0 forward 32 bytes at a time
         * and preload each line until the last one has been reached */
2891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bic,    WK0, base, #31
2901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [WK0]
2911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  add,    WK1, base, X, lsl #bpp_shift
2921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  sub,    WK1, WK1, #1
2931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bic,    WK1, WK1, #31
2941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  cmp,    WK1, WK0
2951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  beq,    92f
2961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck91:     PF  add,    WK0, WK0, #32
2971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  cmp,    WK0, WK1
2981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [WK0]
2991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  bne,    91b
3001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck92:
3011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
3021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
3031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
3051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * conditional_process1_helper - emit one head / tail / store sequence for
 * "numbytes" bytes of destination, with every instruction carrying the
 * condition "cond" (which may be empty).
 *
 * "process_head", "process_tail" - names of the macros to invoke
 * "numbytes", "firstreg" - size and first WK register, passed through
 * "unaligned_src", "unaligned_mask" - passed through to process_head
 * "decrementx" - when non-zero, X is reduced by the number of destination
 *                pixels processed (8*numbytes/dst_w_bpp)
 *
 * The store is omitted when "flags" contains FLAG_PROCESS_DOES_STORE.
 */
3061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
3071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
3081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if decrementx
3091176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub&cond X, X, #8*numbytes/dst_w_bpp
3101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
3111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_tail  cond, numbytes, firstreg
3121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if !((flags) & FLAG_PROCESS_DOES_STORE)
3131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixst   cond, numbytes, firstreg, DST
3141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
3151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * conditional_process1 - process "numbytes" bytes only if condition
 * "cond" holds, using whichever mechanism "flags" selects.
 *
 * With FLAG_BRANCH_OVER a branch on the opposite condition skips an
 * unconditional copy of the sequence (local label 100). Only the
 * conditions mi, cs and ne have an inverse branch here; any other "cond"
 * would emit no branch and leave the sequence unconditional.
 * Without FLAG_BRANCH_OVER the sequence is emitted with "cond" applied to
 * each instruction.
 *
 * The remaining arguments are passed unchanged to
 * conditional_process1_helper.
 */
3171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
3181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (flags) & FLAG_BRANCH_OVER
3191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc cond,mi
3201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bpl     100f
3211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
3221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc cond,cs
3231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bcc     100f
3241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
3251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc cond,ne
3261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        beq     100f
3271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
        /* empty first argument: the helper emits unconditional instructions */
3281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
3291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck100:
3301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
3311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
3321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
3331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * conditional_process2 - run a flag-setting test, then conditionally
 * process two differently-sized pieces: "numbytes1" bytes into
 * WK<firstreg1> under "cond1" and "numbytes2" bytes into WK<firstreg2>
 * under "cond2".
 *
 * "test" - name of a macro that sets the condition flags
 * "process_head", "process_tail", "unaligned_src", "unaligned_mask",
 * "decrementx" - as for conditional_process1
 *
 * If "flags" contains any of FLAG_DST_READWRITE, FLAG_BRANCH_OVER,
 * FLAG_PROCESS_CORRUPTS_PSR or FLAG_PROCESS_DOES_STORE, the two pieces are
 * handled one after the other through conditional_process1. Otherwise the
 * heads, X updates, tails and stores of the two pieces are interleaved.
 */
3351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
3361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
3371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Can't interleave reads and writes */
3381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        test
3391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
3401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        /* the first piece may have changed the flags, so set them again */
3411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        test
3421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
3431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
3441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
3451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Can interleave reads and writes for better scheduling */
3461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        test
3471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
3481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
3491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if decrementx
3501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
3511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
3521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
3531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_tail  cond1, numbytes1, firstreg1
3541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_tail  cond2, numbytes2, firstreg2
3551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixst   cond1, numbytes1, firstreg1, DST
3561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixst   cond2, numbytes2, firstreg2, DST
3571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
3581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
3601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * test_bits_1_0_ptr: move bits 1 and 0 of WK0 into the C and N flags.
 * Shifting left by 31 places bit 0 in the sign bit (N) and leaves bit 1 as
 * the shifter carry-out (C). SCRATCH is overwritten with the shifted value.
 * leading_15bytes uses this with WK0 holding the leading byte count.
 */
3611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro test_bits_1_0_ptr
3621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
3631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * test_bits_3_2_ptr: move bits 3 and 2 of WK0 into the C and N flags.
 * Shifting left by 29 places bit 2 in the sign bit (N) and leaves bit 3 as
 * the shifter carry-out (C). SCRATCH is overwritten with the shifted value.
 */
3651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro test_bits_3_2_ptr
3661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
3671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * leading_15bytes: process the 1 to 15 leading bytes needed to bring the
 * destination up to 16-byte alignment, one power-of-two chunk per set bit
 * of the byte count held in WK0.
 *
 *   bit 0 -> 1 byte  (8bpp destinations only, condition mi)
 *   bit 1 -> 2 bytes (8bpp and 16bpp destinations, condition cs)
 *   bit 2 -> 4 bytes (condition mi)
 *   bit 3 -> 8 bytes (condition cs)
 *
 * Every chunk is issued with unaligned_src = unaligned_mask = 1 and with
 * decrementx = 1, so X is reduced by the pixels consumed here.
 */
3691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro leading_15bytes  process_head, process_tail
3701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
3711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Use unaligned loads in all cases for simplicity */
3721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if dst_w_bpp == 8
3731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
3741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif dst_w_bpp == 16
        /* A 16bpp destination cannot need a single leading byte, so only bit 1 (C) is acted on */
3751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        test_bits_1_0_ptr
3761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
3771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
3781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
3791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * test_bits_3_2_pix: set C and N from the remaining pixel count in X.
 * The shift amount includes dst_bpp_shift, so the count is scaled from
 * pixels to bytes first; C,N then reflect bits 3,2 of that byte count
 * (8-byte and 4-byte chunks respectively). SCRATCH is overwritten.
 */
3811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro test_bits_3_2_pix
3821176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
3831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * test_bits_1_0_pix: set the flags from the lowest bits of the remaining
 * pixel count in X. SCRATCH is overwritten in both variants.
 *
 *   8bpp destination: C,N = bits 1,0 of X (2-byte and 1-byte chunks).
 *   otherwise:        a right shift by one leaves bit 0 of X in C; the only
 *                     caller visible here, trailing_15bytes for 16bpp, tests
 *                     just C.
 */
3851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro test_bits_1_0_pix
3861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if dst_w_bpp == 8
3871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
3881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
3891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        movs    SCRATCH, X, lsr #1
3901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
3911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
3921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * trailing_15bytes: process the final 1 to 15 bytes of a line, one
 * power-of-two chunk per set bit of the remaining count in X, largest first:
 * 8 bytes (cs) and 4 bytes (mi), then 2 bytes (cs) for 16bpp, or 2 bytes (cs)
 * and 1 byte (mi) for 8bpp.
 *
 *   unaligned_src/_mask - passed through to process_head
 *
 * decrementx is passed as 0, so X is left unchanged by these chunks.
 */
3931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
3941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
3951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if dst_w_bpp == 16
3961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        test_bits_1_0_pix
3971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
3981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif dst_w_bpp == 8
3991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
4001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
4011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
4021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
4031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * wide_case_inner_loop: default inner loop for the wide case. Each iteration
 * handles one block of pix_per_block pixels as a run of 16-byte sub-blocks
 * (pix_per_block*dst_w_bpp/128 of them), issuing preloads between the head
 * and tail of each sub-block.
 *
 *   process_head/_tail  - per-operation macros doing the pixel work
 *   unaligned_src/_mask - passed through to process_head
 *   dst_alignment       - byte offset subtracted from the destination
 *                         preload address; the visible caller passes 16 or 0
 *                         depending on bit 4 of DST
 *
 * The loop repeats while subtracting pix_per_block from X does not borrow.
 * NOTE(review): the last argument of preload_middle and the PF prefix are
 * defined outside this chunk; PF presumably emits its instruction only when
 * prefetching is enabled - confirm against their definitions.
 */
4041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
4051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck110:
4061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
4071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .rept pix_per_block*dst_w_bpp/128
4081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
        /* With exactly one of source/mask present and FLAG_PROCESS_PRESERVES_SCRATCH
         * set, a single preload_middle is issued with last argument 1; otherwise
         * both channels are preloaded with last argument 0 */
4091176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
4101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_middle  src_bpp, SRC, 1
4111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
4121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_middle  mask_bpp, MASK, 1
4131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .else
4141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_middle  src_bpp, SRC, 0
4151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_middle  mask_bpp, MASK, 0
4161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
4171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
4181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
4191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
4201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * preloads for, to achieve staggered prefetches for multiple channels, because there are
4211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * always two STMs per prefetch, so there is always an opposite STM on which to put the
4221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * preload. Note, no need to BIC the base register here */
4231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
4241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
4251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_tail  , 16, 0
        /* The store is skipped when the operation's own macros already perform it */
4261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if !((flags) & FLAG_PROCESS_DOES_STORE)
4271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixst   , 16, 0, DST
4281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
4291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set SUBBLOCK, SUBBLOCK+1
4301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endr
4311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        subs    X, X, #pix_per_block
4321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bhs     110b
4331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
4341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * wide_case_inner_loop_and_trailing_pixels: body of one line in the wide
 * case once the destination is 16-byte aligned. It runs the block loop
 * supplied as process_inner_loop, issues the trailing preloads, then hands
 * the rest of the line to medium_case_inner_loop_and_trailing_pixels.
 *
 *   process_head/_tail  - per-operation macros doing the pixel work
 *   process_inner_loop  - macro implementing the block loop; invoked with the
 *                         same argument list as wide_case_inner_loop
 *   exit_label          - label branched to when no trailing pixels remain
 *   unaligned_src/_mask - passed through to the inner macros
 */
4351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
4361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
        /* When the destination is read, two copies of the loop are emitted and
         * bit 4 of DST selects which one runs: dst_alignment is 16 when the bit
         * is clear and 0 when it is set */
4371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if dst_r_bpp > 0
4381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     DST, #16
4391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bne     111f
4401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
4411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        b       112f
4421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck111:
4431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
4441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
4451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck112:
4461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
        /* WK0 = X modulo pix_per_block, computed only when some channel spans
         * more than 256 bits (32 bytes) per block */
4471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
4481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  and,    WK0, X, #pix_per_block-1
4491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
4501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_trailing  src_bpp, src_bpp_shift, SRC
4511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_trailing  mask_bpp, mask_bpp_shift, MASK
4521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
        /* Add back the (prefetch_distance+2)*pix_per_block bias that the wide-case
         * setup subtracts from X, less one 16-byte chunk of pixels so that the
         * subs/bhs pair in the medium loop terminates it */
4531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
4541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* The remainder of the line is handled identically to the medium case */
4551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
4561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
4571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * medium_case_inner_loop_and_trailing_pixels: process the line 16 bytes at a
 * time with no preloading, then handle any trailing pixels.
 *
 *   process_head/_tail  - per-operation macros doing the pixel work
 *   unused              - placeholder that keeps the argument list parallel
 *                         to the wide-case macro; not referenced in the body
 *   exit_label          - branched to when no trailing pixels remain
 *   unaligned_src/_mask - passed through to process_head and trailing_15bytes
 *
 * The loop repeats while subtracting one 16-byte chunk of pixels
 * (128/dst_w_bpp) from X does not borrow.
 */
4581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
4591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck120:
4601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
4611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_tail  , 16, 0
4621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if !((flags) & FLAG_PROCESS_DOES_STORE)
4631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixst   , 16, 0, DST
4641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
4651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        subs    X, X, #128/dst_w_bpp
4661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bhs     120b
4671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Trailing pixels */
        /* The low bits of X give the count of pixels left over from whole 16-byte chunks */
4681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     X, #128/dst_w_bpp - 1
4691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        beq     exit_label
4701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
4711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
4721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * narrow_case_inner_loop_and_trailing_pixels: loop-free handling of a short
 * line. At most one 16-byte chunk is processed, conditionally on the bit of
 * X that equals one chunk's worth of pixels (16*8/dst_w_bpp), followed
 * unconditionally by trailing_15bytes.
 *
 *   unused, exit_label  - accepted so the argument list is parallel to the
 *                         medium and wide macros; not referenced in the body
 *   unaligned_src/_mask - passed through to the inner macros
 */
4731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
4741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     X, #16*8/dst_w_bpp
4751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
4761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Trailing pixels */
4771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
4781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
4791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
4801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * switch_on_alignment: emit up to four specialised copies of `action` and
 * select one at run time from the word alignment (low two bits) of SRC and
 * MASK. A pointer is only tested when its channel is 8 or 16 bpp; the other
 * depths are always passed 0 for their unaligned flag.
 *
 *   action              - macro to expand; it receives process_head,
 *                         process_tail, process_inner_loop, exit_label and
 *                         then the two flags unaligned_src, unaligned_mask
 *   exit_label          - branched to after every copy except the last one
 *                         emitted, which is left to fall through
 *
 * Copies emitted, as (unaligned_src, unaligned_mask):
 *   (0,0) always; (1,0) at 140 when the source is testable;
 *   (0,1) at 141 when the mask is testable; (1,1) at 142 when both are.
 */
4811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
4821176bdada62cabc6ec4b0308a930e83b679d5d36John Reck /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
4831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if mask_bpp == 8 || mask_bpp == 16
4841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     MASK, #3
4851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bne     141f
4861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
4871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if src_bpp == 8 || src_bpp == 16
4881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     SRC, #3
4891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bne     140f
4901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
4911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
4921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if src_bpp == 8 || src_bpp == 16
4931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        b       exit_label
4941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck140:
4951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
4961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
4971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if mask_bpp == 8 || mask_bpp == 16
4981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        b       exit_label
4991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck141:
        /* Mask is not word aligned; repeat the source alignment test for this half */
5001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if src_bpp == 8 || src_bpp == 16
5011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     SRC, #3
5021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bne     142f
5031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
5041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
5051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if src_bpp == 8 || src_bpp == 16
5061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        b       exit_label
5071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck142:
5081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
5091176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
5101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
5111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
5121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
5131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * end_of_line: per-line epilogue. Decrements the line counter Y, advances
 * DST (and SRC / MASK when those channels exist) by their strides, and
 * branches back to loop_label while lines remain.
 *
 *   restore_x    - if non-zero, reload X from ORIG_W for the next line
 *   vars_spilled - if non-zero, the registers in LINE_SAVED_REGS are first
 *                  reloaded from the stack, and Y is written back to [sp]
 *                  when bit 1 (r1, i.e. Y) is part of that register list
 *   loop_label   - start of the per-line loop
 *   last_one     - when empty, an explicit branch to local label 197
 *                  (spilled) or 198 (not spilled) follows the loop branch;
 *                  when non-empty, a branch to 198 is emitted only if
 *                  vars_spilled is zero and FLAG_SPILL_LINE_VARS is set,
 *                  otherwise execution falls through
 *
 * None of the instructions between the subs and the bhs sets the flags, so
 * the loop branch acts on the result of decrementing Y.
 */
5141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
5151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if vars_spilled
5161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
5171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* This is ldmia sp,{} */
5181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        .word   0xE89D0000 | LINE_SAVED_REGS
5191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
5201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        subs    Y, Y, #1
5211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if vars_spilled
5221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if (LINE_SAVED_REGS) & (1<<1)
5231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        str     Y, [sp]
5241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
5251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
5261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        add     DST, DST, STRIDE_D
5271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if src_bpp > 0
5281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        add     SRC, SRC, STRIDE_S
5291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
5301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if mask_bpp > 0
5311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        add     MASK, MASK, STRIDE_M
5321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
5331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if restore_x
5341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        mov     X, ORIG_W
5351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
5361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bhs     loop_label
5371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .ifc "last_one",""
5381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if vars_spilled
5391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        b       197f
5401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .else
5411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        b       198f
5421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
5431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
5441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
5451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        b       198f
5461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
5471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
5481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
5491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
5501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
5511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro generate_composite_function fname, \
5521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   src_bpp_, \
5531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   mask_bpp_, \
5541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   dst_w_bpp_, \
5551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   flags_, \
5561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   prefetch_distance_, \
5571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   init, \
5581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   newline, \
5591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   cleanup, \
5601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   process_head, \
5611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   process_tail, \
5621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck                                   process_inner_loop
5631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
5641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .func fname
5651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .global fname
5661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck /* For ELF format also set function visibility to hidden */
5671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#ifdef __ELF__
5681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .hidden fname
5691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .type fname, %function
5701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#endif
5711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
5721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
5731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Make some macro arguments globally visible and accessible
5741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * from other macros
5751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
5761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set src_bpp, src_bpp_
5771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set mask_bpp, mask_bpp_
5781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set dst_w_bpp, dst_w_bpp_
5791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set flags, flags_
5801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set prefetch_distance, prefetch_distance_
5811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
5821176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
5831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Select prefetch type for this function.
5841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
5851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if prefetch_distance == 0
5861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
5871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
5881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
5891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
5901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
5911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if src_bpp == 32
5921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set src_bpp_shift, 2
5931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif src_bpp == 24
5941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set src_bpp_shift, 0
5951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif src_bpp == 16
5961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set src_bpp_shift, 1
5971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif src_bpp == 8
5981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set src_bpp_shift, 0
5991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif src_bpp == 0
6001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set src_bpp_shift, -1
6011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
6021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .error "requested src bpp (src_bpp) is not supported"
6031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
6041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if mask_bpp == 32
6061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set mask_bpp_shift, 2
6071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif mask_bpp == 24
6081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set mask_bpp_shift, 0
6091176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif mask_bpp == 8
6101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set mask_bpp_shift, 0
6111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif mask_bpp == 0
6121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set mask_bpp_shift, -1
6131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
6141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .error "requested mask bpp (mask_bpp) is not supported"
6151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
6161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if dst_w_bpp == 32
6181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set dst_bpp_shift, 2
6191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif dst_w_bpp == 24
6201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set dst_bpp_shift, 0
6211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif dst_w_bpp == 16
6221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set dst_bpp_shift, 1
6231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif dst_w_bpp == 8
6241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set dst_bpp_shift, 0
6251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
6261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .error "requested dst bpp (dst_w_bpp) is not supported"
6271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
6281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (((flags) & FLAG_DST_READWRITE) != 0)
6301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set dst_r_bpp, dst_w_bpp
6311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
6321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .set dst_r_bpp, 0
6331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
6341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set pix_per_block, 16*8/dst_w_bpp
6361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if src_bpp != 0
6371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if 32*8/src_bpp > pix_per_block
6381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set pix_per_block, 32*8/src_bpp
6391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
6401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
6411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if mask_bpp != 0
6421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if 32*8/mask_bpp > pix_per_block
6431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set pix_per_block, 32*8/mask_bpp
6441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
6451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
6461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if dst_r_bpp != 0
6471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if 32*8/dst_r_bpp > pix_per_block
6481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set pix_per_block, 32*8/dst_r_bpp
6491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
6501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
6511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/* The standard entry conditions set up by pixman-arm-common.h are:
6531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * r0 = width (pixels)
6541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * r1 = height (rows)
6551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * r2 = pointer to top-left pixel of destination
6561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * r3 = destination stride (pixels)
6571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * [sp] = source pixel value, or pointer to top-left pixel of source
6581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * [sp,#4] = 0 or source stride (pixels)
6591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * The following arguments are unused for non-mask operations
6601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
6611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * [sp,#12] = 0 or mask stride (pixels)
6621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
6631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck/*
6651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck * Assign symbolic names to registers
6661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck */
6671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    X           .req    r0  /* pixels to go on this line */
6681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    Y           .req    r1  /* lines to go */
6691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    DST         .req    r2  /* destination pixel pointer */
6701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
6711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    SRC         .req    r4  /* source pixel pointer */
6721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
6731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    MASK        .req    r6  /* mask pixel pointer (if applicable) */
6741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
6751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    WK0         .req    r8  /* pixel data registers */
6761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    WK1         .req    r9
6771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    WK2         .req    r10
6781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    WK3         .req    r11
6791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    SCRATCH     .req    r12
6801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    ORIG_W      .req    r14 /* width (pixels) */
6811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6821176bdada62cabc6ec4b0308a930e83b679d5d36John Reckfname:
6831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        push    {r4-r11, lr}        /* save all registers */
6841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        subs    Y, Y, #1
6861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        blo     199f
6871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#ifdef DEBUG_PARAMS
6891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub     sp, sp, #9*4
6901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#endif
6911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
6921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if src_bpp > 0
6931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
6941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
6951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
6961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if mask_bpp > 0
6971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
6981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
6991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
7001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#ifdef DEBUG_PARAMS
7021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        add     Y, Y, #1
7031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        stmia   sp, {r0-r7,pc}
7041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub     Y, Y, #1
7051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#endif
7061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        init
7081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7091176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
7101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
7111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if src_bpp > 0
7121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        lsl     STRIDE_S, #src_bpp_shift
7131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
7141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
7151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if mask_bpp > 0
7161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        lsl     STRIDE_M, #mask_bpp_shift
7171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
7181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
7191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
7211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        cmp     X, #2*16*8/dst_w_bpp - 1
7221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        blo     170f
7231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
7241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
7251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
7261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        blo     160f
7271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Wide case */
7291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* Adjust X so that the decrement instruction can also test for
7301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * inner loop termination. We want it to stop when there are
7311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck         * (prefetch_distance+1) complete blocks to go. */
7321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub     X, X, #(prefetch_distance+2)*pix_per_block
7331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        mov     ORIG_W, X
7341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
7351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* This is stmdb sp!,{} */
7361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        .word   0xE92D0000 | LINE_SAVED_REGS
7371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
7381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck151:    /* New line */
7391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        newline
7401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_leading_step1  src_bpp, WK1, SRC
7411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_leading_step1  mask_bpp, WK2, MASK
7421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_leading_step1  dst_r_bpp, WK3, DST
7431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     DST, #15
7451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        beq     154f
7461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
7471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
7481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        PF  and,    WK0, WK0, #15
7491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
7501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
7521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
7531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
7541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        leading_15bytes  process_head, process_tail
7561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
7581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
7591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        and     SCRATCH, SRC, #31
7601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
7611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
7621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        and     SCRATCH, MASK, #31
7631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
7641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
7651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .ifc "process_inner_loop",""
7661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
7671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .else
7681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
7691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
7701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck157:    /* Check for another line */
7721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
7731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
7741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .ltorg
7761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck160:    /* Medium case */
7781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        mov     ORIG_W, X
7791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
7801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* This is stmdb sp!,{} */
7811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        .word   0xE92D0000 | LINE_SAVED_REGS
7821176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
7831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck161:    /* New line */
7841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        newline
7851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
7861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_line 0, mask_bpp, mask_bpp_shift, MASK
7871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
7881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
7901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     DST, #15
7911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        beq     164f
7921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
7931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        leading_15bytes  process_head, process_tail
7951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
7971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
7981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
7991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck167:    /* Check for another line */
8001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
8011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .ltorg
8031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
8051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if dst_w_bpp < 32
8061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        mov     ORIG_W, X
8071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
8081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
8091176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        /* This is stmdb sp!,{} */
8101176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        .word   0xE92D0000 | LINE_SAVED_REGS
8111176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
8121176bdada62cabc6ec4b0308a930e83b679d5d36John Reck171:    /* New line */
8131176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        newline
8141176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
8151176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_line 1, mask_bpp, mask_bpp_shift, MASK
8161176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
8171176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8181176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if dst_w_bpp == 8
8191176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     DST, #3
8201176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        beq     174f
8211176bdada62cabc6ec4b0308a930e83b679d5d36John Reck172:    subs    X, X, #1
8221176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        blo     177f
8231176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_head  , 1, 0, 1, 1, 0
8241176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_tail  , 1, 0
8251176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if !((flags) & FLAG_PROCESS_DOES_STORE)
8261176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixst   , 1, 0, DST
8271176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
8281176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     DST, #3
8291176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        bne     172b
8301176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .elseif dst_w_bpp == 16
8311176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        tst     DST, #2
8321176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        beq     174f
8331176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        subs    X, X, #1
8341176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        blo     177f
8351176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_head  , 2, 0, 1, 1, 0
8361176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        process_tail  , 2, 0
8371176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .if !((flags) & FLAG_PROCESS_DOES_STORE)
8381176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pixst   , 2, 0, DST
8391176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
8401176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
8411176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8421176bdada62cabc6ec4b0308a930e83b679d5d36John Reck174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
8431176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
8441176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8451176bdada62cabc6ec4b0308a930e83b679d5d36John Reck177:    /* Check for another line */
8461176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
8471176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8481176bdada62cabc6ec4b0308a930e83b679d5d36John Reck197:
8491176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .if (flags) & FLAG_SPILL_LINE_VARS
8501176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        add     sp, sp, #LINE_SAVED_REG_COUNT*4
8511176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endif
8521176bdada62cabc6ec4b0308a930e83b679d5d36John Reck198:
8531176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        cleanup
8541176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8551176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#ifdef DEBUG_PARAMS
8561176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        add     sp, sp, #9*4 /* junk the debug copy of arguments */
8571176bdada62cabc6ec4b0308a930e83b679d5d36John Reck#endif
8581176bdada62cabc6ec4b0308a930e83b679d5d36John Reck199:
8591176bdada62cabc6ec4b0308a930e83b679d5d36John Reck        pop     {r4-r11, pc}  /* exit */
8601176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8611176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .ltorg
8621176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
8631176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  X
8641176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  Y
8651176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  DST
8661176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  STRIDE_D
8671176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  SRC
8681176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  STRIDE_S
8691176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  MASK
8701176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  STRIDE_M
8711176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  WK0
8721176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  WK1
8731176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  WK2
8741176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  WK3
8751176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  SCRATCH
8761176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .unreq  ORIG_W
8771176bdada62cabc6ec4b0308a930e83b679d5d36John Reck    .endfunc
8781176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
8791176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * line_saved_regs - declare which per-line variables are spilled to the stack.
 *
 * Takes a comma-separated list of register alias names and derives two
 * assembly-time symbols from it:
 *
 *   LINE_SAVED_REGS       bitmask with one bit per recognised name; it is
 *                         ORed into the hand-encoded "stmdb sp!,{}" opcode
 *                         word (0xE92D0000 | LINE_SAVED_REGS)
 *   LINE_SAVED_REG_COUNT  number of recognised names; multiplied by 4 to
 *                         drop the spilled words from the stack again
 *                         (add sp, sp, #LINE_SAVED_REG_COUNT*4)
 *
 * Recognised names and the mask bit each one sets:
 *   Y -> bit 1, STRIDE_D -> bit 3, STRIDE_S -> bit 5,
 *   STRIDE_M -> bit 7, ORIG_W -> bit 14
 * (presumably the numbers of the core registers these names are aliased to
 * by .req elsewhere in the file - TODO confirm against the .req list).
 *
 * Any other name in the list matches none of the tests below and is
 * silently ignored.
 *
 * NOTE(review): the count is incremented on every match while the mask bit
 * is only ORed in, so listing the same name twice would make
 * LINE_SAVED_REG_COUNT larger than the number of registers actually pushed.
 *
 * NOTE(review): SAVED_REG is referenced without a leading backslash, which
 * presumably relies on .altmacro mode being enabled earlier - confirm.
 */
8801176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro line_saved_regs  x:vararg
 /* Start from an empty set on every invocation */
8811176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set LINE_SAVED_REGS, 0
8821176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .set LINE_SAVED_REG_COUNT, 0
 /* Visit each listed name in turn and compare it textually against the known aliases */
8831176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .irp SAVED_REG,x
8841176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc "SAVED_REG","Y"
8851176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
8861176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
8871176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
8881176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc "SAVED_REG","STRIDE_D"
8891176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
8901176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
8911176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
8921176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc "SAVED_REG","STRIDE_S"
8931176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
8941176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
8951176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
8961176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc "SAVED_REG","STRIDE_M"
8971176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
8981176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
8991176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
9001176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .ifc "SAVED_REG","ORIG_W"
9011176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
9021176bdada62cabc6ec4b0308a930e83b679d5d36John Reck   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
9031176bdada62cabc6ec4b0308a930e83b679d5d36John Reck  .endif
9041176bdada62cabc6ec4b0308a930e83b679d5d36John Reck .endr
9051176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
9061176bdada62cabc6ec4b0308a930e83b679d5d36John Reck
/*
 * nop_macro - accepts any number of arguments and expands to nothing.
 *
 * Presumably intended as a placeholder wherever a macro name must be
 * supplied but no code should be emitted - confirm against the callers.
 */
9071176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.macro nop_macro x:vararg
9081176bdada62cabc6ec4b0308a930e83b679d5d36John Reck.endm
909