/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Function entry/exit helpers.
 *
 * ENTRY(f) switches to .text, aligns to 2^4 = 16 bytes (on ARM the .align
 * operand is a power of two), exports f as a function symbol, emits its label
 * and opens an unwind-table entry with .fnstart.  END(f) closes that entry
 * with .fnend and records the size of the symbol.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot).  It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.  Kept as a shift/size pair so the size is always a power of
 * two.
 */
.set CHUNKSHIFT, 7
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.  Also kept as a shift/size pair (a power of two).
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)

/* vert8 -- vertical convolution of eight bytes from each of four lines.
 *
 * Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in d6[0..3], leaving the results in
 * dstlo/dsthi (q12 by default).  This gives eight 16-bit results representing
 * a horizontal line of 2-8 input pixels (depending on number of components
 * per pixel) to be fed into the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * bit of overshoot beyond [0,1.0).
 *
 * In:       r4-r7  pointers into the four source lines; the lines read through
 *                  r4..r7 are weighted by d6[0]..d6[3] respectively
 *           d6     the four 16-bit coefficients
 * Out:      dstlo  results 0-3, dsthi results 4-7 (signed 16-bit, saturated)
 * Clobbers: q8-q11 (widened input) and q12-q13 (32-bit accumulators), even
 *           when dstlo/dsthi name other registers; r4-r7 each advance by 8.
 */
.macro vert8, dstlo=d24, dsthi=d25
        /* Fetch eight bytes per line; each pointer post-increments by 8. */
        vld1.u8     d16, [r4]!
        vld1.u8     d18, [r5]!
        vld1.u8     d20, [r6]!
        vld1.u8     d22, [r7]!
        /* Widen u8 -> u16 in place: each q register now holds one line. */
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        /* q12 accumulates lanes 0-3 and q13 lanes 4-7, in 32 bits:
         *   acc = line1*c[1] - line0*c[0] + line2*c[2] - line3*c[3]
         */
        vmull.u16   q12, d18, d6[1]
        vmull.u16   q13, d19, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlsl.u16   q13, d17, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlal.u16   q13, d21, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vmlsl.u16   q13, d23, d6[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).  The narrowing shift saturates to signed 16-bit.
         */
        vqshrn.s32  \dstlo, q12, #8 + 16 - VERTBITS
        vqshrn.s32  \dsthi, q13, #8 + 16 - VERTBITS
.endm

/* vert4 -- as vert8, but only four 16-bit results, always into d25.
 *
 * In:       r4-r7  pointers into the four source lines; the lines read through
 *                  r4..r7 are weighted by d6[0]..d6[3] respectively
 *           d6     the four 16-bit coefficients
 * Out:      d25    four signed 16-bit results (saturated)
 * Clobbers: q8-q11 and q12 (d25 is the upper half of q12); r4-r7 each advance
 *           by 4.
 */
.macro vert4
        /* Fetch four bytes per line into lane 0 only; the other half of each
         * d register is not loaded, so only the low four widened lanes are
         * meaningful.  Each pointer post-increments by 4.
         */
        vld1.u32    d16[0], [r4]!
        vld1.u32    d18[0], [r5]!
        vld1.u32    d20[0], [r6]!
        vld1.u32    d22[0], [r7]!
        /* Widen u8 -> u16; the valid lanes end up in d16, d18, d20 and d22. */
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        /* acc = line1*c[1] - line0*c[0] + line2*c[2] - line3*c[3] */
        vmull.u16   q12, d18, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        /* Same scaling as vert8: drop 8 + 16 - VERTBITS bits, saturating to
         * signed 16-bit.
         */
        vqshrn.s32  d25, q12, #8 + 16 - VERTBITS
.endm


/* During horizontal resize having CHUNKSIZE input available means being able
 * to produce a varying amount of output, depending on the phase of the data.
 * This function calculates the minimum number of VECSIZE chunks extracted from
 * a CHUNKSIZE window (r1), and the threshold value for when the count will be
 * one higher than that (r0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      ((CHUNKSIZE << 16) + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 * CHUNKSIZE is scaled by 1 << 16 to put it in the same units as xinc
 * (presumably a 16.16 fixed-point step -- confirm against the caller).
 *
 * The two values can be packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 *
 * In:       r0 = xinc (expected to be non-zero: it forms the divisor)
 * Out:      r1 = quotient, r0 = remainder; the r0/r1 pair is the uint64_t
 *           return value
 * Clobbers: r2, r3 and, on the software-divide path, the condition flags.
 */

/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */
ENTRY(rsdIntrinsicResize_oscctl_K)
        /* r2 = divisor = xinc * VECSIZE */
        lsl         r2, r0, #VECSHIFT
        /* r0 = dividend = (CHUNKSIZE << 16) - 1 + divisor, so that the
         * quotient is rounded up.
         */
        movw        r0, #:lower16:(CHUNKSIZE << 16) - 1
        movt        r0, #:upper16:(CHUNKSIZE << 16) - 1
        add         r0, r0, r2
#if defined(ARCH_ARM_USE_UDIV)
        /* Hardware divide: r1 = r0 / r2, then r0 = r0 - r1 * r2. */
        udiv        r1, r0, r2
        mls         r0, r1, r2, r0
#else
        /* Software shift-and-subtract divide.
         * r3 = number of bits the divisor must be shifted left to line its
         * top bit up with the dividend's, clamped to zero when the divisor
         * already has the higher top bit.
         */
        clz         r3, r2
        clz         r1, r0
        subs        r3, r3, r1
        movlt       r3, #0
        mov         r1, #1
        lsl         r2, r2, r3              /* r2 = aligned divisor */
        lsl         r3, r1, r3              /* r3 = matching quotient bit */
        mov         r1, #0                  /* r1 = quotient accumulator */
        /* One quotient bit per pass: if the shifted divisor fits in what is
         * left of the dividend (unsigned compare), set the bit and subtract.
         * The flags tested by bne come from lsrs; the plain lsr that follows
         * does not alter them, so the loop ends once the quotient bit has
         * been shifted out.
         */
1:      cmp         r2, r0
        addls       r1, r3
        subls       r0, r2
        lsrs        r3, r3, #1
        lsr         r2, r2, #1
        bne         1b
#endif
        bx          lr
END(rsdIntrinsicResize_oscctl_K)

1393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
1403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * For the most part the vertical pass (the outer loop) is the same for all
1413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * versions.  Exceptions are handled in-line with conditional assembly.
1423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */
143e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh.irp comp, 1, 2, 4
1443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
1453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 0
1463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
1473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 1
1483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
1493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 2
1503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.else
1513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.error "Unknown component count"
1523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
1533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
1543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
1553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
1573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set OSC_STORE, (BUFFER_SIZE + 0)
1583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set OSCSTEP_STORE, (BUFFER_SIZE + 4)
1593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set OSCCTL_STORE, (BUFFER_SIZE + 8)
1603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set AVAIL_STORE, (BUFFER_SIZE + 16)
1613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set SP_STORE, (BUFFER_SIZE + 24)   /* should be +20, but rounded up to make a legal constant somewhere */
1623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* void rsdIntrinsicResizeB\comp\()_K(
1643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t * restrict dst,          // r0
1653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             size_t count,                    // r1
1663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint32_t xf,                     // r2
1673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint32_t xinc,                   // r3
1683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t const * restrict srcn,   // [sp]     -> [sp,#104] -> r4
1693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t const * restrict src0,   // [sp,#4]  -> [sp,#108] -> r5
1703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t const * restrict src1,   // [sp,#8]  -> [sp,#112] -> r6
1713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t const * restrict src2,   // [sp,#12] -> [sp,#116] -> r7
1723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             size_t xclip,                    // [sp,#16] -> [sp,#120]
1733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             size_t avail,                    // [sp,#20] -> [sp,#124] -> lr
1743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint64_t osc_ctl,                // [sp,#24] -> [sp,#128]
1753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             int32_t const *yr);              // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access)
1763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */
1773a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieENTRY(rsdIntrinsicResizeB\comp\()_K)
1783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
1793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vpush       {d8-d15}
1803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* align the working buffer on the stack to make it easy to use bit
1823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * twiddling for address calculations and bounds tests.
1833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
1843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r12, sp, #BUFFER_SIZE + 32
1853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         lr, sp
1863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bfc         r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1
1873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         sp, r12
1883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            str         lr, [sp,#SP_STORE]
1893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldr         r8, [lr,#136]           // yr
1913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            adr         r9, 8f
1923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s32    {q4}, [r8]
1933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q5}, [r9]
1943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqmovun.s32 d8, q4                  // yr
1953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vdup.s16    q6, r2
1963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vdup.s16    q7, r3
1973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmla.s16    q6, q5, q7              // vxf
1983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vshl.s16    q7, q7, #VECSHIFT       // vxinc
1993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldrd        r4,r5, [lr,#104]        // srcn, src0
2013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldrd        r6,r7, [lr,#112]        // src1, src2
2023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Compute starting condition for oscillator used to compute ahead
2043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * of time how many iterations are possible before needing to
2053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * refill the working buffer.  This is based on the fixed-point
2063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * index of the last element in the vector of pixels processed in
2073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * each iteration, counting up until it would overflow.
2083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r8, r2, r3
2103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r9, r3, LSL #VECSHIFT
2113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r8, r8, r9
2123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldrd        r10,r11, [lr,#128]      // osc_ctl
2143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            str         r8, [sp,#OSC_STORE]
2163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            str         r9, [sp,#OSCSTEP_STORE]
2173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            str         r10, [sp,#OSCCTL_STORE]
2183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            str         r11, [sp,#OSCCTL_STORE+4]
2193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldrd        r10,r11, [lr,#120]      // xclip,avail
2203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* r4-r7 contain pointers to the four lines of input to be
2233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * convolved.  These pointers have been clamped vertically and
2243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * horizontally (which is why it's not a simple row/stride pair),
2253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * and the xclip argument (now in r10) indicates how many pixels
2263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * from true the x position of the pointer is.  This value should
2273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * be 0, 1, or 2 only.
2283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
2293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Start by placing four pixels worth of input at the far end of
2303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the buffer.  As many as two of these may be clipped, so four
2313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * pixels are fetched, and then the first pixel is duplicated and
2323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the data shifted according to xclip.  The source pointers are
2333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * then also adjusted according to xclip so that subsequent fetches
2343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * match.
2353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d6, d8  /* make y coeffs available for vert4 and vert8 macros */
2373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r8, r12, r10, LSL #COMPONENT_SHIFT + 1
2393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
2403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r8, r8, #4 * COMPONENT_COUNT * 2
2413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
2423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert4
2433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vdup.s16    d24, d25[0]
2443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q12}, [r12]
2453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {d24}, [r8]
2463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {d24}, [r9]
2473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
2483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8
2493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vdup.u32    q11, d24[0]
2503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q11,q12}, [r12]
2513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q12}, [r8]
2523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q12}, [r9]
2533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
2543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8       d28, d29
2553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8       d30, d31
2563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov.u64    d24, d28
2573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov.u64    d25, d28
2583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov.u64    d26, d28
2593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov.u64    d27, d28
2603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q12,q13}, [r12]!
2613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q14,q15}, [r12]
2623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r12, r12, #32
2633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q11,q12}, [r8]
2643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q11,q12}, [r9]
2653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
2663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Count off four pixels into the working buffer, and move count to
2673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * its new home.
2683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         lr, r11, #4
2703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
2713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * were read unconditionally, but some may have been discarded by
2723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * xclip, so we rewind the pointers to compensate.
2733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r4, r4, r10, LSL #COMPONENT_SHIFT
2753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r5, r5, r10, LSL #COMPONENT_SHIFT
2763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r6, r6, r10, LSL #COMPONENT_SHIFT
2773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r7, r7, r10, LSL #COMPONENT_SHIFT
2783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* First tap starts where we just pre-filled, at the end of the
2803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * buffer.
2813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, #(CHUNKSIZE * 2 - 4) << 16
2833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Use overflowing arithmetic to implement wraparound array
2853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * indexing.
2863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r2, r2, LSL #(15 - CHUNKSHIFT)
2883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r3, r3, LSL #(15 - CHUNKSHIFT)
2893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            str         lr, [sp,#AVAIL_STORE]
2913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Start of outermost loop.
2933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
2943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * number of iterations of the inner loop that can be performed and
2953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * get into that.
2963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
2973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * The fill is complicated by the possibility of running out of
2983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * input before the scratch buffer is filled.  If this isn't a risk
2993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * then it's handled by the simple loop at 2:, otherwise the
3003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * horrible loop at 3:.
3013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          ldr         lr, [sp,#AVAIL_STORE]   /* get number of pixels available */
3033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d6, d8              /* put y scaling coefficients somewhere handy */
3043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            subs        lr, #CHUNKSIZE
3053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bge         2f                  /* if at least CHUNKSIZE are available... */
3063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         lr, #CHUNKSIZE      /* if they're not... */
3073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           4f
3083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* ..just sneaking a literal in here after this unconditional branch.. */
3093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
3103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* basic fill loop, processing 8 bytes at a time until there are
3113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * fewer than eight bytes available.
3123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3:          vert8
3143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         lr, lr, #8 / COMPONENT_COUNT
3153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q12}, [r12]!
3163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4:          cmp         lr, #8 / COMPONENT_COUNT - 1
3173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bgt         3b
3183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 4
3193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            blt         3f
3203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The last pixel (four bytes) if necessary */
3213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert4
3223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.else
3233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         lr, #1
3243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            blt         3f
3253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The last pixels if necessary */
3263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r4, r4, #8
3273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r5, r5, #8
3283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r6, r6, #8
3293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r7, r7, #8
3303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r4, r4, lr, LSL #COMPONENT_SHIFT
3313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r5, r5, lr, LSL #COMPONENT_SHIFT
3323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r6, r6, lr, LSL #COMPONENT_SHIFT
3333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r7, r7, lr, LSL #COMPONENT_SHIFT
3343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8
3353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         lr, sp, lr, LSL #COMPONENT_SHIFT + 1
3363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         sp, sp, #32
3373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         lr, lr, #16
3383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
3393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vdup.s16    q13, d25[3]
3403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
3413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vdup.u32    q13, d25[1]
3423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q12,q13}, [sp]
3443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q12}, [lr]
3453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         sp, sp, #32
3463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           4f
3473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Keep filling until we get to the end of this chunk of the buffer */
3493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3:
3503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
3513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vdup.s16    q12, d25[3]
3523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
3533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vdup.u32    q12, d25[1]
3543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
3553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov.u64    d24, d25
3563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4:          vst1.s16    {q12}, [r12]!
3583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
3593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bne         3b
3603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           4f
3613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.align 4
3633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie2:          /* Quickly pull a chunk of data into the working buffer.
3643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8
3663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q12}, [r12]!
3673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8
3683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q12}, [r12]!
3693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
3703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bne         2b
3713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         lr, #0
3723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bne         3f
3733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4:          /* if we end with 0 pixels left we'll have nothing handy to spread
3743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * across to the right, so we rewind a bit.
3753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         lr, #1
3773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r4, r4, #COMPONENT_COUNT
3783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r5, r5, #COMPONENT_COUNT
3793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r6, r6, #COMPONENT_COUNT
3803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r7, r7, #COMPONENT_COUNT
3813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3:          str         lr, [sp,#AVAIL_STORE]       /* done with available pixel count */
3823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         lr, sp, #OSC_STORE
3833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldrd        r8,r9, [lr,#0]              /* need osc, osc_step soon */
3843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldrd        r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */
3853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* copy four taps (width of cubic window) to far end for overflow
3873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * address handling
3883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
3903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            eor         r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2
3913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
3923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {d28}, [lr]
3933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
3943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q14}, [lr]
3953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
3963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q14,q15}, [lr]
3973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
3993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
4003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {d28}, [lr]
4013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
4023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q14}, [lr]
4033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
4043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.s16    {q14,q15}, [lr]
4053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
4063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* r11 contains the maximum possible iteration count, but if r8 is
4073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * greater than r10 then this indicates that the count must be
4083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * reduced by one for this iteration to avoid reading past the end
4093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * of the available data.
4103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
4113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp             r10, r8
4123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sbc         lr, r11, #0
4133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mla         r8, lr, r9, r8
4153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r8, r8, #(CHUNKSIZE << 16)
4163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            str         r8, [sp,#OSC_STORE]         /* done with osc */
4183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* prefer to count pixels, rather than vectors, to clarify the tail
4203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * store case on exit.
4213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
4223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         lr, lr, LSL #VECSHIFT
4233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         lr, r1
4243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            movgt       lr, r1
4253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         r1, r1, lr
4273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         lr, lr, LSL #COMPONENT_SHIFT
4293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov.i16    d10, #3
4313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov.i16    d11, #0x8000
4323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         lr, #0
4343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bgt         3f
4353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         r1, #0
4363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
4373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           9f
4383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            .align 4
4403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie2:          /* Inner loop continues here, but starts at 3:, see end of loop
4413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * below for explanation. */
4423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if LOOP_OUTPUT_SIZE == 4
4433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u32    {d16[0]}, [r0]!
4443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 8
4453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {d16}, [r0]!
4463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 16
4473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {q8}, [r0]!
4483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 32
4493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {q8,q9}, [r0]!
4503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
4513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Inner loop:  here the four x coefficients for each tap are
4523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * calculated in vector code, and the addresses are calculated in
4533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * scalar code, and these calculations are interleaved.
4543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
4553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3:          vshr.u16    q8, q6, #1
4563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
4573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrdmulh.s16 q9, q8, q8
4583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
4593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrdmulh.s16 q10, q9, q8
4603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
4613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vshll.s16   q11, d18, #2
4623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vshll.s16   q12, d19, #2
4633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
4643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d20, d10
4653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q12, d21, d10
4663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
4673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vhadd.s16   q0, q10, q8
4693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
4703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vsub.s16    q0, q9, q0
4713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
4723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vaddw.s16   q1, q11, d18
4743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vaddw.s16   q13, q12, d19
4753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
4763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vshrn.s32   d2, q1, #1
4773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vshrn.s32   d3, q13, #1
4783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
4793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vsub.s16    d2, d2, d11
4803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vsub.s16    d3, d3, d11 // TODO: find a wider d11 and use q-reg operation
4813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
4823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vaddw.s16   q2, q11, d16
4843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vaddw.s16   q13, q12, d17
4853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
4863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vshrn.s32   d4, q2, #1
4873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vshrn.s32   d5, q13, #1
4883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
4893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vneg.s16    q2, q2
4903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vhsub.s16   q3, q10, q9
4923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* increment the x fractional parts (overflow is ignored, as the
4943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * scalar arithmetic shadows this addition with full precision).
4953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
4963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vadd.s16    q6, q6, q7
4973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* At this point we have four pointers in r8-r11, pointing to the
4993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * four taps in the scratch buffer that must be convolved together
5003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * to produce an output pixel (one output pixel per pointer).
5013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * These pointers usually overlap, but their spacing is irregular
5023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * so resolving the redundancy through L1 is a pragmatic solution.
5033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
5043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * The scratch buffer is made of signed 16-bit data, holding over
5053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * some extra precision, and overshoot, from the vertical pass.
5063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
5073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * We also have the 16-bit unsigned fixed-point weights for each
5083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * of the four taps in q0 - q3.  That's eight pixels worth of
5093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * coefficients when we have only four pointers, so calculations
5103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * for four more pixels are interleaved with the fetch and permute
5113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * code for each variant in the following code.
5123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
5133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * The data arrangement is less than ideal for any pixel format,
5143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * but permuting loads help to mitigate most of the problems.
5153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
5163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Note also that the two outside taps of a bicubic are negative,
5173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * but these coefficients are unsigned.  The sign is hard-coded by
5183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * use of multiply-and-subtract operations.
5193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
5203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
5213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The uchar 1 case.
5223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Issue one lanewise vld4.s16 to load four consecutive pixels from
5233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * one pointer (one pixel) into four different registers; then load
5243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * four consecutive s16 values from the next pointer (pixel) into
5253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the next lane of those four registers, etc., so that we finish
5263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * with q12 - q15 representing the four taps, and each lane
5273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * representing a separate pixel.
5283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
5293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * The first vld4 uses a splat to avoid any false dependency on
5303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the previous state of the register.
5313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
5323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.s16    {d24[],d26[],d28[],d30[]}, [r8]
5333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
5343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
5353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.s16    {d24[1],d26[1],d28[1],d30[1]}, [r9]
5363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
5373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
5383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
5393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.s16    {d24[2],d26[2],d28[2],d30[2]}, [r10]
5403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
5413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
5423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
5433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.s16    {d24[3],d26[3],d28[3],d30[3]}, [r11]
5443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
5453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
5463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
5473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.s16    {d25[],d27[],d29[],d31[]}, [r8]
5483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
5493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.s16    {d25[1],d27[1],d29[1],d31[1]}, [r9]
5503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.s16    {d25[2],d27[2],d29[2],d31[2]}, [r10]
5513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.s16    {d25[3],d27[3],d29[3],d31[3]}, [r11]
5523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q8, d24, d0
5543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q9, d25, d1
5553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q8, d26, d2
5563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q9, d27, d3
5573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q8, d28, d4
5583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q9, d29, d5
5593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q8, d30, d6
5603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q9, d31, d7
5613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            subs        lr, lr, #LOOP_OUTPUT_SIZE
5633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d16, q8, #15
5653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d17, q9, #15
5663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrun.s16 d16, q8, #VERTBITS - 8
5683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
5693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The uchar2 case:
5703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * This time load pairs of values into adjacent lanes in q12 - q15
5713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * by aliasing them as u32 data; leaving room for only four pixels,
5723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * so the process has to be done twice.  This also means that the
5733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * coefficient registers fail to align with the coefficient data
5743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * (eight separate pixels), so that has to be doubled-up to match.
5753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
5763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
5773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
5783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
5793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
5803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
5813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
5823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
5833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
5843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
5853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
5863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
5873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
5883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
5893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
5903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
5913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* double-up coefficients to align with component pairs */
5933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d20, d0
5943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
5953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d21, d2
5963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d22, d4
5973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d23, d6
5983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vzip.s16    d0, d20
5993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vzip.s16    d2, d21
6003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vzip.s16    d4, d22
6013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vzip.s16    d6, d23
6023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q8, d24, d0
6043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q9, d25, d20
6053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q8, d26, d2
6063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q9, d27, d21
6073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q8, d28, d4
6083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q9, d29, d22
6093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q8, d30, d6
6103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q9, d31, d23
6113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d16, q8, #15
6133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d17, q9, #15
6143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
6163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
6173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
6183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
6193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* double-up coefficients to align with component pairs */
6213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d0, d1
6223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d2, d3
6233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d4, d5
6243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        d6, d7
6253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vzip.s16    d0, d1
6263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vzip.s16    d2, d3
6273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vzip.s16    d4, d5
6283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vzip.s16    d6, d7
6293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q10, d24, d0
6313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q11, d25, d1
6323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q10, d26, d2
6333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d27, d3
6343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q10, d28, d4
6353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d29, d5
6363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q10, d30, d6
6373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q11, d31, d7
6383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            subs        lr, lr, #LOOP_OUTPUT_SIZE
6403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d18, q10, #15
6423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d19, q11, #15
6433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrun.s16 d16, q8, #VERTBITS - 8
6453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrun.s16 d17, q9, #VERTBITS - 8
6463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
6473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The uchar4 case.
6483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * This case is comparatively painless because four s16s are the
6493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * smallest addressable unit for a vmul-by-scalar.  Rather than
6503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * permute the data, simply arrange the multiplies to suit the way
6513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the data comes in.  That's a lot of data, though, so things
6523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * progress in pairs of pixels at a time.
6533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
6543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q12,q13}, [r8]
6553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
6563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
6573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q14,q15}, [r9]
6583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
6593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
6603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
6613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q8, d24, d0[0]
6633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q9, d28, d0[1]
6643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q8, d25, d2[0]
6653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q9, d29, d2[1]
6663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q8, d26, d4[0]
6673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q9, d30, d4[1]
6683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q8, d27, d6[0]
6693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q9, d31, d6[1]
6703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* And two more...  */
6723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q12,q13}, [r10]
6733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
6743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
6753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
6763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q14,q15}, [r11]
6773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
6783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
6793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r2, r2, r3
6803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d16, q8, #15
6823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
6833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d17, q9, #15
6843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q10, d24, d0[2]
6863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q11, d28, d0[3]
6873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q10, d25, d2[2]
6883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d29, d2[3]
6893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q10, d26, d4[2]
6903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d30, d4[3]
6913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q10, d27, d6[2]
6923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q11, d31, d6[3]
6933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d18, q10, #15
6953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d19, q11, #15
6963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrun.s16 d16, q8, #VERTBITS - 8
6983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrun.s16 d17, q9, #VERTBITS - 8
6993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
7003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* And two more...  */
7013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q12,q13}, [r8]
7023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q14,q15}, [r9]
7033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
7043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q10, d24, d1[0]
7053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q11, d28, d1[1]
7063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q10, d25, d3[0]
7073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d29, d3[1]
7083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q10, d26, d5[0]
7093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d30, d5[1]
7103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q10, d27, d7[0]
7113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q11, d31, d7[1]
7123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
7133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* And two more...  */
7143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q12,q13}, [r10]
7153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vld1.s16    {q14,q15}, [r11]
7163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
7173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            subs        lr, lr, #LOOP_OUTPUT_SIZE
7183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
7193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d18, q10, #15
7203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d19, q11, #15
7213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
7223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q10, d24, d1[2]
7233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmull.s16   q11, d28, d1[3]
7243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q10, d25, d3[2]
7253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d29, d3[3]
7263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q10, d26, d5[2]
7273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlsl.s16   q11, d30, d5[3]
7283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q10, d27, d7[2]
7293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmlal.s16   q11, d31, d7[3]
7303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
7313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d20, q10, #15
7323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrn.s32 d21, q11, #15
7333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
7343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrun.s16 d18, q9, #VERTBITS - 8
7353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vqrshrun.s16 d19, q10, #VERTBITS - 8
7363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bgt         2b      /* continue inner loop */
7383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The inner loop has already been limited to ensure that none of
7393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the earlier iterations could overfill the output, so the store
7403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * appears within the loop but after the conditional branch (at the
7413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * top).  At the end, provided it won't overfill, perform the final
7423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * store here.  If it would, then break out to the tricky tail case
7433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * instead.
7443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
7453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            blt         1f
7463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Store the amount of data appropriate to the configuration of the
7473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * instance being assembled.
7483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
7493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if LOOP_OUTPUT_SIZE == 4
7503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u32    {d16[0]}, [r0]!
7513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 8
7523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {d16}, [r0]!
7533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 16
7543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {q8}, [r0]!
7553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 32
7563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {q8,q9}, [r0]!
7573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           1b              /* resume outer loop */
7593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Partial tail store case:
7603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Different versions of the code need different subsets of the
7613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * following partial stores.  Here the number of components and the
7623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * size of the chunk of data produced by each inner loop iteration
7633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * is tested to figure out whether or not each phrase is relevant.
7643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
7653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
7663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         lr, #16
7673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {q8}, [r0]!
7693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov        q8, q9
7703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
7723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         lr, #8
7733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {d16}, [r0]!
7753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vmov.u8     d16, d17
7763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
7783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         lr, #4
7793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u32    {d16[0]}, [r0]!
7813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vext.u32    d16, d16, d16, #1
7823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
7843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         lr, #2
7853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u16    {d16[0]}, [r0]!
7873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vext.u16    d16, d16, d16, #1
7883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
7903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         lr, #1
7913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vst1.u8     {d16[0]}, [r0]!
7933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:
7953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie9:          ldr         sp, [sp,#SP_STORE]
7963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vpop        {d8-d15}
7973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
7983a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieEND(rsdIntrinsicResizeB\comp\()_K)
7993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endr
800