/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Function entry/exit helpers.  These are C-preprocessor macros, so each one
 * expands to several assembler statements on a single line, separated by ';'.
 *
 * ENTRY(f) switches to .text, applies .align 4, declares f as a global
 *          function symbol and then defines the label f.
 * END(f)   records the size of f as the distance from its label to the
 *          current location.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot).  It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.  CHUNKSIZE is always a power of two (1 << CHUNKSHIFT), which
 * the fill loops rely on when they test buffer addresses with bit masks.
 */
.set CHUNKSHIFT, 7       /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)

/* vert8 -- vertical convolution of eight bytes from each of four input lines.
 *
 * Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in v3[0..3], leaving the results in
 * v12 (by default).  This gives eight 16-bit results representing a
 * horizontal line of 2-8 input pixels (depending on number of components per
 * pixel) to be fed into the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and UMLSL/UMLSL2 are used to implement
 * this).  Output is VERTBITS signed fixed-point, which must leave room for a
 * little overshoot.
 *
 * In:      x4-x7       pointers into the four lines; each is post-incremented
 *                      by 8 bytes
 *          v3.h[0..3]  vertical coefficients
 * Out:     \dstlo      results 0-3 (written with SQSHRN)
 *          \dsthi      results 4-7 (written to the upper half with SQSHRN2);
 *                      normally the .8h view of the same register as \dstlo
 * Clobber: v8-v11 (widened input), v12-v13 (32-bit accumulators -- these are
 *          overwritten even when the destination is another register).
 *          \dstlo must not be v13, which is still live when \dstlo is written.
 */
.macro vert8, dstlo=v12.4h, dsthi=v12.8h
        /* Fetch eight bytes from each line and widen them to 16 bits. */
        ld1         {v8.8b}, [x4], #8
        ld1         {v9.8b}, [x5], #8
        ld1         {v10.8b}, [x6], #8
        ld1         {v11.8b}, [x7], #8
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b

        /* acc = v9*c1 - v8*c0 + v10*c2 - v11*c3, with lanes 0-3 in v12 and
         * lanes 4-7 in v13.  The sequence starts with a positive term so the
         * accumulators can be initialised by a plain multiply.
         */
        umull       v12.4s, v9.4h, v3.h[1]
        umull2      v13.4s, v9.8h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlsl2      v13.4s, v8.8h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlal2      v13.4s, v10.8h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
        umlsl2      v13.4s, v11.8h, v3.h[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        sqshrn      \dstlo, v12.4s, #8 + (16 - VERTBITS)
        sqshrn2     \dsthi, v13.4s, #8 + (16 - VERTBITS)
.endm

/* vert4 -- as vert8, but only four bytes are read from each line, giving four
 * 16-bit results.
 *
 * With the default destination (v12.8h) the results are narrowed into the
 * upper half of v12; the lower half is left holding part of the 32-bit
 * accumulator and is not meaningful.  With any other destination the
 * results are narrowed into it using SQSHRN.  The selection is a
 * textual comparison (.ifc), so the default path is taken only when the
 * argument is spelled exactly "v12.8h".
 *
 * In:      x4-x7       pointers into the four lines; each is post-incremented
 *                      by 4 bytes
 *          v3.h[0..3]  vertical coefficients
 * Out:     \dst        four VERTBITS fixed-point results
 * Clobber: v8-v11, v12
 */
.macro vert4, dst=v12.8h
        /* Fetch four bytes from each line into lane 0 and widen to 16 bits;
         * only the low four halfwords are used below.
         */
        ld1         {v8.s}[0], [x4], #4
        ld1         {v9.s}[0], [x5], #4
        ld1         {v10.s}[0], [x6], #4
        ld1         {v11.s}[0], [x7], #4
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b

        /* acc = v9*c1 - v8*c0 + v10*c2 - v11*c3 */
        umull       v12.4s, v9.4h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]

        /* Same narrowing shift as vert8: 8 + 16 - VERTBITS. */
.ifc \dst,v12.8h
        sqshrn2     \dst, v12.4s, #8 + (16 - VERTBITS)
.else
        sqshrn      \dst, v12.4s, #8 + (16 - VERTBITS)
.endif
.endm


/* During horizontal resize having CHUNKSIZE input available means being able
 * to produce a varying amount of output, depending on the phase of the data.
 * This function calculates the minimum number of VECSIZE chunks extracted from
 * a CHUNKSIZE window (x1), and the threshold value for when the count will be
 * one higher than that (x0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 * (the code scales CHUNKSIZE by 1 << 16 -- presumably because xinc is a 16.16
 * fixed-point step; confirm against the caller).
 *
 * The two values are packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 */
/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc)
 *
 * In:      w0 = xinc
 * Out:     x0 = remainder + (quotient << 32)
 * Clobber: x1, x2
 */
ENTRY(rsdIntrinsicResize_oscctl_K)
        /* xinc is a 32-bit argument, so only w0 is defined on entry.  Copy it
         * through a W register to clear bits 63:32 before the 64-bit
         * arithmetic below; otherwise stale upper bits would corrupt the
         * divisor.
         */
        mov         w2, w0
        lsl         x2, x2, #VECSHIFT           // x2 = xinc * VECSIZE (divisor)
        mov         x0, #(CHUNKSIZE << 16) - 1
        add         x0, x0, x2                  // x0 = numerator
        udiv        x1, x0, x2                  // x1 = quotient
        msub        x0, x1, x2, x0              // x0 = numerator - quotient * divisor
        add         x0, x0, x1, LSL #32         // pack quotient above the remainder
        ret
END(rsdIntrinsicResize_oscctl_K)

1223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
1233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * For the most part the vertical pass (the outer loop) is the same for all
1243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * versions.  Exceptions are handled in-line with conditional assembly.
1253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */
1263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.irep comp, 1, 2, 4
1273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
1283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 0
1293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
1303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 1
1313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
1323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 2
1333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.else
1343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.error "Unknown component count"
1353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
1363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
1373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
1383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
1403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* void rsdIntrinsicResizeB1_K(
1423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t * restrict dst,          // x0
1433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             size_t count,                    // x1
1443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint32_t xf,                     // x2
1453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint32_t xinc,                   // x3
1463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t const * restrict srcn,   // x4
1473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t const * restrict src0,   // x5
1483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t const * restrict src1,   // x6
1493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint8_t const * restrict src2,   // x7
 *             size_t xclip,                    // [sp,#0]  -> [sp,#64] -> x13
1513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             size_t avail,                    // [sp,#8]  -> [sp,#72] -> x11
1523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             uint64_t osc_ctl,                // [sp,#16] -> [sp,#80] -> x10
1533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie *             int32 const *yr,                 // [sp,#24] -> [sp,#88] -> v4   (copied to v3   for scalar access)
1543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */
1553a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieENTRY(rsdIntrinsicResizeB\comp\()_K)
1563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x8, sp, #32
1573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         sp, sp, #64
1583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.1d - v11.1d}, [sp]
1593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v12.1d - v15.1d}, [x8]
1603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* align the working buffer on the stack to make it easy to use bit
1623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * twiddling for address calculations.
1633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
1643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x12, sp, #BUFFER_SIZE
1653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bic         x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
1663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldr         x8, [sp,#88]            // yr
1683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            adr         x9, 8f
1693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v4.4s}, [x8]
1703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v5.8h}, [x9]
1713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqxtun      v4.4h, v4.4s            // yr
1723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v6.8h, w2
1733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v7.8h, w3
1743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mla         v6.8h, v5.8h, v7.8h     // vxf
1753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            shl         v7.8h, v7.8h, #VECSHIFT // vxinc
1763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Compute starting condition for oscillator used to compute ahead
1783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * of time how many iterations are possible before needing to
1793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * refill the working buffer.  This is based on the fixed-point
1803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * index of the last element in the vector of pixels processed in
1813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * each iteration, counting up until it would overflow.
1823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
1833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x8, x2, x3
1843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsl         x9, x3, #VECSHIFT
1853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x8, x8, x9
1863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldr         x10, [sp,#80]           // osc_ctl
1883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ldp         x13,x11, [sp,#64]       // xclip, avail
1893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         x18, sp
1913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         sp, x12
1923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
1933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* x4-x7 contain pointers to the four lines of input to be
1943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * convolved.  These pointers have been clamped vertically and
1953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * horizontally (which is why it's not a simple row/stride pair),
1963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * and the xclip argument (now in x13) indicates how many pixels
1973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * from true the x position of the pointer is.  This value should
1983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * be 0, 1, or 2 only.
1993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
2003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Start by placing four pixels worth of input at the far end of
2013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the buffer.  As many as two of these may be clipped, so four
2023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * pixels are fetched, and then the first pixel is duplicated and
2033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the data shifted according to xclip.  The source pointers are
2043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * then also adjusted according to xclip so that subsequent fetches
2053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * match.
2063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         v3.8b, v4.8b  /* make y coeffs available for vert4 and vert8 macros */
2083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x14, x12, x13, LSL #COMPONENT_SHIFT + 1
2093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
2103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x14, x14, #4 * COMPONENT_COUNT * 2
2113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
2123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert4       v12.4h
2133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v11.4h, v12.h[0]
2143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v11.4h,v12.4h}, [x12]
2153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v12.4h}, [x14]
2163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v12.4h}, [x15]
2173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
2183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8
2193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v11.4s, v12.s[0]
2203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v11.8h,v12.8h}, [x12]
2213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v12.8h}, [x14]
2223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v12.8h}, [x15]
2233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
2243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8       v14.4h, v14.8h
2253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8       v15.4h, v15.8h
2263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v12.2d, v14.d[0]
2273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v13.2d, v14.d[0]
2283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v12.8h,v13.8h}, [x12], #32
2293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v14.8h,v15.8h}, [x12]
2303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x12, x12, #32
2313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v11.8h,v12.8h}, [x14]
2323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v11.8h,v12.8h}, [x15]
2333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
2343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Count off four pixels into the working buffer.
2353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x11, x11, #4
2373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
2383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * were read unconditionally, but some may have been discarded by
2393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * xclip, so we rewind the pointers to compensate.
2403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x4, x4, x13, LSL #COMPONENT_SHIFT
2423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x5, x5, x13, LSL #COMPONENT_SHIFT
2433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x6, x6, x13, LSL #COMPONENT_SHIFT
2443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x7, x7, x13, LSL #COMPONENT_SHIFT
2453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* First tap starts where we just pre-filled, at the end of the
2473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * buffer.
2483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, #(CHUNKSIZE * 2 - 4) << 16
2503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Use overflowing arithmetic to implement wraparound array
2523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * indexing.
2533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsl         x2, x2, #(47 - CHUNKSHIFT)
2553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsl         x3, x3, #(47 - CHUNKSHIFT)
2563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
2583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Start of outermost loop.
2593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
2603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * number of iterations of the inner loop that can be performed and
2613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * get into that.
2623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
2633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * The fill is complicated by the possibility of running out of
2643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * input before the scratch buffer is filled.  If this isn't a risk
2653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * then it's handled by the simple loop at 2:, otherwise the
2663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * horrible loop at 3:.
2673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          mov         v3.8b, v4.8b            /* put y scaling coefficients somewhere handy */
2693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            subs        x11, x11, #CHUNKSIZE
2703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bge         2f                      /* if at least CHUNKSIZE are available... */
2713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x11, x11, #CHUNKSIZE    /* if they're not... */
2723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           4f
2733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* ..just sneaking a literal in here after this unconditional branch.. */
2743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
2753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* basic fill loop, processing 8 bytes at a time until there are
2763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * fewer than eight bytes available.
2773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
2783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3:          vert8
2793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x11, x11, #8 / COMPONENT_COUNT
2803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v12.8h}, [x12], #16
2813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4:          cmp         x11, #8 / COMPONENT_COUNT - 1
2823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bgt         3b
2833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 4
2843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            blt         3f
2853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The last pixel (four bytes) if necessary */
2863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert4
2873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.else
2883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         x11, #1
2893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            blt         3f
2903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The last pixels if necessary */
2913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x4, x4, #8
2923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x5, x5, #8
2933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x6, x6, #8
2943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x7, x7, #8
2953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x4, x4, x11, LSL #COMPONENT_SHIFT
2963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x5, x5, x11, LSL #COMPONENT_SHIFT
2973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x6, x6, x11, LSL #COMPONENT_SHIFT
2983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x7, x7, x11, LSL #COMPONENT_SHIFT
2993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8
3003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x11, sp, x11, LSL #COMPONENT_SHIFT + 1
3013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         sp, sp, #32
3023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x11, x11, #16
3033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
3043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v13.8h, v12.h[7]
3053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
3063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v13.4s, v12.s[3]
3073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v12.8h,v13.8h}, [sp]
3093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v12.8h}, [x11]
3103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         sp, sp, #32
3113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           4f
3123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Keep filling until we get to the end of this chunk of the buffer */
3143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3:
3153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
3163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v12.8h, v12.h[7]
3173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
3183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v12.4s, v12.s[3]
3193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
3203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v12.2d, v12.d[1]
3213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4:          st1         {v12.8h}, [x12], #16
3233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
3243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bne         3b
3253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           4f
3263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.align 4
3283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie2:          /* Quickly pull a chunk of data into the working buffer.
3293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8
3313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v12.8h}, [x12], #16
3323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            vert8
3333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v12.8h}, [x12], #16
3343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
3353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bne         2b
3363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         x11, #0
3373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bne         3f
3383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4:          /* if we end with 0 pixels left we'll have nothing handy to spread
3393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * across to the right, so we rewind a bit.
3403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         x11, #1
3423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x4, x4, #COMPONENT_COUNT
3433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x5, x5, #COMPONENT_COUNT
3443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x6, x6, #COMPONENT_COUNT
3453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x7, x7, #COMPONENT_COUNT
3463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3:          /* copy four taps (width of cubic window) to far end for overflow
3473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * address handling
3483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
3503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            eor         x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
3513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
3523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v14.4h}, [x13]
3533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
3543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v14.8h}, [x13]
3553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
3563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v14.8h,v15.8h}, [x13]
3573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
3593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
3603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v14.4h}, [x13]
3613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
3623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v14.8h}, [x13]
3633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
3643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v14.8h,v15.8h}, [x13]
3653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
3663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The high 32 bits of x10 contain the maximum possible iteration
3673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * count, but if x8 is greater than the low 32 bits of x10 then
3683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * this indicates that the count must be reduced by one for this
3693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * iteration to avoid reading past the end of the available data.
3703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x13, x10, x8
3723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x13, x13, #32
3733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            madd        x8, x13, x9, x8
3753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x8, x8, #(CHUNKSIZE << 16)
3763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* prefer to count pixels, rather than vectors, to clarify the tail
3783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * store case on exit.
3793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
3803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsl         x13, x13, #VECSHIFT
3813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         x13, x1
3823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            csel        x13, x1, x13, gt
3833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         x1, x1, x13
3853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsl         x13, x13, #COMPONENT_SHIFT
3873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         w14, #0x8000
3893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            movi        v30.8h, #3
3903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            dup         v31.8h, w14
3913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         x13, #0
3933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bgt         3f
3943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            cmp         x1, #0
3953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
3963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           9f
3973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
3983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            .align 4
3993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie2:          /* Inner loop continues here, but starts at 3:, see end of loop
4003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * below for explanation. */
4013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if LOOP_OUTPUT_SIZE == 4
4023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.s}[0], [x0], #4
4033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 8
4043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.8b}, [x0], #8
4053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 16
4063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.16b}, [x0], #16
4073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 32
4083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.16b,v9.16b}, [x0], #32
4093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
4103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Inner loop:  here the four x coefficients for each tap are
4113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * calculated in vector code, and the addresses are calculated in
4123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * scalar code, and these calculations are interleaved.
4133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
4143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3:          ushr        v8.8h, v6.8h, #1            // sxf
4153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x14, x2, #(63 - CHUNKSHIFT)
4163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrdmulh    v9.8h, v8.8h, v8.8h         // sxf**2
4173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
4183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrdmulh    v10.8h, v9.8h, v8.8h        // sxf**3
4193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x15, x2, #(63 - CHUNKSHIFT)
4203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sshll       v11.4s, v9.4h, #2
4213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sshll2      v12.4s, v9.8h, #2
4223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
4233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v11.4s, v10.4h, v30.4h
4243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v12.4s, v10.8h, v30.8h
4253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x16, x2, #(63 - CHUNKSHIFT)
4263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            shadd       v0.8h, v10.8h, v8.8h
4283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
4293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         v0.8h, v9.8h, v0.8h
4303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x17, x2, #(63 - CHUNKSHIFT)
4313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            saddw       v1.4s, v11.4s, v9.4h
4333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            saddw2      v13.4s, v12.4s, v9.8h
4343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
4353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            shrn        v1.4h, v1.4s, #1
4363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            shrn2       v1.8h, v13.4s, #1
4373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
4383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sub         v1.8h, v1.8h, v31.8h
4393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
4403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            saddw       v2.4s, v11.4s, v8.4h
4423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            saddw2      v13.4s, v12.4s, v8.8h
4433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
4443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            shrn        v2.4h, v2.4s, #1
4453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            shrn2       v2.8h, v13.4s, #1
4463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
4473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            neg         v2.8h, v2.8h
4483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            shsub       v3.8h, v10.8h, v9.8h
4503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* increment the x fractional parts (overflow is ignored, as the
4523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * scalar arithmetic shadows this addition with full precision).
4533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
4543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         v6.8h, v6.8h, v7.8h
4553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
4563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* At this point we have four pointers in x14-x17, pointing to the
4573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * four taps in the scratch buffer that must be convolved together
4583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * to produce an output pixel (one output pixel per pointer).
4593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * These pointers usually overlap, but their spacing is irregular
4603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * so resolving the redundancy through L1 is a pragmatic solution.
4613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
4623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * The scratch buffer is made of signed 16-bit data, holding over
4633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * some extra precision, and overshoot, from the vertical pass.
4643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
4653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * We also have the 16-bit unsigned fixed-point weights for each
4663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * of the four taps in v0 - v3.  That's eight pixels worth of
4673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * coefficients when we have only four pointers, so calculations
4683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * for four more pixels are interleaved with the fetch and permute
4693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * code for each variant in the following code.
4703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
4713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * The data arrangement is less than ideal for any pixel format,
4723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * but permuting loads help to mitigate most of the problems.
4733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
4743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Note also that the two outside taps of a bicubic are negative,
4753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * but these coefficients are unsigned.  The sign is hard-coded by
4763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * use of multiply-and-subtract operations.
4773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
4783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1
4793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The uchar1 case.
4803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Issue one lanewise ld4.h to load four consecutive pixels from
4813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * one pointer (one pixel) into four different registers; then load
4823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * four consecutive s16 values from the next pointer (pixel) into
4833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the next lane of those four registers, etc., so that we finish
4843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * with v12 - v15 representing the four taps, and each lane
4853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * representing a separate pixel.
4863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             *
4873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * The first ld4 uses a splat to avoid any false dependency on
4883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the previous state of the register.
4893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
4903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4r        {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
4913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x14, x2, #(63 - CHUNKSHIFT)
4923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
4933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.h,v13.h,v14.h,v15.h}[1], [x15]
4943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
4953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x15, x2, #(63 - CHUNKSHIFT)
4963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
4973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.h,v13.h,v14.h,v15.h}[2], [x16]
4983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
4993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x16, x2, #(63 - CHUNKSHIFT)
5003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
5013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.h,v13.h,v14.h,v15.h}[3], [x17]
5023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
5033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x17, x2, #(63 - CHUNKSHIFT)
5043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
5053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.h,v13.h,v14.h,v15.h}[4], [x14]
5063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
5073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.h,v13.h,v14.h,v15.h}[5], [x15]
5083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.h,v13.h,v14.h,v15.h}[6], [x16]
5093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.h,v13.h,v14.h,v15.h}[7], [x17]
5103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v8.4s, v12.4h, v0.4h
5123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull2      v9.4s, v12.8h, v0.8h
5133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v8.4s, v13.4h, v1.4h
5143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v9.4s, v13.8h, v1.8h
5153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v8.4s, v14.4h, v2.4h
5163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v9.4s, v14.8h, v2.8h
5173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal       v8.4s, v15.4h, v3.4h
5183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v9.4s, v15.8h, v3.8h
5193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            subs        x13, x13, #LOOP_OUTPUT_SIZE
5213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn     v8.4h, v8.4s, #15
5233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn2    v8.8h, v9.4s, #15
5243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
5263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2
5273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The uchar2 case:
5283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * This time load pairs of values into adjacent lanes in v12 - v15
5293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * by aliasing them as u32 data; leaving room for only four pixels,
5303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * so the process has to be done twice.  This also means that the
5313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * coefficient registers fail to align with the coefficient data
5323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * (eight separate pixels), so that has to be doubled-up to match.
5333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
5343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
5353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x14, x2, #(63 - CHUNKSHIFT)
5363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
5373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
5383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
5393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x15, x2, #(63 - CHUNKSHIFT)
5403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
5413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
5423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
5433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x16, x2, #(63 - CHUNKSHIFT)
5443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
5453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
5463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
5473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x17, x2, #(63 - CHUNKSHIFT)
5483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
5493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* double-up coefficients to align with component pairs */
5513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            zip1        v16.8h, v0.8h, v0.8h
5523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
5533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            zip1        v17.8h, v1.8h, v1.8h
5543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            zip1        v18.8h, v2.8h, v2.8h
5553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            zip1        v19.8h, v3.8h, v3.8h
5563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v8.4s, v12.4h, v16.4h
5583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull2      v9.4s, v12.8h, v16.8h
5593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v8.4s, v13.4h, v17.4h
5603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v9.4s, v13.8h, v17.8h
5613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v8.4s, v14.4h, v18.4h
5623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v9.4s, v14.8h, v18.8h
5633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal       v8.4s, v15.4h, v19.4h
5643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v9.4s, v15.8h, v19.8h
5653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn     v8.4h, v8.4s, #15
5673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn2    v8.8h, v9.4s, #15
5683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
5703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
5713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
5723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
5733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* double-up coefficients to align with component pairs */
5753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            zip2        v16.8h, v0.8h, v0.8h
5763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            zip2        v17.8h, v1.8h, v1.8h
5773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            zip2        v18.8h, v2.8h, v2.8h
5783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            zip2        v19.8h, v3.8h, v3.8h
5793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v10.4s, v12.4h, v16.4h
5813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull2      v11.4s, v12.8h, v16.8h
5823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v10.4s, v13.4h, v17.4h
5833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v11.4s, v13.8h, v17.8h
5843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v10.4s, v14.4h, v18.4h
5853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v11.4s, v14.8h, v18.8h
5863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal       v10.4s, v15.4h, v19.4h
5873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v11.4s, v15.8h, v19.8h
5883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            subs        x13, x13, #LOOP_OUTPUT_SIZE
5903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn     v9.4h, v10.4s, #15
5923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn2    v9.8h, v11.4s, #15
5933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
5943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
5953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
5963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4
5973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The uchar4 case.
5983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * This case is comparatively painless because four s16s are the
5993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * smallest addressable unit for a vmul-by-scalar.  Rather than
6003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * permute the data, simply arrange the multiplies to suit the way
6013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the data comes in.  That's a lot of data, though, so things
6023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * progress in pairs of pixels at a time.
6033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
6043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v12.8h,v13.8h}, [x14]
6053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x14, x2, #(63 - CHUNKSHIFT)
6063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
6073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v14.8h,v15.8h}, [x15]
6083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
6093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x15, x2, #(63 - CHUNKSHIFT)
6103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
6113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v8.4s, v12.4h, v0.h[0]
6133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v9.4s, v14.4h, v0.h[1]
6143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v8.4s, v12.8h, v1.h[0]
6153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v9.4s, v14.8h, v1.h[1]
6163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v8.4s, v13.4h, v2.h[0]
6173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v9.4s, v15.4h, v2.h[1]
6183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v8.4s, v13.8h, v3.h[0]
6193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v9.4s, v15.8h, v3.h[1]
6203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* And two more...  */
6223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v12.8h,v13.8h}, [x16]
6233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
6243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x16, x2, #(63 - CHUNKSHIFT)
6253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
6263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v14.8h,v15.8h}, [x17]
6273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
6283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            lsr         x17, x2, #(63 - CHUNKSHIFT)
6293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x2, x2, x3
6303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn     v8.4h, v8.4s, #15
6323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
6333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn2    v8.8h, v9.4s, #15
6343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v10.4s, v12.4h, v0.h[2]
6363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v11.4s, v14.4h, v0.h[3]
6373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v10.4s, v12.8h, v1.h[2]
6383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v11.4s, v14.8h, v1.h[3]
6393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v10.4s, v13.4h, v2.h[2]
6403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v11.4s, v15.4h, v2.h[3]
6413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v10.4s, v13.8h, v3.h[2]
6423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v11.4s, v15.8h, v3.h[3]
6433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn     v9.4h, v10.4s, #15
6453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn2    v9.8h, v11.4s, #15
6463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
6483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
6493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* And two more...  */
6513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v12.8h,v13.8h}, [x14]
6523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v14.8h,v15.8h}, [x15]
6533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v10.4s, v12.4h, v0.h[4]
6553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v11.4s, v14.4h, v0.h[5]
6563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v10.4s, v12.8h, v1.h[4]
6573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v11.4s, v14.8h, v1.h[5]
6583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v10.4s, v13.4h, v2.h[4]
6593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v11.4s, v15.4h, v2.h[5]
6603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v10.4s, v13.8h, v3.h[4]
6613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v11.4s, v15.8h, v3.h[5]
6623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* And two more...  */
6643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v12.8h,v13.8h}, [x16]
6653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v14.8h,v15.8h}, [x17]
6663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            subs        x13, x13, #LOOP_OUTPUT_SIZE
6683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn     v9.4h, v10.4s, #15
6703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn2    v9.8h, v11.4s, #15
6713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v10.4s, v12.4h, v0.h[6]
6733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smull       v11.4s, v14.4h, v0.h[7]
6743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v10.4s, v12.8h, v1.h[6]
6753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl2      v11.4s, v14.8h, v1.h[7]
6763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v10.4s, v13.4h, v2.h[6]
6773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlsl       v11.4s, v15.4h, v2.h[7]
6783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v10.4s, v13.8h, v3.h[6]
6793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            smlal2      v11.4s, v15.8h, v3.h[7]
6803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn     v10.4h, v10.4s, #15
6823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrn2    v10.8h, v11.4s, #15
6833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
6843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrun     v9.8b, v9.8h, #VERTBITS - 8
6853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            sqrshrun2    v9.16b, v10.8h, #VERTBITS - 8
6863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
6873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            bgt         2b      /* continue inner loop */
6883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* The inner loop has already been limited to ensure that none of
6893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * the earlier iterations could overfill the output, so the store
6903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * appears within the loop but after the conditional branch (at the
6913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * top).  At the end, provided it won't overfill, perform the final
6923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * store here.  If it would, then break out to the tricky tail case
6933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * instead.
6943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
6953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            blt         1f
6963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Store the amount of data appropriate to the configuration of the
6973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * instance being assembled.
6983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
6993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if LOOP_OUTPUT_SIZE == 4
7003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.s}[0], [x0], #4
7013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 8
7023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.8b}, [x0], #8
7033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 16
7043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.16b}, [x0], #16
7053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 32
7063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.16b,v9.16b}, [x0], #32
7073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            b           1b              /* resume outer loop */
7093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            /* Partial tail store case:
7103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * Different versions of the code need different subsets of the
7113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * following partial stores.  Here the number of components and the
7123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * size of the chunk of data produced by each inner loop iteration
7133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             * is tested to figure out whether or not each phrase is relevant.
7143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie             */
7153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
7163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         x13, #16
7173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.16b}, [x0], #16
7193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            mov         v8.16b, v9.16b
7203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
7223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         x13, #8
7233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.8b}, [x0], #8
7253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ext         v8.16b, v8.16b, v8.16b, #8
7263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
7283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         x13, #4
7293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.s}[0], [x0], #4
7313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ext         v8.8b, v8.8b, v8.8b, #4
7323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
7343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         x13, #2
7353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.h}[0], [x0], #2
7373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ext         v8.8b, v8.8b, v8.8b, #2
7383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
7403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:          tst         x13, #1
7413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            beq         1f
7423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            st1         {v8.b}[0], [x0], #1
7433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif
7443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1:
7453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie9:          mov         sp, x18
7463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v8.1d - v11.1d}, [sp], #32
7473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ld1         {v12.1d - v15.1d}, [sp], #32
7483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie            ret
7493a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieEND(rsdIntrinsicResizeB\comp\()_K)
7503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endr
7513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie
752