/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ENTRY(f) switches to .text, applies `.align 4`, declares f as a global
 * function symbol, defines the label and opens an unwind region (.fnstart).
 * END(f) closes the unwind region (.fnend) and records the symbol size.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot). It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set VECSHIFT, 3 403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set VECSIZE, (1<<VECSHIFT) 413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* Read four different lines (except at edges where addresses may be clamped, 433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * which is why we don't simply take base and stride registers), and multiply 443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * and accumulate them by the coefficients in d6[0..3], leaving the results in 453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * q12. This gives eight 16-bit results representing a horizontal line of 2-8 463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * input pixels (depending on number of components per pixel) to be fed into 473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the horizontal scaling pass. 483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are 503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * known to represent negative values and VMLS is used to implement this). 513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Output is VERTBITS signed fixed-point, which must leave room for a little 523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * bit of overshoot beyond [0,1.0). 533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.macro vert8, dstlo=d24, dsthi=d25 553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.u8 d16, [r4]! 563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.u8 d18, [r5]! 573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.u8 d20, [r6]! 583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.u8 d22, [r7]! 
593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmovl.u8 q8, d16 603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmovl.u8 q9, d18 613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmovl.u8 q10, d20 623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmovl.u8 q11, d22 633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.u16 q12, d18, d6[1] 643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.u16 q13, d19, d6[1] 653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.u16 q12, d16, d6[0] 663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.u16 q13, d17, d6[0] 673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.u16 q12, d20, d6[2] 683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.u16 q13, d21, d6[2] 693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.u16 q12, d22, d6[3] 703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.u16 q13, d23, d6[3] 713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies), 733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * minus VERTBITS (the number of fraction bits we want to keep from 743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * here on). 753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqshrn.s32 \dstlo, q12, #8 + 16 - VERTBITS 773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqshrn.s32 \dsthi, q13, #8 + 16 - VERTBITS 783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endm 793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* As above, but only four 16-bit results into d25. 813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.macro vert4 833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.u32 d16[0], [r4]! 
843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.u32 d18[0], [r5]! 853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.u32 d20[0], [r6]! 863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.u32 d22[0], [r7]! 873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmovl.u8 q8, d16 883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmovl.u8 q9, d18 893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmovl.u8 q10, d20 903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmovl.u8 q11, d22 913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.u16 q12, d18, d6[1] 923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.u16 q12, d16, d6[0] 933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.u16 q12, d20, d6[2] 943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.u16 q12, d22, d6[3] 953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqshrn.s32 d25, q12, #8 + 16 - VERTBITS 963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endm 973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* During horizontal resize having CHUNKSIZE input available means being able 1003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * to produce a varying amount of output, depending on the phase of the data. 1013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * This function calculates the minimum number of VECSIZE chunks extracted from 1023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * a CHUNKSIZE window (r1), and the threshold value for when the count will be 1033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * one higher than that (r0). 
1043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * These work out, conveniently, to be the quotient and remainder from: 1053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE) 1063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 1073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The two values can be packed together in a uint64_t for convenience; and 1083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * they are, in fact, used this way as an arithmetic short-cut later on. 1093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 1103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */ 1123a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieENTRY(rsdIntrinsicResize_oscctl_K) 1133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsl r2, r0, #VECSHIFT 1143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie movw r0, #:lower16:(CHUNKSIZE << 16) - 1 1153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie movt r0, #:upper16:(CHUNKSIZE << 16) - 1 1163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r0, r0, r2 1173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie#if defined(ARCH_ARM_USE_UDIV) 1183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie udiv r1, r0, r2 1193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mls r0, r1, r2, r0 1203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie#else 1213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie clz r3, r2 1223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie clz r1, r0 1233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs r3, r3, r1 1243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie movlt r3, #0 1253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r1, #1 1263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsl r2, r2, r3 1273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsl r3, r1, r3 
1283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r1, #0 1293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: cmp r2, r0 1303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie addls r1, r3 1313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subls r0, r2 1323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsrs r3, r3, #1 1333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr r2, r2, #1 1343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bne 1b 1353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie#endif 1363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bx lr 1373a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieEND(rsdIntrinsicResize_oscctl_K) 1383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code. 1403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * For the most part the vertical pass (the outer loop) is the same for all 1413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * versions. Exceptions are handled in-line with conditional assembly. 
1423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 143e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh.irp comp, 1, 2, 4 1443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 1453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 0 1463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 1473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 1 1483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 1493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 2 1503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.else 1513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.error "Unknown component count" 1523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 1533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT) 1543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT) 1553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2 1573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set OSC_STORE, (BUFFER_SIZE + 0) 1583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set OSCSTEP_STORE, (BUFFER_SIZE + 4) 1593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set OSCCTL_STORE, (BUFFER_SIZE + 8) 1603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set AVAIL_STORE, (BUFFER_SIZE + 16) 1613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set SP_STORE, (BUFFER_SIZE + 24) /* should be +20, but rounded up to make a legal constant somewhere */ 1623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* void rsdIntrinsicResizeB\comp\()_K( 1643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t * restrict dst, // r0 1653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * size_t count, // r1 
1663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint32_t xf, // r2 1673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint32_t xinc, // r3 1683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t const * restrict srcn, // [sp] -> [sp,#104] -> r4 1693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t const * restrict src0, // [sp,#4] -> [sp,#108] -> r5 1703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t const * restrict src1, // [sp,#8] -> [sp,#112] -> r6 1713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t const * restrict src2, // [sp,#12] -> [sp,#116] -> r7 1723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * size_t xclip, // [sp,#16] -> [sp,#120] 1733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * size_t avail, // [sp,#20] -> [sp,#124] -> lr 1743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint64_t osc_ctl, // [sp,#24] -> [sp,#128] 1753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * int32_t const *yr); // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access) 1763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 1773a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieENTRY(rsdIntrinsicResizeB\comp\()_K) 1783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} 1793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vpush {d8-d15} 1803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* align the working buffer on the stack to make it easy to use bit 1823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * twiddling for address calculations and bounds tests. 
1833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 1843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r12, sp, #BUFFER_SIZE + 32 1853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov lr, sp 1863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bfc r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1 1873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov sp, r12 1883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie str lr, [sp,#SP_STORE] 1893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldr r8, [lr,#136] // yr 1913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie adr r9, 8f 1923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s32 {q4}, [r8] 1933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q5}, [r9] 1943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqmovun.s32 d8, q4 // yr 1953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vdup.s16 q6, r2 1963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vdup.s16 q7, r3 1973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmla.s16 q6, q5, q7 // vxf 1983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vshl.s16 q7, q7, #VECSHIFT // vxinc 1993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldrd r4,r5, [lr,#104] // srcn, src0 2013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldrd r6,r7, [lr,#112] // src1, src2 2023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Compute starting condition for oscillator used to compute ahead 2043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * of time how many iterations are possible before needing to 2053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * refill the working buffer. 
This is based on the fixed-point 2063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * index of the last element in the vector of pixels processed in 2073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * each iteration, counting up until it would overflow. 2083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r8, r2, r3 2103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r9, r3, LSL #VECSHIFT 2113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r8, r8, r9 2123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldrd r10,r11, [lr,#128] // osc_ctl 2143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie str r8, [sp,#OSC_STORE] 2163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie str r9, [sp,#OSCSTEP_STORE] 2173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie str r10, [sp,#OSCCTL_STORE] 2183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie str r11, [sp,#OSCCTL_STORE+4] 2193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldrd r10,r11, [lr,#120] // xclip,avail 2203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* r4-r7 contain pointers to the four lines of input to be 2233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * convolved. These pointers have been clamped vertically and 2243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * horizontally (which is why it's not a simple row/stride pair), 2253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * and the xclip argument (now in r10) indicates how many pixels 2263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * from true the x position of the pointer is. This value should 2273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * be 0, 1, or 2 only. 
2283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 2293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Start by placing four pixels worth of input at the far end of 2303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the buffer. As many as two of these may be clipped, so four 2313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * pixels are fetched, and then the first pixel is duplicated and 2323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the data shifted according to xclip. The source pointers are 2333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * then also adjusted according to xclip so that subsequent fetches 2343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * match. 2353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d6, d8 /* make y coeffs available for vert4 and vert8 macros */ 2373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r8, r12, r10, LSL #COMPONENT_SHIFT + 1 2393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2 2403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r8, r8, #4 * COMPONENT_COUNT * 2 2413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 2423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert4 2433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vdup.s16 d24, d25[0] 2443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q12}, [r12] 2453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {d24}, [r8] 2463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {d24}, [r9] 2473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 2483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 2493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vdup.u32 q11, d24[0] 2503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q11,q12}, [r12] 
2513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q12}, [r8] 2523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q12}, [r9] 2533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 2543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 d28, d29 2553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 d30, d31 2563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov.u64 d24, d28 2573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov.u64 d25, d28 2583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov.u64 d26, d28 2593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov.u64 d27, d28 2603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q12,q13}, [r12]! 2613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q14,q15}, [r12] 2623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r12, r12, #32 2633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q11,q12}, [r8] 2643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q11,q12}, [r9] 2653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 2663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Count off four pixels into the working buffer, and move count to 2673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * its new home. 2683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub lr, r11, #4 2703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Incoming pointers were to the first _legal_ pixel. Four pixels 2713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * were read unconditionally, but some may have been discarded by 2723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * xclip, so we rewind the pointers to compensate. 
2733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r4, r4, r10, LSL #COMPONENT_SHIFT 2753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r5, r5, r10, LSL #COMPONENT_SHIFT 2763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r6, r6, r10, LSL #COMPONENT_SHIFT 2773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r7, r7, r10, LSL #COMPONENT_SHIFT 2783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* First tap starts where we just pre-filled, at the end of the 2803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * buffer. 2813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, #(CHUNKSIZE * 2 - 4) << 16 2833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Use overflowing arithmetic to implement wraparound array 2853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * indexing. 2863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r2, r2, LSL #(15 - CHUNKSHIFT) 2883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r3, r3, LSL #(15 - CHUNKSHIFT) 2893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie str lr, [sp,#AVAIL_STORE] 2913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Start of outermost loop. 2933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the 2943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * number of iterations of the inner loop that can be performed and 2953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * get into that. 
2963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 2973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The fill is complicated by the possibility of running out of 2983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * input before the scratch buffer is filled. If this isn't a risk 2993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * then it's handled by the simple loop at 2:, otherwise the 3003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * horrible loop at 3:. 3013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: ldr lr, [sp,#AVAIL_STORE] /* get number of pixels available */ 3033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d6, d8 /* put y scaling coefficients somewhere handy */ 3043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs lr, #CHUNKSIZE 3053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bge 2f /* if at least CHUNKSIZE are available... */ 3063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add lr, #CHUNKSIZE /* if they're not... */ 3073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 4f 3083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* ..just sneaking a literal in here after this unconditional branch.. */ 3093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie8: .hword 0, 1, 2, 3, 4, 5, 6, 7 3103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* basic fill loop, processing 8 bytes at a time until there are 3113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * fewer than eight bytes available. 3123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3: vert8 3143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub lr, lr, #8 / COMPONENT_COUNT 3153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q12}, [r12]! 
3163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4: cmp lr, #8 / COMPONENT_COUNT - 1 3173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bgt 3b 3183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 4 3193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie blt 3f 3203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The last pixel (four bytes) if necessary */ 3213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert4 3223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.else 3233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp lr, #1 3243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie blt 3f 3253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The last pixels if necessary */ 3263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r4, r4, #8 3273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r5, r5, #8 3283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r6, r6, #8 3293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r7, r7, #8 3303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r4, r4, lr, LSL #COMPONENT_SHIFT 3313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r5, r5, lr, LSL #COMPONENT_SHIFT 3323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r6, r6, lr, LSL #COMPONENT_SHIFT 3333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r7, r7, lr, LSL #COMPONENT_SHIFT 3343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 3353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub lr, sp, lr, LSL #COMPONENT_SHIFT + 1 3363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub sp, sp, #32 3373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub lr, lr, #16 3383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 3393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vdup.s16 q13, d25[3] 3403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 3413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vdup.u32 q13, d25[1] 
3423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 3433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q12,q13}, [sp] 3443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q12}, [lr] 3453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add sp, sp, #32 3463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 4f 3473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 3483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Keep filling until we get to the end of this chunk of the buffer */ 3493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3: 3503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 3513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vdup.s16 q12, d25[3] 3523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 3533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vdup.u32 q12, d25[1] 3543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 3553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov.u64 d24, d25 3563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 3573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4: vst1.s16 {q12}, [r12]! 3583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie tst r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 3593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bne 3b 3603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 4f 3613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.align 4 3633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie2: /* Quickly pull a chunk of data into the working buffer. 3643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 3663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q12}, [r12]! 3673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 3683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q12}, [r12]! 
3693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie tst r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 3703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bne 2b 3713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp lr, #0 3723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bne 3f 3733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4: /* if we end with 0 pixels left we'll have nothing handy to spread 3743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * across to the right, so we rewind a bit. 3753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov lr, #1 3773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r4, r4, #COMPONENT_COUNT 3783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r5, r5, #COMPONENT_COUNT 3793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r6, r6, #COMPONENT_COUNT 3803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r7, r7, #COMPONENT_COUNT 3813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3: str lr, [sp,#AVAIL_STORE] /* done with available pixel count */ 3823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add lr, sp, #OSC_STORE 3833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldrd r8,r9, [lr,#0] /* need osc, osc_step soon */ 3843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldrd r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */ 3853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* copy four taps (width of cubic window) to far end for overflow 3873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * address handling 3883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2 3903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie eor r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2 3913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 
3923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {d28}, [lr] 3933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 3943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q14}, [lr] 3953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 3963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q14,q15}, [lr] 3973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 3983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2 3993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 4003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {d28}, [lr] 4013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 4023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q14}, [lr] 4033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 4043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.s16 {q14,q15}, [lr] 4053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 4063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* r11 contains the maximum possible iteration count, but if r8 is 4073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * greater than r10 then this indicates that the count must be 4083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * reduced by one for this iteration to avoid reading past the end 4093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * of the available data. 
4103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 4113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp r10, r8 4123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sbc lr, r11, #0 4133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mla r8, lr, r9, r8 4153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r8, r8, #(CHUNKSIZE << 16) 4163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie str r8, [sp,#OSC_STORE] /* done with osc */ 4183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* prefer to count pixels, rather than vectors, to clarify the tail 4203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * store case on exit. 4213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 4223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov lr, lr, LSL #VECSHIFT 4233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp lr, r1 4243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie movgt lr, r1 4253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub r1, r1, lr 4273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov lr, lr, LSL #COMPONENT_SHIFT 4293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov.i16 d10, #3 4313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov.i16 d11, #0x8000 4323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp lr, #0 4343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bgt 3f 4353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp r1, #0 4363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bgt 1b /* an extreme case where we shouldn't use code in this structure */ 4373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 
b 9f 4383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie .align 4 4403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie2: /* Inner loop continues here, but starts at 3:, see end of loop 4413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * below for explanation. */ 4423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if LOOP_OUTPUT_SIZE == 4 4433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u32 {d16[0]}, [r0]! 4443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 8 4453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {d16}, [r0]! 4463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 16 4473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {q8}, [r0]! 4483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 32 4493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {q8,q9}, [r0]! 4503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 4513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Inner loop: here the four x coefficients for each tap are 4523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * calculated in vector code, and the addresses are calculated in 4533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * scalar code, and these calculations are interleaved. 
4543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 4553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3: vshr.u16 q8, q6, #1 4563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r8, r2, LSR #(31 - CHUNKSHIFT) 4573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrdmulh.s16 q9, q8, q8 4583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 4593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrdmulh.s16 q10, q9, q8 4603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r9, r2, LSR #(31 - CHUNKSHIFT) 4613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vshll.s16 q11, d18, #2 4623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vshll.s16 q12, d19, #2 4633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 4643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d20, d10 4653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q12, d21, d10 4663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r10, r2, LSR #(31 - CHUNKSHIFT) 4673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vhadd.s16 q0, q10, q8 4693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 4703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vsub.s16 q0, q9, q0 4713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r11, r2, LSR #(31 - CHUNKSHIFT) 4723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vaddw.s16 q1, q11, d18 4743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vaddw.s16 q13, q12, d19 4753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 4763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vshrn.s32 d2, q1, #1 4773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vshrn.s32 d3, q13, #1 4783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1) 4793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vsub.s16 d2, d2, d11 
4803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vsub.s16 d3, d3, d11 // TODO: find a wider d11 and use q-reg operation 4813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1) 4823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vaddw.s16 q2, q11, d16 4843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vaddw.s16 q13, q12, d17 4853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1) 4863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vshrn.s32 d4, q2, #1 4873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vshrn.s32 d5, q13, #1 4883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1) 4893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vneg.s16 q2, q2 4903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vhsub.s16 q3, q10, q9 4923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* increment the x fractional parts (overflow is ignored, as the 4943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * scalar arithmetic shadows this addition with full precision). 4953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 4963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vadd.s16 q6, q6, q7 4973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* At this point we have four pointers in r8-r11, pointing to the 4993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * four taps in the scratch buffer that must be convolved together 5003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * to produce an output pixel (one output pixel per pointer). 
5013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * These pointers usually overlap, but their spacing is irregular 5023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * so resolving the redundancy through L1 is a pragmatic solution. 5033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 5043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The scratch buffer is made of signed 16-bit data, holding over 5053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * some extra precision, and overshoot, from the vertical pass. 5063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 5073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * We also have the 16-bit unsigned fixed-point weights for each 5083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * of the four taps in q0 - q3. That's eight pixels worth of 5093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * coefficients when we have only four pointers, so calculations 5103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * for four more pixels are interleaved with the fetch and permute 5113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * code for each variant in the following code. 5123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 5133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The data arrangement is less than ideal for any pixel format, 5143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * but permuting loads help to mitigate most of the problems. 5153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 5163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Note also that the two outside taps of a bicubic are negative, 5173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * but these coefficients are unsigned. The sign is hard-coded by 5183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * use of multiply-and-subtract operations. 
5193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 5203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 5213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The uchar 1 case. 5223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Issue one lanewise vld4.s16 to load four consecutive pixels from 5233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * one pointer (one pixel) into four different registers; then load 5243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * four consecutive s16 values from the next pointer (pixel) into 5253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the next lane of those four registers, etc., so that we finish 5263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * with q12 - q15 representing the four taps, and each lane 5273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * representing a separate pixel. 5283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 5293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The first vld4 uses a splat to avoid any false dependency on 5303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the previous state of the register. 
5313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 5323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.s16 {d24[],d26[],d28[],d30[]}, [r8] 5333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r8, r2, LSR #(31 - CHUNKSHIFT) 5343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 5353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.s16 {d24[1],d26[1],d28[1],d30[1]}, [r9] 5363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1) 5373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r9, r2, LSR #(31 - CHUNKSHIFT) 5383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 5393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.s16 {d24[2],d26[2],d28[2],d30[2]}, [r10] 5403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1) 5413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r10, r2, LSR #(31 - CHUNKSHIFT) 5423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 5433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.s16 {d24[3],d26[3],d28[3],d30[3]}, [r11] 5443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1) 5453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r11, r2, LSR #(31 - CHUNKSHIFT) 5463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 5473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.s16 {d25[],d27[],d29[],d31[]}, [r8] 5483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1) 5493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.s16 {d25[1],d27[1],d29[1],d31[1]}, [r9] 5503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.s16 {d25[2],d27[2],d29[2],d31[2]}, [r10] 5513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.s16 {d25[3],d27[3],d29[3],d31[3]}, [r11] 5523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5533a98366c7f53b20f8550ffac82732d60ece794b4Simon 
Hosie vmull.s16 q8, d24, d0 5543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q9, d25, d1 5553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q8, d26, d2 5563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q9, d27, d3 5573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q8, d28, d4 5583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q9, d29, d5 5593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q8, d30, d6 5603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q9, d31, d7 5613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs lr, lr, #LOOP_OUTPUT_SIZE 5633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d16, q8, #15 5653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d17, q9, #15 5663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrun.s16 d16, q8, #VERTBITS - 8 5683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 5693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The uchar2 case: 5703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * This time load pairs of values into adjacent lanes in q12 - q15 5713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * by aliasing them as u32 data; leaving room for only four pixels, 5723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * so the process has to be done twice. This also means that the 5733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * coefficient registers fail to align with the coefficient data 5743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * (eight separate pixels), so that has to be doubled-up to match. 
5753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 5763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.u32 {d24[],d26[],d28[],d30[]}, [r8] 5773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r8, r2, LSR #(31 - CHUNKSHIFT) 5783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 5793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.u32 {d24[1],d26[1],d28[1],d30[1]}, [r9] 5803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1) 5813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r9, r2, LSR #(31 - CHUNKSHIFT) 5823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 5833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.u32 {d25[],d27[],d29[],d31[]}, [r10] 5843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1) 5853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r10, r2, LSR #(31 - CHUNKSHIFT) 5863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 5873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.u32 {d25[1],d27[1],d29[1],d31[1]}, [r11] 5883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1) 5893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r11, r2, LSR #(31 - CHUNKSHIFT) 5903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 5913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* double-up coefficients to align with component pairs */ 5933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d20, d0 5943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1) 5953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d21, d2 5963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d22, d4 5973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d23, d6 5983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vzip.s16 d0, 
d20 5993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vzip.s16 d2, d21 6003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vzip.s16 d4, d22 6013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vzip.s16 d6, d23 6023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q8, d24, d0 6043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q9, d25, d20 6053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q8, d26, d2 6063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q9, d27, d21 6073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q8, d28, d4 6083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q9, d29, d22 6093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q8, d30, d6 6103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q9, d31, d23 6113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d16, q8, #15 6133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d17, q9, #15 6143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.u32 {d24[],d26[],d28[],d30[]}, [r8] 6163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.u32 {d24[1],d26[1],d28[1],d30[1]}, [r9] 6173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.u32 {d25[],d27[],d29[],d31[]}, [r10] 6183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld4.u32 {d25[1],d27[1],d29[1],d31[1]}, [r11] 6193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* double-up coefficients to align with component pairs */ 6213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d0, d1 6223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d2, d3 6233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d4, d5 6243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov d6, d7 
6253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vzip.s16 d0, d1 6263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vzip.s16 d2, d3 6273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vzip.s16 d4, d5 6283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vzip.s16 d6, d7 6293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q10, d24, d0 6313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q11, d25, d1 6323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q10, d26, d2 6333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d27, d3 6343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q10, d28, d4 6353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d29, d5 6363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q10, d30, d6 6373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q11, d31, d7 6383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs lr, lr, #LOOP_OUTPUT_SIZE 6403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d18, q10, #15 6423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d19, q11, #15 6433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrun.s16 d16, q8, #VERTBITS - 8 6453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrun.s16 d17, q9, #VERTBITS - 8 6463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 6473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The uchar4 case. 6483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * This case is comparatively painless because four s16s are the 6493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * smallest addressable unit for a vmul-by-scalar. 
Rather than 6503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * permute the data, simply arrange the multiplies to suit the way 6513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the data comes in. That's a lot of data, though, so things 6523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * progress in pairs of pixels at a time. 6533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 6543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q12,q13}, [r8] 6553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r8, r2, LSR #(31 - CHUNKSHIFT) 6563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 6573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q14,q15}, [r9] 6583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1) 6593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r9, r2, LSR #(31 - CHUNKSHIFT) 6603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 6613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q8, d24, d0[0] 6633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q9, d28, d0[1] 6643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q8, d25, d2[0] 6653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q9, d29, d2[1] 6663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q8, d26, d4[0] 6673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q9, d30, d4[1] 6683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q8, d27, d6[0] 6693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q9, d31, d6[1] 6703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* And two more... 
*/ 6723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q12,q13}, [r10] 6733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1) 6743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r10, r2, LSR #(31 - CHUNKSHIFT) 6753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 6763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q14,q15}, [r11] 6773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1) 6783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov r11, r2, LSR #(31 - CHUNKSHIFT) 6793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r2, r2, r3 6803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d16, q8, #15 6823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1) 6833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d17, q9, #15 6843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q10, d24, d0[2] 6863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q11, d28, d0[3] 6873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q10, d25, d2[2] 6883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d29, d2[3] 6893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q10, d26, d4[2] 6903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d30, d4[3] 6913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q10, d27, d6[2] 6923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q11, d31, d6[3] 6933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d18, q10, #15 6953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d19, q11, #15 6963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 
6973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrun.s16 d16, q8, #VERTBITS - 8 6983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrun.s16 d17, q9, #VERTBITS - 8 6993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 7003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* And two more... */ 7013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q12,q13}, [r8] 7023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q14,q15}, [r9] 7033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 7043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q10, d24, d1[0] 7053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q11, d28, d1[1] 7063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q10, d25, d3[0] 7073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d29, d3[1] 7083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q10, d26, d5[0] 7093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d30, d5[1] 7103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q10, d27, d7[0] 7113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q11, d31, d7[1] 7123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 7133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* And two more... 
*/ 7143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q12,q13}, [r10] 7153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vld1.s16 {q14,q15}, [r11] 7163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 7173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs lr, lr, #LOOP_OUTPUT_SIZE 7183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 7193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d18, q10, #15 7203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d19, q11, #15 7213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 7223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q10, d24, d1[2] 7233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmull.s16 q11, d28, d1[3] 7243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q10, d25, d3[2] 7253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d29, d3[3] 7263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q10, d26, d5[2] 7273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlsl.s16 q11, d30, d5[3] 7283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q10, d27, d7[2] 7293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmlal.s16 q11, d31, d7[3] 7303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 7313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d20, q10, #15 7323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrn.s32 d21, q11, #15 7333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 7343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrun.s16 d18, q9, #VERTBITS - 8 7353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vqrshrun.s16 d19, q10, #VERTBITS - 8 7363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bgt 2b /* continue inner loop */ 7383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The inner loop has already been limited to ensure that none of 7393a98366c7f53b20f8550ffac82732d60ece794b4Simon 
Hosie * the earlier iterations could overfill the output, so the store 7403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * appears within the loop but after the conditional branch (at the 7413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * top). At the end, provided it won't overfill, perform the final 7423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * store here. If it would, then break out to the tricky tail case 7433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * instead. 7443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 7453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie blt 1f 7463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Store the amount of data appropriate to the configuration of the 7473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * instance being assembled. 7483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 7493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if LOOP_OUTPUT_SIZE == 4 7503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u32 {d16[0]}, [r0]! 7513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 8 7523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {d16}, [r0]! 7533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 16 7543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {q8}, [r0]! 7553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 32 7563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {q8,q9}, [r0]! 7573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 1b /* resume outer loop */ 7593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Partial tail store case: 7603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Different versions of the code need different subsets of the 7613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * following partial stores. 
Here the number of components and the 7623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * size of the chunk of data produced by each inner loop iteration 7633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * is tested to figure out whether or not each phrase is relevant. 7643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 7653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16 7663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst lr, #16 7673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {q8}, [r0]! 7693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov q8, q9 7703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8 7723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst lr, #8 7733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {d16}, [r0]! 7753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vmov.u8 d16, d17 7763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4 7783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst lr, #4 7793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u32 {d16[0]}, [r0]! 7813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vext.u32 d16, d16, d16, #1 7823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2 7843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst lr, #2 7853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u16 {d16[0]}, [r0]! 
7873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vext.u16 d16, d16, d16, #1 7883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1 7903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst lr, #1 7913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vst1.u8 {d16[0]}, [r0]! 7933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: 7953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie9: ldr sp, [sp,#SP_STORE] 7963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vpop {d8-d15} 7973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} 7983a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieEND(rsdIntrinsicResizeB\comp\()_K) 7993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endr 800