/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ENTRY(f) switches to .text, aligns, and opens the global function symbol f;
 * END(f) records the size of f.  Both are C preprocessor macros, so this file
 * must be preprocessed before it is assembled.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot).  It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7       /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)

/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in v3.h[0..3], leaving the results
 * in v12 (or in the register named by the macro arguments).  This gives eight
 * 16-bit results representing a horizontal line of 2-8 input pixels
 * (depending on number of components per pixel) to be fed into the horizontal
 * scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values, and UMLSL/UMLSL2 are used to implement
 * this).  Output is VERTBITS signed fixed-point, which must leave room for a
 * little overshoot beyond [0,1.0).
 *
 * Arguments:
 *   dstlo, dsthi -- the .4h and .8h views of the destination register.
 * Side effects:
 *   x4-x7 are each post-incremented by 8 bytes.
 *   v8-v11 are overwritten with the widened input rows.
 *   v12 and v13 always hold the 32-bit accumulators, so they are clobbered
 *   even when a different destination register is named.
 */
.macro vert8, dstlo=v12.4h, dsthi=v12.8h
        ld1         {v8.8b}, [x4], #8
        ld1         {v9.8b}, [x5], #8
        ld1         {v10.8b}, [x6], #8
        ld1         {v11.8b}, [x7], #8
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        umull       v12.4s, v9.4h, v3.h[1]
        umull2      v13.4s, v9.8h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlsl2      v13.4s, v8.8h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlal2      v13.4s, v10.8h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
        umlsl2      v13.4s, v11.8h, v3.h[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        sqshrn      \dstlo, v12.4s, #8 + (16 - VERTBITS)
        sqshrn2     \dsthi, v13.4s, #8 + (16 - VERTBITS)
.endm

/* As above, but only four 16-bit results.
 *
 * Arguments:
 *   dst -- destination view.  With the default (v12.8h) the results are
 *          narrowed into the high half of v12 with SQSHRN2; any other
 *          argument is used as the destination of a plain SQSHRN and so
 *          should be a .4h view.
 * Side effects:
 *   x4-x7 are each post-incremented by 4 bytes.
 *   v8-v11 are overwritten; v12 holds the 32-bit accumulator.
 */
.macro vert4, dst=v12.8h
        ld1         {v8.s}[0], [x4], #4
        ld1         {v9.s}[0], [x5], #4
        ld1         {v10.s}[0], [x6], #4
        ld1         {v11.s}[0], [x7], #4
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        umull       v12.4s, v9.4h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
.ifc \dst,v12.8h
        sqshrn2     \dst, v12.4s, #8 + (16 - VERTBITS)
.else
        sqshrn      \dst, v12.4s, #8 + (16 - VERTBITS)
.endif
.endm


/* During horizontal resize having CHUNKSIZE input available means being able
 * to produce a varying amount of output, depending on the phase of the data.
 * This function calculates the minimum number of VECSIZE chunks extracted from
 * a CHUNKSIZE window (x1), and the threshold value for when the count will be
 * one higher than that (x0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      ((CHUNKSIZE << 16) + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 * (CHUNKSIZE is scaled by 1 << 16, presumably to match a 16.16 fixed-point
 * xinc.)
 *
 * The two values are packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 *
 * In:       w0 = xinc
 * Out:      x0 = (quotient << 32) + remainder
 * Clobbers: x1, x2
 */
/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
ENTRY(rsdIntrinsicResize_oscctl_K)
        /* xinc is a 32-bit argument, so AAPCS64 leaves bits [63:32] of x0
         * unspecified.  Zero-extend while scaling: x2 = (uint64_t)xinc << VECSHIFT.
         */
        ubfiz       x2, x0, #VECSHIFT, #32
        mov         x0, #(CHUNKSIZE << 16) - 1
        add         x0, x0, x2
        udiv        x1, x0, x2                  /* x1 = quotient */
        msub        x0, x1, x2, x0              /* x0 = remainder */
        add         x0, x0, x1, LSL #32         /* pack quotient into the high word */
        ret
END(rsdIntrinsicResize_oscctl_K)

/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
1233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * For the most part the vertical pass (the outer loop) is the same for all 1243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * versions. Exceptions are handled in-line with conditional assembly. 1253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 126e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh.irp comp, 1, 2, 4 1273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 1283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 0 1293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 1303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 1 1313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 1323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_SHIFT, 2 1333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.else 1343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.error "Unknown component count" 1353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 1363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT) 1373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT) 1383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2 1403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie/* void rsdIntrinsicResizeB1_K( 1423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t * restrict dst, // x0 1433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * size_t count, // x1 1443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint32_t xf, // x2 1453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint32_t xinc, // x3 1463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t const * restrict srcn, // x4 
1473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t const * restrict src0, // x5 1483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t const * restrict src1, // x6 1493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint8_t const * restrict src2, // x7 1503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * size_t xclip, // [sp,#0] -> [sp,#64] -> x12 1513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * size_t avail, // [sp,#8] -> [sp,#72] -> x11 1523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * uint64_t osc_ctl, // [sp,#16] -> [sp,#80] -> x10 1533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * int32 const *yr, // [sp,#24] -> [sp,#88] -> v4 (copied to v3 for scalar access) 1543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 1553a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieENTRY(rsdIntrinsicResizeB\comp\()_K) 1563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x8, sp, #32 1573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub sp, sp, #64 1583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.1d - v11.1d}, [sp] 1593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v12.1d - v15.1d}, [x8] 1603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* align the working buffer on the stack to make it easy to use bit 1623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * twiddling for address calculations. 
1633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 1643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x12, sp, #BUFFER_SIZE 1653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bic x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1 1663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldr x8, [sp,#88] // yr 1683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie adr x9, 8f 1693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v4.4s}, [x8] 1703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v5.8h}, [x9] 1713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqxtun v4.4h, v4.4s // yr 1723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v6.8h, w2 1733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v7.8h, w3 1743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mla v6.8h, v5.8h, v7.8h // vxf 1753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie shl v7.8h, v7.8h, #VECSHIFT // vxinc 1763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Compute starting condition for oscillator used to compute ahead 1783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * of time how many iterations are possible before needing to 1793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * refill the working buffer. This is based on the fixed-point 1803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * index of the last element in the vector of pixels processed in 1813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * each iteration, counting up until it would overflow. 
1823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 1833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x8, x2, x3 1843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsl x9, x3, #VECSHIFT 1853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x8, x8, x9 1863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldr x10, [sp,#80] // osc_ctl 1883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ldp x13,x11, [sp,#64] // xclip, avail 1893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov x18, sp 1913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov sp, x12 1923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 1933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* x4-x7 contain pointers to the four lines of input to be 1943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * convolved. These pointers have been clamped vertically and 1953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * horizontally (which is why it's not a simple row/stride pair), 1963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * and the xclip argument (now in x13) indicates how many pixels 1973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * from true the x position of the pointer is. This value should 1983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * be 0, 1, or 2 only. 1993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 2003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Start by placing four pixels worth of input at the far end of 2013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the buffer. As many as two of these may be clipped, so four 2023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * pixels are fetched, and then the first pixel is duplicated and 2033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the data shifted according to xclip. 
The source pointers are 2043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * then also adjusted according to xclip so that subsequent fetches 2053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * match. 2063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */ 208e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1) 2093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2 2103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x14, x14, #4 * COMPONENT_COUNT * 2 2113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 2123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert4 v12.4h 2133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v11.4h, v12.h[0] 2143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v11.4h,v12.4h}, [x12] 2153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v12.4h}, [x14] 2163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v12.4h}, [x15] 2173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 2183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 2193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v11.4s, v12.s[0] 2203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v11.8h,v12.8h}, [x12] 2213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v12.8h}, [x14] 2223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v12.8h}, [x15] 2233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 2243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 v14.4h, v14.8h 2253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 v15.4h, v15.8h 2263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v12.2d, v14.d[0] 2273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v13.2d, v14.d[0] 
2283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v12.8h,v13.8h}, [x12], #32 2293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v14.8h,v15.8h}, [x12] 2303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x12, x12, #32 2313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v11.8h,v12.8h}, [x14] 2323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v11.8h,v12.8h}, [x15] 2333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 2343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Count off four pixels into the working buffer. 2353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x11, x11, #4 2373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Incoming pointers were to the first _legal_ pixel. Four pixels 2383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * were read unconditionally, but some may have been discarded by 2393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * xclip, so we rewind the pointers to compensate. 2403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 241e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh sub x4, x4, x13, LSL #(COMPONENT_SHIFT) 242e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh sub x5, x5, x13, LSL #(COMPONENT_SHIFT) 243e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh sub x6, x6, x13, LSL #(COMPONENT_SHIFT) 244e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh sub x7, x7, x13, LSL #(COMPONENT_SHIFT) 2453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* First tap starts where we just pre-filled, at the end of the 2473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * buffer. 
2483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, #(CHUNKSIZE * 2 - 4) << 16 2503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Use overflowing arithmetic to implement wraparound array 2523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * indexing. 2533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsl x2, x2, #(47 - CHUNKSHIFT) 2553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsl x3, x3, #(47 - CHUNKSHIFT) 2563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 2583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Start of outermost loop. 2593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the 2603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * number of iterations of the inner loop that can be performed and 2613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * get into that. 2623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 2633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The fill is complicated by the possibility of running out of 2643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * input before the scratch buffer is filled. If this isn't a risk 2653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * then it's handled by the simple loop at 2:, otherwise the 2663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * horrible loop at 3:. 2673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: mov v3.8b, v4.8b /* put y scaling coefficients somewhere handy */ 2693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs x11, x11, #CHUNKSIZE 2703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bge 2f /* if at least CHUNKSIZE are available... 
*/ 2713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x11, x11, #CHUNKSIZE /* if they're not... */ 2723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 4f 2733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* ..just sneaking a literal in here after this unconditional branch.. */ 2743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie8: .hword 0, 1, 2, 3, 4, 5, 6, 7 2753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* basic fill loop, processing 8 bytes at a time until there are 2763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * fewer than eight bytes available. 2773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 2783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3: vert8 2793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x11, x11, #8 / COMPONENT_COUNT 2803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v12.8h}, [x12], #16 2813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4: cmp x11, #8 / COMPONENT_COUNT - 1 2823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bgt 3b 2833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 4 2843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie blt 3f 2853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The last pixel (four bytes) if necessary */ 2863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert4 2873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.else 2883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp x11, #1 2893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie blt 3f 2903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The last pixels if necessary */ 2913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x4, x4, #8 2923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x5, x5, #8 2933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x6, x6, #8 2943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x7, x7, #8 295e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh add x4, x4, x11, LSL #(COMPONENT_SHIFT) 
296e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh add x5, x5, x11, LSL #(COMPONENT_SHIFT) 297e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh add x6, x6, x11, LSL #(COMPONENT_SHIFT) 298e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh add x7, x7, x11, LSL #(COMPONENT_SHIFT) 2993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 300e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1) 3013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub sp, sp, #32 3023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x11, x11, #16 3033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 3043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v13.8h, v12.h[7] 3053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 3063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v13.4s, v12.s[3] 3073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 3083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v12.8h,v13.8h}, [sp] 3093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v12.8h}, [x11] 3103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add sp, sp, #32 3113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 4f 3123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 3133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Keep filling until we get to the end of this chunk of the buffer */ 3143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3: 3153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 3163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v12.8h, v12.h[7] 3173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 3183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v12.4s, v12.s[3] 3193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 3203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v12.2d, v12.d[1] 3213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 
3223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4: st1 {v12.8h}, [x12], #16 3233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 3243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bne 3b 3253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 4f 3263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.align 4 3283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie2: /* Quickly pull a chunk of data into the working buffer. 3293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 3313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v12.8h}, [x12], #16 3323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie vert8 3333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v12.8h}, [x12], #16 3343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 3353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bne 2b 3363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp x11, #0 3373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bne 3f 3383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie4: /* if we end with 0 pixels left we'll have nothing handy to spread 3393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * across to the right, so we rewind a bit. 
3403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov x11, #1 3423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x4, x4, #COMPONENT_COUNT 3433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x5, x5, #COMPONENT_COUNT 3443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x6, x6, #COMPONENT_COUNT 3453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x7, x7, #COMPONENT_COUNT 3463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3: /* copy four taps (width of cubic window) to far end for overflow 3473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * address handling 3483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2 3503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie eor x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2 3513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 3523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v14.4h}, [x13] 3533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 3543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v14.8h}, [x13] 3553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 3563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v14.8h,v15.8h}, [x13] 3573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 3583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2 3593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 3603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v14.4h}, [x13] 3613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 3623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v14.8h}, [x13] 3633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 3643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v14.8h,v15.8h}, [x13] 
3653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 3663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The high 32-bits of x10 contains the maximum possible iteration 3673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * count, but if x8 is greater than the low 32-bits of x10 then 3683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * this indicates that the count must be reduced by one for this 3693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * iteration to avoid reading past the end of the available data. 3703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x13, x10, x8 3723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x13, x13, #32 3733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie madd x8, x13, x9, x8 3753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x8, x8, #(CHUNKSIZE << 16) 3763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* prefer to count pixels, rather than vectors, to clarify the tail 3783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * store case on exit. 
3793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 3803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsl x13, x13, #VECSHIFT 3813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp x13, x1 3823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie csel x13, x1, x13, gt 3833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub x1, x1, x13 3853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsl x13, x13, #COMPONENT_SHIFT 3873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov w14, #0x8000 3893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie movi v30.8h, #3 3903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie dup v31.8h, w14 3913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp x13, #0 3933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bgt 3f 3943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie cmp x1, #0 3953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bgt 1b /* an extreme case where we shouldn't use code in this structure */ 3963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 9f 3973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 3983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie .align 4 3993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie2: /* Inner loop continues here, but starts at 3:, see end of loop 4003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * below for explanation. 
*/ 4013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if LOOP_OUTPUT_SIZE == 4 4023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.s}[0], [x0], #4 4033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 8 4043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.8b}, [x0], #8 4053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 16 4063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.16b}, [x0], #16 4073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 32 4083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.16b,v9.16b}, [x0], #32 4093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 4103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Inner loop: here the four x coefficients for each tap are 4113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * calculated in vector code, and the addresses are calculated in 4123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * scalar code, and these calculations are interleaved. 
4133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 4143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie3: ushr v8.8h, v6.8h, #1 // sxf 4153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x14, x2, #(63 - CHUNKSHIFT) 4163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrdmulh v9.8h, v8.8h, v8.8h // sxf**2 4173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 4183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrdmulh v10.8h, v9.8h, v8.8h // sxf**3 4193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x15, x2, #(63 - CHUNKSHIFT) 4203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sshll v11.4s, v9.4h, #2 4213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sshll2 v12.4s, v9.8h, #2 4223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 4233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v11.4s, v10.4h, v30.4h 4243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v12.4s, v10.8h, v30.8h 4253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x16, x2, #(63 - CHUNKSHIFT) 4263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie shadd v0.8h, v10.8h, v8.8h 4283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 4293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub v0.8h, v9.8h, v0.8h 4303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x17, x2, #(63 - CHUNKSHIFT) 4313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie saddw v1.4s, v11.4s, v9.4h 4333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie saddw2 v13.4s, v12.4s, v9.8h 4343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 4353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie shrn v1.4h, v1.4s, #1 4363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie shrn2 v1.8h, v13.4s, #1 4373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) 
4383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sub v1.8h, v1.8h, v31.8h 4393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) 4403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie saddw v2.4s, v11.4s, v8.4h 4423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie saddw2 v13.4s, v12.4s, v8.8h 4433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) 4443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie shrn v2.4h, v2.4s, #1 4453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie shrn2 v2.8h, v13.4s, #1 4463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) 4473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie neg v2.8h, v2.8h 4483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie shsub v3.8h, v10.8h, v9.8h 4503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* increment the x fractional parts (overflow is ignored, as the 4523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * scalar arithmetic shadows this addition with full precision). 4533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 4543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add v6.8h, v6.8h, v7.8h 4553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 4563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* At this point we have four pointers in x14-x17, pointing to the 4573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * four taps in the scratch buffer that must be convolved together 4583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * to produce an output pixel (one output pixel per pointer). 
4593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * These pointers usually overlap, but their spacing is irregular 4603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * so resolving the redundancy through L1 is a pragmatic solution. 4613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 4623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The scratch buffer is made of signed 16-bit data, holding over 4633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * some extra precision, and overshoot, from the vertical pass. 4643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 4653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * We also have the 16-bit unsigned fixed-point weights for each 4663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * of the four taps in v0 - v3. That's eight pixels worth of 4673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * coefficients when we have only four pointers, so calculations 4683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * for four more pixels are interleaved with the fetch and permute 4693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * code for each variant in the following code. 4703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 4713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The data arrangement is less than ideal for any pixel format, 4723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * but permuting loads help to mitigate most of the problems. 4733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 4743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Note also that the two outside taps of a bicubic are negative, 4753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * but these coefficients are unsigned. The sign is hard-coded by 4763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * use of multiply-and-subtract operations. 
4773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 4783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if \comp == 1 4793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The uchar 1 case. 4803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Issue one lanewise ld4.h to load four consecutive pixels from 4813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * one pointer (one pixel) into four different registers; then load 4823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * four consecutive s16 values from the next pointer (pixel) into 4833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the next lane of those four registers, etc., so that we finish 4843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * with v12 - v15 representing the four taps, and each lane 4853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * representing a separate pixel. 4863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 4873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * The first ld4 uses a splat to avoid any false dependency on 4883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the previous state of the register. 
4893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 4903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4r {v12.8h,v13.8h,v14.8h,v15.8h}, [x14] 4913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x14, x2, #(63 - CHUNKSHIFT) 4923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 4933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.h,v13.h,v14.h,v15.h}[1], [x15] 4943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) 4953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x15, x2, #(63 - CHUNKSHIFT) 4963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 4973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.h,v13.h,v14.h,v15.h}[2], [x16] 4983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) 4993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x16, x2, #(63 - CHUNKSHIFT) 5003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 5013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.h,v13.h,v14.h,v15.h}[3], [x17] 5023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) 5033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x17, x2, #(63 - CHUNKSHIFT) 5043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 5053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.h,v13.h,v14.h,v15.h}[4], [x14] 5063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) 5073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.h,v13.h,v14.h,v15.h}[5], [x15] 5083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.h,v13.h,v14.h,v15.h}[6], [x16] 5093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.h,v13.h,v14.h,v15.h}[7], [x17] 5103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v8.4s, v12.4h, v0.4h 
5123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull2 v9.4s, v12.8h, v0.8h 5133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v8.4s, v13.4h, v1.4h 5143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v9.4s, v13.8h, v1.8h 5153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v8.4s, v14.4h, v2.4h 5163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v9.4s, v14.8h, v2.8h 5173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal v8.4s, v15.4h, v3.4h 5183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v9.4s, v15.8h, v3.8h 5193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs x13, x13, #LOOP_OUTPUT_SIZE 5213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn v8.4h, v8.4s, #15 5233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn2 v8.8h, v9.4s, #15 5243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrun v8.8b, v8.8h, #VERTBITS - 8 5263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 2 5273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The uchar2 case: 5283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * This time load pairs of values into adjacent lanes in v12 - v15 5293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * by aliasing them as u32 data; leaving room for only four pixels, 5303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * so the process has to be done twice. This also means that the 5313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * coefficient registers fail to align with the coefficient data 5323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * (eight separate pixels), so that has to be doubled-up to match. 
5333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 5343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14] 5353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x14, x2, #(63 - CHUNKSHIFT) 5363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 5373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15] 5383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) 5393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x15, x2, #(63 - CHUNKSHIFT) 5403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 5413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16] 5423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) 5433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x16, x2, #(63 - CHUNKSHIFT) 5443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 5453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17] 5463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) 5473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x17, x2, #(63 - CHUNKSHIFT) 5483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 5493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* double-up coefficients to align with component pairs */ 5513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie zip1 v16.8h, v0.8h, v0.8h 5523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) 5533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie zip1 v17.8h, v1.8h, v1.8h 5543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie zip1 v18.8h, v2.8h, v2.8h 5553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie zip1 v19.8h, v3.8h, v3.8h 
5563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v8.4s, v12.4h, v16.4h 5583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull2 v9.4s, v12.8h, v16.8h 5593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v8.4s, v13.4h, v17.4h 5603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v9.4s, v13.8h, v17.8h 5613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v8.4s, v14.4h, v18.4h 5623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v9.4s, v14.8h, v18.8h 5633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal v8.4s, v15.4h, v19.4h 5643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v9.4s, v15.8h, v19.8h 5653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn v8.4h, v8.4s, #15 5673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn2 v8.8h, v9.4s, #15 5683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14] 5703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15] 5713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16] 5723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17] 5733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* double-up coefficients to align with component pairs */ 5753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie zip2 v16.8h, v0.8h, v0.8h 5763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie zip2 v17.8h, v1.8h, v1.8h 5773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie zip2 v18.8h, v2.8h, v2.8h 5783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie zip2 v19.8h, v3.8h, v3.8h 5793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v10.4s, 
v12.4h, v16.4h 5813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull2 v11.4s, v12.8h, v16.8h 5823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v10.4s, v13.4h, v17.4h 5833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v11.4s, v13.8h, v17.8h 5843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v10.4s, v14.4h, v18.4h 5853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v11.4s, v14.8h, v18.8h 5863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal v10.4s, v15.4h, v19.4h 5873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v11.4s, v15.8h, v19.8h 5883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs x13, x13, #LOOP_OUTPUT_SIZE 5903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn v9.4h, v10.4s, #15 5923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn2 v9.8h, v11.4s, #15 5933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 5943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrun v8.8b, v8.8h, #VERTBITS - 8 5953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8 5963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif \comp == 4 5973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The uchar4 case. 5983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * This case is comparatively painless because four s16s are the 5993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * smallest addressable unit for a vmul-by-scalar. Rather than 6003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * permute the data, simply arrange the multiplies to suit the way 6013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the data comes in. That's a lot of data, though, so things 6023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * progress in pairs of pixels at a time. 
6033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 6043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v12.8h,v13.8h}, [x14] 6053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x14, x2, #(63 - CHUNKSHIFT) 6063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 6073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v14.8h,v15.8h}, [x15] 6083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) 6093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x15, x2, #(63 - CHUNKSHIFT) 6103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 6113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v8.4s, v12.4h, v0.h[0] 6133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v9.4s, v14.4h, v0.h[1] 6143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v8.4s, v12.8h, v1.h[0] 6153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v9.4s, v14.8h, v1.h[1] 6163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v8.4s, v13.4h, v2.h[0] 6173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v9.4s, v15.4h, v2.h[1] 6183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v8.4s, v13.8h, v3.h[0] 6193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v9.4s, v15.8h, v3.h[1] 6203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* And two more... 
*/ 6223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v12.8h,v13.8h}, [x16] 6233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) 6243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x16, x2, #(63 - CHUNKSHIFT) 6253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 6263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v14.8h,v15.8h}, [x17] 6273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) 6283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie lsr x17, x2, #(63 - CHUNKSHIFT) 6293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x2, x2, x3 6303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn v8.4h, v8.4s, #15 6323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) 6333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn2 v8.8h, v9.4s, #15 6343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v10.4s, v12.4h, v0.h[2] 6363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v11.4s, v14.4h, v0.h[3] 6373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v10.4s, v12.8h, v1.h[2] 6383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v11.4s, v14.8h, v1.h[3] 6393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v10.4s, v13.4h, v2.h[2] 6403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v11.4s, v15.4h, v2.h[3] 6413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v10.4s, v13.8h, v3.h[2] 6423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v11.4s, v15.8h, v3.h[3] 6433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn v9.4h, v10.4s, #15 6453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn2 v9.8h, v11.4s, #15 
6463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrun v8.8b, v8.8h, #VERTBITS - 8 6483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8 6493a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* And two more... */ 6513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v12.8h,v13.8h}, [x14] 6523a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v14.8h,v15.8h}, [x15] 6533a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6543a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v10.4s, v12.4h, v0.h[4] 6553a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v11.4s, v14.4h, v0.h[5] 6563a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v10.4s, v12.8h, v1.h[4] 6573a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v11.4s, v14.8h, v1.h[5] 6583a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v10.4s, v13.4h, v2.h[4] 6593a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v11.4s, v15.4h, v2.h[5] 6603a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v10.4s, v13.8h, v3.h[4] 6613a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v11.4s, v15.8h, v3.h[5] 6623a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6633a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* And two more... 
*/ 6643a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v12.8h,v13.8h}, [x16] 6653a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v14.8h,v15.8h}, [x17] 6663a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6673a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie subs x13, x13, #LOOP_OUTPUT_SIZE 6683a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6693a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn v9.4h, v10.4s, #15 6703a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn2 v9.8h, v11.4s, #15 6713a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6723a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v10.4s, v12.4h, v0.h[6] 6733a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smull v11.4s, v14.4h, v0.h[7] 6743a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v10.4s, v12.8h, v1.h[6] 6753a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl2 v11.4s, v14.8h, v1.h[7] 6763a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v10.4s, v13.4h, v2.h[6] 6773a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlsl v11.4s, v15.4h, v2.h[7] 6783a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v10.4s, v13.8h, v3.h[6] 6793a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie smlal2 v11.4s, v15.8h, v3.h[7] 6803a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6813a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn v10.4h, v10.4s, #15 6823a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrn2 v10.8h, v11.4s, #15 6833a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 6843a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrun v9.8b, v9.8h, #VERTBITS - 8 6853a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie sqrshrun2 v9.16b, v10.8h, #VERTBITS - 8 6863a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 6873a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie bgt 2b /* continue inner loop */ 6883a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* The inner loop has already been limited to ensure that none 
of 6893a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * the earlier iterations could overfill the output, so the store 6903a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * appears within the loop but after the conditional branch (at the 6913a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * top). At the end, provided it won't overfill, perform the final 6923a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * store here. If it would, then break out to the tricky tail case 6933a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * instead. 6943a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 6953a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie blt 1f 6963a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Store the amount of data appropriate to the configuration of the 6973a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * instance being assembled. 6983a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 6993a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if LOOP_OUTPUT_SIZE == 4 7003a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.s}[0], [x0], #4 7013a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 8 7023a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.8b}, [x0], #8 7033a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 16 7043a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.16b}, [x0], #16 7053a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.elseif LOOP_OUTPUT_SIZE == 32 7063a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.16b,v9.16b}, [x0], #32 7073a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7083a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie b 1b /* resume outer loop */ 7093a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie /* Partial tail store case: 7103a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * Different versions of the code need different subsets of the 7113a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * 
following partial stores. Here the number of components and the 7123a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * size of the chunk of data produced by each inner loop iteration 7133a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie * is tested to figure out whether or not each phrase is relevant. 7143a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie */ 7153a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16 7163a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst x13, #16 7173a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7183a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.16b}, [x0], #16 7193a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie mov v8.16b, v9.16b 7203a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7213a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8 7223a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst x13, #8 7233a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7243a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.8b}, [x0], #8 7253a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ext v8.16b, v8.16b, v8.16b, #8 7263a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7273a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4 7283a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst x13, #4 7293a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7303a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.s}[0], [x0], #4 7313a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ext v8.8b, v8.8b, v8.8b, #4 7323a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7333a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2 7343a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst x13, #2 7353a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 
7363a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.h}[0], [x0], #2 7373a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ext v8.8b, v8.8b, v8.8b, #2 7383a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7393a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1 7403a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: tst x13, #1 7413a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie beq 1f 7423a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie st1 {v8.b}[0], [x0], #1 7433a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endif 7443a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie1: 7453a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie9: mov sp, x18 7463a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v8.1d - v11.1d}, [sp], #32 7473a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ld1 {v12.1d - v15.1d}, [sp], #32 7483a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie ret 7493a98366c7f53b20f8550ffac82732d60ece794b4Simon HosieEND(rsdIntrinsicResizeB\comp\()_K) 7503a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie.endr 7513a98366c7f53b20f8550ffac82732d60ece794b4Simon Hosie 752