/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* AArch64 Advanced SIMD assembly in GNU as syntax.  The file relies on the C
 * preprocessor (#define / #if below), so it must be assembled through cpp.
 */

/* ENTRY(f):   switch to .text, align, export f, mark it as a function and
 *             define the label.
 * PRIVATE(f): as ENTRY, but without exporting the symbol.
 * END(f):     record the size of f as the distance from its label to here.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;

//#define ARCH_ARM64_USE_BLUR_PRELOAD

/* Number of fractional bits to preserve in intermediate results.  The
 * intermediate storage is 16-bit, and we started with 8 bit data (the integer
 * part), so this should be between 0 and 8.
 */
.set FRACTION_BITS, 7
.set MAX_R, 25


/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`:
 */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm

/* It's not always clear that prefetching is beneficial and this needs further
 * testing on different cores, so it's made switchable here.
 *
 * Both expansions are exactly one instruction, so the size of the unrolled
 * loop bodies below (which computed branches depend on) is the same whichever
 * setting is chosen.
 */
#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
#else
#define VERTPLD(...) nop
#endif

/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top or
 * bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch directly
 * into the relevant part of the code for an arbitrary convolution radius.  Two
 * variants of the loop are produced; one eliminates the clamping code for a
 * slight speed advantage.
 *
 * Where the macro is called with reg=x, the specified register is taken to
 * contain a pre-calculated pointer into one of the two loops.  With the
 * default reg=x12 the macro computes the entry point into the clamped loop
 * itself and overwrites x12.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup (r, unless clipped to top of source image)
 *      x7 -- rdn (r, unless clipped to bottom of source image)
 *      x12 -- switch index
 *      v0-v3 -- coefficient table
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x1 += 16
 *      x15 += 16
 *      x19 += 16
 *      v10,v11 -- 16 convolved columns
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      \reg (x12 by default) = entry address, when computed by the macro
 *      v12-v15 = temporary sums
 *      v16 = temporary for the widened sum of the upper and lower rows
 *      condition flags (clamped loop only)
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            ld1         {v15.16b}, [x1], #16
            mov         x10, x15

            uxtl        v14.8h, v15.8b
            VERTPLD(x1, #16)
            uxtl2       v15.8h, v15.16b
  /* When cc is set the entry point is computed as 1f - r * 56: r * 64 is
   * subtracted and r * 8 added back, interleaved with the centre-tap
   * multiplies.  56 bytes is the size of one vertfetch_clamped tap
   * (14 instructions).
   */
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  /* This version of the vertical fetch loop body is used away from the edges
   * of the source image.  The pointers start at the top and bottom source rows
   * and work their way towards the centre on each iteration.  This way the
   * number of taps used can be controlled by jumping directly into the middle
   * of the loop and running to completion.
   * If the loop body changes size then the code which calculates the address
   * of the initial iteration must be updated accordingly (the body is
   * currently 10 instructions per tap; that calculation is not in this macro).
   */
  .macro vertfetch_noclamp i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            ld1         {v11.16b}, [x11], x13
            uaddl       v16.8h, v10.8b, v11.8b
            uaddl2      v11.8h, v10.16b, v11.16b
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* This version of the vertical fetch loop body is used near the edges of the
   * source image, where one or both of the accesses may start with a clamped
   * value, and the row addresses only begin to change after some number of
   * iterations before the end.
   * While rup (x6) or rdn (x7) is below the tap index, the matching row
   * pointer is reset to the top (x15) or bottom (x19) row after each load.
   * If the loop body changes size then the code which calculates the address
   * of the initial iteration must be updated accordingly (the body is
   * currently 14 instructions per tap, matching the r * 56 computed above).
   */
  .macro vertfetch_clamped i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            cmp         x6, #\i
            ld1         {v11.16b}, [x11], x13
            csel        x10, x15, x10, lo
            uaddl       v16.8h, v10.8b, v11.8b
            cmp         x7, #\i
            uaddl2      v11.8h, v10.16b, v11.16b
            csel        x11, x19, x11, lo
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelc at the end of the block.
   */
  .align 4
  vertfetch_clamped 27, v3.h[3]
  vertfetch_clamped 26, v3.h[2]
  vertfetch_clamped 25, v3.h[1]
  vertfetch_clamped 24, v3.h[0]
  vertfetch_clamped 23, v2.h[7]
  vertfetch_clamped 22, v2.h[6]
  vertfetch_clamped 21, v2.h[5]
  vertfetch_clamped 20, v2.h[4]
  vertfetch_clamped 19, v2.h[3]
  vertfetch_clamped 18, v2.h[2]
  vertfetch_clamped 17, v2.h[1]
  vertfetch_clamped 16, v2.h[0]
  vertfetch_clamped 15, v1.h[7]
  vertfetch_clamped 14, v1.h[6]
  vertfetch_clamped 13, v1.h[5]
  vertfetch_clamped 12, v1.h[4]
  vertfetch_clamped 11, v1.h[3]
  vertfetch_clamped 10, v1.h[2]
  vertfetch_clamped  9, v1.h[1]
  vertfetch_clamped  8, v1.h[0]
  vertfetch_clamped  7, v0.h[7]
  vertfetch_clamped  6, v0.h[6]
  vertfetch_clamped  5, v0.h[5]
  vertfetch_clamped  4, v0.h[4]
  vertfetch_clamped  3, v0.h[3]
  vertfetch_clamped  2, v0.h[2]
  vertfetch_clamped  1, v0.h[1]
  vertfetch_clamped  0, v0.h[0]
  1:
  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelnc at the end of the block.
   */
  .align 4
  vertfetch_noclamp 27, v3.h[3]
  vertfetch_noclamp 26, v3.h[2]
  vertfetch_noclamp 25, v3.h[1]
  vertfetch_noclamp 24, v3.h[0]
  vertfetch_noclamp 23, v2.h[7]
  vertfetch_noclamp 22, v2.h[6]
  vertfetch_noclamp 21, v2.h[5]
  vertfetch_noclamp 20, v2.h[4]
  vertfetch_noclamp 19, v2.h[3]
  vertfetch_noclamp 18, v2.h[2]
  vertfetch_noclamp 17, v2.h[1]
  vertfetch_noclamp 16, v2.h[0]
  vertfetch_noclamp 15, v1.h[7]
  vertfetch_noclamp 14, v1.h[6]
  vertfetch_noclamp 13, v1.h[5]
  vertfetch_noclamp 12, v1.h[4]
  vertfetch_noclamp 11, v1.h[3]
  vertfetch_noclamp 10, v1.h[2]
  vertfetch_noclamp  9, v1.h[1]
  vertfetch_noclamp  8, v1.h[0]
  vertfetch_noclamp  7, v0.h[7]
  vertfetch_noclamp  6, v0.h[6]
  vertfetch_noclamp  5, v0.h[5]
  vertfetch_noclamp  4, v0.h[4]
  vertfetch_noclamp  3, v0.h[3]
  vertfetch_noclamp  2, v0.h[2]
  vertfetch_noclamp  1, v0.h[1]
  vertfetch_noclamp  0, v0.h[0]
  \labelnc :

  .purgem vertfetch_clamped
  .purgem vertfetch_noclamp

  /* Narrow the 32-bit sums to 16 bits, keeping FRACTION_BITS of fraction, and
   * step the top and bottom row pointers on to the next 16 columns.
   */
  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/

/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * more data that won't be used and it means that rotating the window involves
 * more mov operations.
 *
 * When the buffer gets too big the buffer at [x9] is used.
 *
 * Input:
 *      v16-v31,v4-v11 -- convolution window
 *      x9 -- pointer to additional convolution window data
 * Output:
 *      x9 -- updated buffer pointer (if used)
 *      d31 -- result to be stored
 *             NOTE(review): hconv1_8 below leaves its result in v15.8b, not
 *             d31 -- this line looks stale; confirm against the callers.
 * Modifies:
 *      x12 -- temp buffer pointer
 *      x16 -- base address of the jump offset table (hconv1_8)
 *      v12-v13 -- temporaries for load and vext operations.
 *      v14-v15 -- intermediate sums
 */
/* Presumably the maximum radii for which tuned hconv1_* variants exist
 * (hconv1_8, hconv1_16) -- TODO confirm where this list is consumed.
 */
#define TUNED_LIST1 8, 16

/* Horizontal convolution of eight 16-bit columns for radius x5 in 1..8.
 *
 * The window is v8 (the eight columns before the centre), v9 (centre) and v10
 * (the eight columns after); v11 is shifted in behind v10 on exit.  v14
 * accumulates outputs 0-3 and v15 outputs 4-7.  The result is narrowed to
 * bytes in v15.8b.
 *
 * x5 indexes the halfword table at 100: each entry is the offset from the
 * table to the first tap to execute.  Entry 0 (-4) is the address of the
 * `br` itself, so r = 0 would spin; presumably r = 0 is never passed here --
 * confirm against the callers.
 */
.macro hconv1_8/*{{{*/
            umull       v14.4s, v9.4h, v0.h[0]
            umull2      v15.4s, v9.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword      -4
            .hword      101f-100b
            .hword      102f-100b
            .hword      103f-100b
            .hword      104f-100b
            .hword      105f-100b
            .hword      106f-100b
            .hword      107f-100b
            .hword      108f-100b
            .align      4
    108:    umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
            umlal       v14.4s, v10.4h, v1.h[0]
            umlal2      v15.4s, v10.8h, v1.h[0]
    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
            /* A shift of four elements is exactly half a register, so the
             * ext is skipped and the register halves are used directly.
             */
    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
            umlal2      v14.4s, v9.8h, v0.h[4]
            umlal       v15.4s, v10.4h, v0.h[4]
    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Round and saturate: 32 -> 16 bits dropping 16 bits, then
             * 16 -> 8 bits dropping the remaining FRACTION_BITS.
             */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by eight columns. */
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

.macro hconv1_16/*{{{*/
            umull       v14.4s, v8.4h, v0.h[0]
            umull2      v15.4s, v8.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
100: .hword -4 347ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 101f-100b 348ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 102f-100b 349ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 103f-100b 350ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 104f-100b 351ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 105f-100b 352ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 106f-100b 353ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 107f-100b 354ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 108f-100b 355ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 109f-100b 356ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 110f-100b 357ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 111f-100b 358ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 112f-100b 359ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 113f-100b 360ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 114f-100b 361ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 115f-100b 362ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 116f-100b 363446788007efe0a673d0366284026adfa17b36fedSimon Hosie .align 4 364446788007efe0a673d0366284026adfa17b36fedSimon Hosie 116: //ext v12.16b, v6.16b, v7.16b, #0*2 365446788007efe0a673d0366284026adfa17b36fedSimon Hosie //ext v13.16b, v10.16b, v11.16b, #0*2 366446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v6.4h, v2.h[0] 367446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v6.8h, v2.h[0] 368446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v10.4h, v2.h[0] 369446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v10.8h, v2.h[0] 370446788007efe0a673d0366284026adfa17b36fedSimon Hosie 115: ext v12.16b, v6.16b, v7.16b, #1*2 371446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #7*2 372446788007efe0a673d0366284026adfa17b36fedSimon Hosie 
umlal v14.4s, v12.4h, v1.h[7] 373446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[7] 374446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[7] 375446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[7] 376446788007efe0a673d0366284026adfa17b36fedSimon Hosie 114: ext v12.16b, v6.16b, v7.16b, #2*2 377446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #6*2 378446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[6] 379446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[6] 380446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[6] 381446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[6] 382446788007efe0a673d0366284026adfa17b36fedSimon Hosie 113: ext v12.16b, v6.16b, v7.16b, #3*2 383446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #5*2 384446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[5] 385446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[5] 386446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[5] 387446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[5] 388446788007efe0a673d0366284026adfa17b36fedSimon Hosie 112: //ext v12.16b, v6.16b, v7.16b, #4*2 389446788007efe0a673d0366284026adfa17b36fedSimon Hosie //ext v13.16b, v9.16b, v10.16b, #4*2 390446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v6.8h, v1.h[4] 391446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v7.4h, v1.h[4] 392446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v9.8h, v1.h[4] 393446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v10.4h, v1.h[4] 394446788007efe0a673d0366284026adfa17b36fedSimon Hosie 111: ext v12.16b, v6.16b, v7.16b, #5*2 
395446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #3*2 396446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[3] 397446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[3] 398446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[3] 399446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[3] 400446788007efe0a673d0366284026adfa17b36fedSimon Hosie 110: ext v12.16b, v6.16b, v7.16b, #6*2 401446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #2*2 402446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[2] 403446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[2] 404446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[2] 405446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[2] 406446788007efe0a673d0366284026adfa17b36fedSimon Hosie 109: ext v12.16b, v6.16b, v7.16b, #7*2 407446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #1*2 408446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[1] 409446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[1] 410446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[1] 411446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[1] 412446788007efe0a673d0366284026adfa17b36fedSimon Hosie 108: //ext v12.16b, v7.16b, v8.16b, #0*2 413446788007efe0a673d0366284026adfa17b36fedSimon Hosie //ext v13.16b, v9.16b, v10.16b, #0*2 414446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v7.4h, v1.h[0] 415446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v7.8h, v1.h[0] 416446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v9.4h, v1.h[0] 417446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, 
v9.8h, v1.h[0] 418446788007efe0a673d0366284026adfa17b36fedSimon Hosie 107: ext v12.16b, v7.16b, v8.16b, #1*2 419446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #7*2 420446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[7] 421446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[7] 422446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[7] 423446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[7] 424446788007efe0a673d0366284026adfa17b36fedSimon Hosie 106: ext v12.16b, v7.16b, v8.16b, #2*2 425446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #6*2 426446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[6] 427446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[6] 428446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[6] 429446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[6] 430446788007efe0a673d0366284026adfa17b36fedSimon Hosie 105: ext v12.16b, v7.16b, v8.16b, #3*2 431446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #5*2 432446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[5] 433446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[5] 434446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[5] 435446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[5] 436446788007efe0a673d0366284026adfa17b36fedSimon Hosie 104: //ext v12.16b, v7.16b, v8.16b, #4*2 437446788007efe0a673d0366284026adfa17b36fedSimon Hosie //ext v13.16b, v8.16b, v9.16b, #4*2 438446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v7.8h, v0.h[4] 439446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v8.4h, v0.h[4] 440446788007efe0a673d0366284026adfa17b36fedSimon 
Hosie umlal2 v14.4s, v8.8h, v0.h[4] 441446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v9.4h, v0.h[4] 442446788007efe0a673d0366284026adfa17b36fedSimon Hosie 103: ext v12.16b, v7.16b, v8.16b, #5*2 443446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #3*2 444446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[3] 445446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[3] 446446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[3] 447446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[3] 448446788007efe0a673d0366284026adfa17b36fedSimon Hosie 102: ext v12.16b, v7.16b, v8.16b, #6*2 449446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #2*2 450446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[2] 451446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[2] 452446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[2] 453446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[2] 454446788007efe0a673d0366284026adfa17b36fedSimon Hosie 101: ext v12.16b, v7.16b, v8.16b, #7*2 455446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #1*2 456446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[1] 457446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[1] 458446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[1] 459446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[1] 460446788007efe0a673d0366284026adfa17b36fedSimon Hosie 461446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v14.4h, v14.4s, #16 462446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn2 v14.8h, v15.4s, #16 463446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v15.8b, v14.8h, 
#FRACTION_BITS 464446788007efe0a673d0366284026adfa17b36fedSimon Hosie 465446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v6.16b, v7.16b 466446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v7.16b, v8.16b 467446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v8.16b, v9.16b 468446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v9.16b, v10.16b 469446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v10.16b, v11.16b 470446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/ 471446788007efe0a673d0366284026adfa17b36fedSimon Hosie 472446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv1_25/*{{{*/ 473446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v12.16b, v6.16b, v7.16b, #7*2 474446788007efe0a673d0366284026adfa17b36fedSimon Hosie umull v14.4s, v12.4h, v0.h[0] 475446788007efe0a673d0366284026adfa17b36fedSimon Hosie umull2 v15.4s, v12.8h, v0.h[0] 476446788007efe0a673d0366284026adfa17b36fedSimon Hosie 477ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie adr x16, 100f 478ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie ldrsh x12, [x16, x5, LSL #1] 479ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie add x12, x12, x16 480446788007efe0a673d0366284026adfa17b36fedSimon Hosie br x12 481ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie 100: .hword -4 482ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 101f-100b 483ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 102f-100b 484ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 103f-100b 485ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 104f-100b 486ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 105f-100b 487ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 106f-100b 488ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 107f-100b 489ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 108f-100b 490ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 109f-100b 
491ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 110f-100b 492ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 111f-100b 493ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 112f-100b 494ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 113f-100b 495ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 114f-100b 496ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 115f-100b 497ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 116f-100b 498ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 117f-100b 499ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 118f-100b 500ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 119f-100b 501ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 120f-100b 502ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 121f-100b 503ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 122f-100b 504ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 123f-100b 505ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 124f-100b 506ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 125f-100b 507446788007efe0a673d0366284026adfa17b36fedSimon Hosie .align 4 5084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 125: ext v12.16b, v31.16b, v4.16b, #6*2 509446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v10.16b, v11.16b, #0*2 510446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v3.h[1] 511446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v3.h[1] 512446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v3.h[1] 513446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v3.h[1] 5146267c335745f32fb0d898335930da6b0904be577Simon Hosie 124: ext v12.16b, v31.16b, v4.16b, #7*2 515446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #7*2 516446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, 
v12.4h, v3.h[0] 517446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v3.h[0] 518446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v3.h[0] 519446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v3.h[0] 520446788007efe0a673d0366284026adfa17b36fedSimon Hosie 123: ext v12.16b, v4.16b, v5.16b, #0*2 521446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #6*2 522446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[7] 523446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[7] 524446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v2.h[7] 525446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v2.h[7] 526446788007efe0a673d0366284026adfa17b36fedSimon Hosie 122: ext v12.16b, v4.16b, v5.16b, #1*2 527446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #5*2 528446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[6] 529446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[6] 530446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v2.h[6] 531446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v2.h[6] 532446788007efe0a673d0366284026adfa17b36fedSimon Hosie 121: ext v12.16b, v4.16b, v5.16b, #2*2 533446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #4*2 534446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[5] 535446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[5] 536446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v2.h[5] 537446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v2.h[5] 538446788007efe0a673d0366284026adfa17b36fedSimon Hosie 120: ext v12.16b, v4.16b, v5.16b, #3*2 539446788007efe0a673d0366284026adfa17b36fedSimon Hosie 
ext v13.16b, v9.16b, v10.16b, #3*2 540446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[4] 541446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[4] 542446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v2.h[4] 543446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v2.h[4] 544446788007efe0a673d0366284026adfa17b36fedSimon Hosie 119: ext v12.16b, v4.16b, v5.16b, #4*2 545446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #2*2 546446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[3] 547446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[3] 548446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v2.h[3] 549446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v2.h[3] 550446788007efe0a673d0366284026adfa17b36fedSimon Hosie 118: ext v12.16b, v4.16b, v5.16b, #5*2 551446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #1*2 552446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[2] 553446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[2] 554446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v2.h[2] 555446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v2.h[2] 556446788007efe0a673d0366284026adfa17b36fedSimon Hosie 117: ext v12.16b, v4.16b, v5.16b, #6*2 557446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v9.16b, v10.16b, #0*2 558446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[1] 559446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[1] 560446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v2.h[1] 561446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v2.h[1] 
562446788007efe0a673d0366284026adfa17b36fedSimon Hosie 116: ext v12.16b, v4.16b, v5.16b, #7*2 563446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #7*2 564446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[0] 565446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[0] 566446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v2.h[0] 567446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v2.h[0] 568446788007efe0a673d0366284026adfa17b36fedSimon Hosie 115: ext v12.16b, v5.16b, v6.16b, #0*2 569446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #6*2 570446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[7] 571446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[7] 572446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[7] 573446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[7] 574446788007efe0a673d0366284026adfa17b36fedSimon Hosie 114: ext v12.16b, v5.16b, v6.16b, #1*2 575446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #5*2 576446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[6] 577446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[6] 578446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[6] 579446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[6] 580446788007efe0a673d0366284026adfa17b36fedSimon Hosie 113: ext v12.16b, v5.16b, v6.16b, #2*2 581446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #4*2 582446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[5] 583446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[5] 584446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, 
v13.4h, v1.h[5] 585446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[5] 586446788007efe0a673d0366284026adfa17b36fedSimon Hosie 112: ext v12.16b, v5.16b, v6.16b, #3*2 587446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #3*2 588446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[4] 589446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[4] 590446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[4] 591446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[4] 592446788007efe0a673d0366284026adfa17b36fedSimon Hosie 111: ext v12.16b, v5.16b, v6.16b, #4*2 593446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #2*2 594446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[3] 595446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[3] 596446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[3] 597446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[3] 598446788007efe0a673d0366284026adfa17b36fedSimon Hosie 110: ext v12.16b, v5.16b, v6.16b, #5*2 599446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #1*2 600446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[2] 601446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[2] 602446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[2] 603446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[2] 604446788007efe0a673d0366284026adfa17b36fedSimon Hosie 109: ext v12.16b, v5.16b, v6.16b, #6*2 605446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v8.16b, v9.16b, #0*2 606446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[1] 607446788007efe0a673d0366284026adfa17b36fedSimon Hosie 
umlal2 v15.4s, v12.8h, v1.h[1] 608446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[1] 609446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[1] 610446788007efe0a673d0366284026adfa17b36fedSimon Hosie 108: ext v12.16b, v5.16b, v6.16b, #7*2 611446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v7.16b, v8.16b, #7*2 612446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v1.h[0] 613446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v1.h[0] 614446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v1.h[0] 615446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v1.h[0] 616446788007efe0a673d0366284026adfa17b36fedSimon Hosie 107: ext v12.16b, v6.16b, v7.16b, #0*2 617446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v7.16b, v8.16b, #6*2 618446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[7] 619446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[7] 620446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[7] 621446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[7] 622446788007efe0a673d0366284026adfa17b36fedSimon Hosie 106: ext v12.16b, v6.16b, v7.16b, #1*2 623446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v7.16b, v8.16b, #5*2 624446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[6] 625446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[6] 626446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[6] 627446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[6] 628446788007efe0a673d0366284026adfa17b36fedSimon Hosie 105: ext v12.16b, v6.16b, v7.16b, #2*2 629446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v7.16b, v8.16b, #4*2 
630446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[5] 631446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[5] 632446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[5] 633446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[5] 634446788007efe0a673d0366284026adfa17b36fedSimon Hosie 104: ext v12.16b, v6.16b, v7.16b, #3*2 635446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v7.16b, v8.16b, #3*2 636446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[4] 637446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[4] 638446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[4] 639446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[4] 640446788007efe0a673d0366284026adfa17b36fedSimon Hosie 103: ext v12.16b, v6.16b, v7.16b, #4*2 641446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v7.16b, v8.16b, #2*2 642446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[3] 643446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[3] 644446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[3] 645446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[3] 646446788007efe0a673d0366284026adfa17b36fedSimon Hosie 102: ext v12.16b, v6.16b, v7.16b, #5*2 647446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v7.16b, v8.16b, #1*2 648446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[2] 649446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[2] 650446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[2] 651446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[2] 652446788007efe0a673d0366284026adfa17b36fedSimon Hosie 101: ext v12.16b, v6.16b, 
v7.16b, #6*2 653446788007efe0a673d0366284026adfa17b36fedSimon Hosie ext v13.16b, v7.16b, v8.16b, #0*2 654446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v0.h[1] 655446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v0.h[1] 656446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v13.4h, v0.h[1] 657446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v13.8h, v0.h[1] 658446788007efe0a673d0366284026adfa17b36fedSimon Hosie 659446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v14.4h, v14.4s, #16 660446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn2 v14.8h, v15.4s, #16 661446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v15.8b, v14.8h, #FRACTION_BITS 662446788007efe0a673d0366284026adfa17b36fedSimon Hosie 6634bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v31.16b, v4.16b 664446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v4.16b, v5.16b 665446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v5.16b, v6.16b 666446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v6.16b, v7.16b 667446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v7.16b, v8.16b 668446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v8.16b, v9.16b 669446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v9.16b, v10.16b 670446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v10.16b, v11.16b 671446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/ 672446788007efe0a673d0366284026adfa17b36fedSimon Hosie 6734bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie#define TUNED_LIST4 6, 12, 20 674446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv4_6/*{{{*/ 675446788007efe0a673d0366284026adfa17b36fedSimon Hosie umull v14.4s, v7.4h, v0.h[0] 676446788007efe0a673d0366284026adfa17b36fedSimon Hosie umull2 v15.4s, v7.8h, v0.h[0] 677446788007efe0a673d0366284026adfa17b36fedSimon Hosie 678ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie adr x16, 100f 
679ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie ldrsh x12, [x16, x5, LSL #1] 680ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie add x12, x12, x16 681446788007efe0a673d0366284026adfa17b36fedSimon Hosie br x12 682ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie 100: .hword -4 683ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 101f-100b 684ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 102f-100b 685ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 103f-100b 686ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 104f-100b 687ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 105f-100b 688ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 106f-100b 689446788007efe0a673d0366284026adfa17b36fedSimon Hosie .align 4 690446788007efe0a673d0366284026adfa17b36fedSimon Hosie 106: umlal v14.4s, v4.4h, v0.h[6] 691446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v4.8h, v0.h[6] 692446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v10.4h, v0.h[6] 693446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v10.8h, v0.h[6] 694446788007efe0a673d0366284026adfa17b36fedSimon Hosie 105: umlal2 v14.4s, v4.8h, v0.h[5] 695446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v5.4h, v0.h[5] 696446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v9.8h, v0.h[5] 697446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v10.4h, v0.h[5] 698446788007efe0a673d0366284026adfa17b36fedSimon Hosie 104: umlal v14.4s, v5.4h, v0.h[4] 699446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v5.8h, v0.h[4] 700446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v9.4h, v0.h[4] 701446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v9.8h, v0.h[4] 702446788007efe0a673d0366284026adfa17b36fedSimon Hosie 103: umlal2 v14.4s, v5.8h, v0.h[3] 703446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v6.4h, v0.h[3] 
704446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v8.8h, v0.h[3] 705446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v9.4h, v0.h[3] 706446788007efe0a673d0366284026adfa17b36fedSimon Hosie 102: umlal v14.4s, v6.4h, v0.h[2] 707446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v6.8h, v0.h[2] 708446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v8.4h, v0.h[2] 709446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v8.8h, v0.h[2] 710446788007efe0a673d0366284026adfa17b36fedSimon Hosie 101: umlal2 v14.4s, v6.8h, v0.h[1] 711446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v7.4h, v0.h[1] 712446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v7.8h, v0.h[1] 713446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v8.4h, v0.h[1] 714446788007efe0a673d0366284026adfa17b36fedSimon Hosie 715446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v14.4h, v14.4s, #16 716446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn2 v14.8h, v15.4s, #16 717446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v15.8b, v14.8h, #FRACTION_BITS 718446788007efe0a673d0366284026adfa17b36fedSimon Hosie 719446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v4.16b, v5.16b 720446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v5.16b, v6.16b 721446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v6.16b, v7.16b 722446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v7.16b, v8.16b 723446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v8.16b, v9.16b 724446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v9.16b, v10.16b 725446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v10.16b, v11.16b 726446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/ 727446788007efe0a673d0366284026adfa17b36fedSimon Hosie 728446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv4_12/*{{{*/ 729446788007efe0a673d0366284026adfa17b36fedSimon 
Hosie umull v14.4s, v4.4h, v0.h[0] 730446788007efe0a673d0366284026adfa17b36fedSimon Hosie umull2 v15.4s, v4.8h, v0.h[0] 731446788007efe0a673d0366284026adfa17b36fedSimon Hosie 732ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie adr x16, 100f 733ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie ldrsh x12, [x16, x5, LSL #1] 734ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie add x12, x12, x16 735446788007efe0a673d0366284026adfa17b36fedSimon Hosie br x12 736ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie 100: .hword -4 737ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 101f-100b 738ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 102f-100b 739ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 103f-100b 740ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 104f-100b 741ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 105f-100b 742ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 106f-100b 743ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 107f-100b 744ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 108f-100b 745ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 109f-100b 746ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 110f-100b 747ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 111f-100b 748ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 112f-100b 749446788007efe0a673d0366284026adfa17b36fedSimon Hosie .align 4 7504bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 112: umlal v14.4s, v26.4h, v1.h[4] 7514bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v26.8h, v1.h[4] 752446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v10.4h, v1.h[4] 753446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v10.8h, v1.h[4] 7544bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 111: umlal2 v14.4s, v26.8h, v1.h[3] 7554bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v27.4h, v1.h[3] 
756446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v9.8h, v1.h[3] 757446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v10.4h, v1.h[3] 7584bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 110: umlal v14.4s, v27.4h, v1.h[2] 7594bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v27.8h, v1.h[2] 760446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v9.4h, v1.h[2] 761446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v9.8h, v1.h[2] 7624bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 109: umlal2 v14.4s, v27.8h, v1.h[1] 7634bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v28.4h, v1.h[1] 764446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v8.8h, v1.h[1] 765446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v9.4h, v1.h[1] 7664bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 108: umlal v14.4s, v28.4h, v1.h[0] 7674bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v28.8h, v1.h[0] 768446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v8.4h, v1.h[0] 769446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v8.8h, v1.h[0] 7704bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 107: umlal2 v14.4s, v28.8h, v0.h[7] 7714bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v29.4h, v0.h[7] 772446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v7.8h, v0.h[7] 773446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v8.4h, v0.h[7] 7744bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 106: umlal v14.4s, v29.4h, v0.h[6] 7754bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v29.8h, v0.h[6] 776446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v7.4h, v0.h[6] 777446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v7.8h, v0.h[6] 7784bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 105: umlal2 v14.4s, v29.8h, v0.h[5] 
7794bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v30.4h, v0.h[5] 780446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v6.8h, v0.h[5] 781446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v7.4h, v0.h[5] 7824bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 104: umlal v14.4s, v30.4h, v0.h[4] 7834bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v30.8h, v0.h[4] 784446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v6.4h, v0.h[4] 785446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v6.8h, v0.h[4] 7864bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 103: umlal2 v14.4s, v30.8h, v0.h[3] 7874bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v31.4h, v0.h[3] 788446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v5.8h, v0.h[3] 789446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v6.4h, v0.h[3] 7904bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 102: umlal v14.4s, v31.4h, v0.h[2] 7914bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v31.8h, v0.h[2] 792446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v5.4h, v0.h[2] 793446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v5.8h, v0.h[2] 7944bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 101: umlal2 v14.4s, v31.8h, v0.h[1] 795446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v4.4h, v0.h[1] 796446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v4.8h, v0.h[1] 797446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v5.4h, v0.h[1] 798446788007efe0a673d0366284026adfa17b36fedSimon Hosie 799446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v14.4h, v14.4s, #16 800446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn2 v14.8h, v15.4s, #16 801446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v15.8b, v14.8h, #FRACTION_BITS 802446788007efe0a673d0366284026adfa17b36fedSimon 
Hosie 8034bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v26.16b, v27.16b 8044bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v27.16b, v28.16b 8054bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v28.16b, v29.16b 8064bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v29.16b, v30.16b 8074bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v30.16b, v31.16b 8084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v31.16b, v4.16b 8094bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v4.16b, v5.16b 8104bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v5.16b, v6.16b 8114bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v6.16b, v7.16b 8124bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v7.16b, v8.16b 8134bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v8.16b, v9.16b 8144bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v9.16b, v10.16b 8154bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v10.16b, v11.16b 8164bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie.endm/*}}}*/ 8174bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 8184bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie.macro hconv4_20/*{{{*/ /* Seeds the 32-bit accumulators v14/v15 with v28 times v0.h[0], adds the taps selected by x5 (each label adds two terms that share one coefficient, one from either side of v28), narrows the sums to 8 bits in v15.8b and finally shifts the register window v18-v31, v4-v10 down by one register. */ 8194bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umull v14.4s, v28.4h, v0.h[0] 8204bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umull2 v15.4s, v28.8h, v0.h[0] 8214bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 8224bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie adr x16, 100f 8234bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie ldrsh x12, [x16, x5, LSL #1] 8244bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie add x12, x12, x16 8254bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie br x12 /* computed branch: target = address of label 100 plus the signed halfword at table[x5]. Entry 0 is -4, i.e. this br itself - NOTE(review): looks like a deliberate trap for index 0, confirm. */ 8264bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 100: .hword -4 8274bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 101f-100b 8284bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 102f-100b 8294bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 103f-100b
8304bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 104f-100b 8314bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 105f-100b 8324bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 106f-100b 8334bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 107f-100b 8344bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 108f-100b 8354bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 109f-100b 8364bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 110f-100b 8374bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 111f-100b 8384bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 112f-100b 8394bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 113f-100b 8404bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 114f-100b 8414bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 115f-100b 8424bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 116f-100b 8434bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 117f-100b 8444bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 118f-100b 8454bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 119f-100b 8464bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .hword 120f-100b 8474bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie .align 4 /* execution enters at the label picked from the table and falls through every lower-numbered label down to 101 */ 8484bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 8494bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 120: umlal v14.4s, v18.4h, v2.h[4] 8504bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v18.8h, v2.h[4] 8514bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v10.4h, v2.h[4] 8524bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v10.8h, v2.h[4] 8534bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 119: umlal2 v14.4s, v18.8h, v2.h[3] 8544bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v19.4h, v2.h[3] 8554bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v9.8h, v2.h[3] 8564bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon
Hosie umlal v15.4s, v10.4h, v2.h[3] 8574bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 118: umlal v14.4s, v19.4h, v2.h[2] 8584bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v19.8h, v2.h[2] 8594bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v9.4h, v2.h[2] 8604bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v9.8h, v2.h[2] 8614bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 117: umlal2 v14.4s, v19.8h, v2.h[1] 8624bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v20.4h, v2.h[1] 8634bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v8.8h, v2.h[1] 8644bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v9.4h, v2.h[1] 8654bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 116: umlal v14.4s, v20.4h, v2.h[0] 8664bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v20.8h, v2.h[0] 8674bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v8.4h, v2.h[0] 8684bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v8.8h, v2.h[0] 8694bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 115: umlal2 v14.4s, v20.8h, v1.h[7] 8704bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v21.4h, v1.h[7] 8714bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v7.8h, v1.h[7] 8724bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v8.4h, v1.h[7] 8734bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 114: umlal v14.4s, v21.4h, v1.h[6] 8744bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v21.8h, v1.h[6] 8754bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v7.4h, v1.h[6] 8764bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v7.8h, v1.h[6] 8774bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 113: umlal2 v14.4s, v21.8h, v1.h[5] 8784bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v22.4h, v1.h[5] 8794bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2
v14.4s, v6.8h, v1.h[5] 8804bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v7.4h, v1.h[5] 8814bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 112: umlal v14.4s, v22.4h, v1.h[4] 8824bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v22.8h, v1.h[4] 8834bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v6.4h, v1.h[4] 8844bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v6.8h, v1.h[4] 8854bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 111: umlal2 v14.4s, v22.8h, v1.h[3] 8864bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v23.4h, v1.h[3] 8874bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v5.8h, v1.h[3] 8884bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v6.4h, v1.h[3] 8894bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 110: umlal v14.4s, v23.4h, v1.h[2] 8904bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v23.8h, v1.h[2] 8914bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v5.4h, v1.h[2] 8924bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v5.8h, v1.h[2] 8934bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 109: umlal2 v14.4s, v23.8h, v1.h[1] 8944bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v24.4h, v1.h[1] 8954bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v4.8h, v1.h[1] 8964bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v5.4h, v1.h[1] 8974bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 108: umlal v14.4s, v24.4h, v1.h[0] 8984bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v24.8h, v1.h[0] 8994bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v4.4h, v1.h[0] 9004bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v4.8h, v1.h[0] 9014bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 107: umlal2 v14.4s, v24.8h, v0.h[7] 9024bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v25.4h,
v0.h[7] 9034bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v31.8h, v0.h[7] 9044bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v4.4h, v0.h[7] 9054bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 106: umlal v14.4s, v25.4h, v0.h[6] 9064bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v25.8h, v0.h[6] 9074bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v31.4h, v0.h[6] 9084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v31.8h, v0.h[6] 9094bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 105: umlal2 v14.4s, v25.8h, v0.h[5] 9104bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v26.4h, v0.h[5] 9114bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v30.8h, v0.h[5] 9124bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v31.4h, v0.h[5] 9134bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 104: umlal v14.4s, v26.4h, v0.h[4] 9144bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v26.8h, v0.h[4] 9154bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v30.4h, v0.h[4] 9164bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v30.8h, v0.h[4] 9174bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 103: umlal2 v14.4s, v26.8h, v0.h[3] 9184bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v27.4h, v0.h[3] 9194bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v29.8h, v0.h[3] 9204bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v30.4h, v0.h[3] 9214bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 102: umlal v14.4s, v27.4h, v0.h[2] 9224bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v27.8h, v0.h[2] 9234bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v29.4h, v0.h[2] 9244bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v29.8h, v0.h[2] 9254bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 101: umlal2 v14.4s, v27.8h,
v0.h[1] 9264bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v28.4h, v0.h[1] 9274bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v28.8h, v0.h[1] 9284bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v29.4h, v0.h[1] 9294bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 9304bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie uqrshrn v14.4h, v14.4s, #16 9314bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie uqrshrn2 v14.8h, v15.4s, #16 9324bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie uqrshrn v15.8b, v14.8h, #FRACTION_BITS /* v15.8b = the 32-bit sums shifted right by 16 and then by FRACTION_BITS, rounding and saturating (unsigned) at each narrowing step */ 9334bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 9344bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v18.16b, v19.16b /* slide the window: every register takes the value of its successor, ending with v10 = v11 */ 9354bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v19.16b, v20.16b 9364bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v20.16b, v21.16b 9374bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v21.16b, v22.16b 9384bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v22.16b, v23.16b 9394bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v23.16b, v24.16b 9404bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v24.16b, v25.16b 9414bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v25.16b, v26.16b 9424bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v26.16b, v27.16b 9434bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v27.16b, v28.16b 9444bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v28.16b, v29.16b 9454bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v29.16b, v30.16b 9464bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v30.16b, v31.16b 9474bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v31.16b, v4.16b 948446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v4.16b, v5.16b 949446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v5.16b, v6.16b 950446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v6.16b, v7.16b 951446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v7.16b, v8.16b
952446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v8.16b, v9.16b 953446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v9.16b, v10.16b 954446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v10.16b, v11.16b 955446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/ 956446788007efe0a673d0366284026adfa17b36fedSimon Hosie 957446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv4_25/*{{{*/ /* Seeds v14/v15 with v25.8h and v26.4h times v0.h[0], then branches through the halfword offset table that follows, indexed by x5. */ 9584bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umull2 v14.4s, v25.8h, v0.h[0] 9594bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umull v15.4s, v26.4h, v0.h[0] 960446788007efe0a673d0366284026adfa17b36fedSimon Hosie 961ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie adr x16, 100f 962ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie ldrsh x12, [x16, x5, LSL #1] 963ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie add x12, x12, x16 964446788007efe0a673d0366284026adfa17b36fedSimon Hosie br x12 965ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie 100: .hword -4 966ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 101f-100b 967ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 102f-100b 968ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 103f-100b 969ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 104f-100b 970ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 105f-100b 971ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 106f-100b 972ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 107f-100b 973ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 108f-100b 974ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 109f-100b 975ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 110f-100b 976ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 111f-100b 977ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 112f-100b 978ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 113f-100b 979ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie
.hword 114f-100b 980ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 115f-100b 981ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 116f-100b 982ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 117f-100b 983ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 118f-100b 984ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 119f-100b 985ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 120f-100b 986ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 121f-100b 987ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 122f-100b 988ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 123f-100b 989ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 124f-100b 990ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie .hword 125f-100b 991446788007efe0a673d0366284026adfa17b36fedSimon Hosie .align 4 9924bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 9934bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 125: ld1 {v12.8h}, [x9] /* labels 118-125 load v12 (and v13) from memory addressed from x9 for one side of the tap; the other side comes from registers */ 994446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v3.h[1] 995446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v3.h[1] 996446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v10.4h, v3.h[1] 997446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v10.8h, v3.h[1] 9984bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 124: add x12, x9, #0x08 9994bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 /* clears bit 6 of the address - NOTE(review): presumably wraps inside a small circular buffer at x9; confirm against the code that sets up x9 */ 1000446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v12.4h}, [x12], #8 10014bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 1002446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v13.4h}, [x12] 1003446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v3.h[0] 1004446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v13.4h, v3.h[0] 10054bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v9.8h, v3.h[0]
1006446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v10.4h, v3.h[0] 10074bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 123: add x12, x9, #0x10 10084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 1009446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v12.8h}, [x12] 1010446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[7] 1011446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[7] 10124bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v9.4h, v2.h[7] 10134bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v9.8h, v2.h[7] 10144bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 122: add x12, x9, #0x18 10154bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 1016446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v12.4h}, [x12], #8 10174bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 1018446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v13.4h}, [x12] 1019446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[6] 1020446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v13.4h, v2.h[6] 10214bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v8.8h, v2.h[6] 10224bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v9.4h, v2.h[6] 10234bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 121: add x12, x9, #0x20 10244bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 1025446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v12.8h}, [x12] 1026446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[5] 1027446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[5] 10284bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v8.4h, v2.h[5] 10294bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v8.8h, v2.h[5]
10304bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 120: add x12, x9, #0x28 10314bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 1032446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v12.4h}, [x12], #8 10334bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 1034446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v13.4h}, [x12] 1035446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[4] 1036446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v13.4h, v2.h[4] 10374bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v7.8h, v2.h[4] 10384bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v8.4h, v2.h[4] 10394bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 119: add x12, x9, #0x30 10404bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 1041446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v12.8h}, [x12] 1042446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[3] 1043446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v12.8h, v2.h[3] 10444bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v7.4h, v2.h[3] 10454bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v7.8h, v2.h[3] 10464bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 118: add x12, x9, #0x38 10474bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x12, x12, #0x40 10484bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie ld1 {v12.4h}, [x12] 1049446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v12.4h, v2.h[2] 10504bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v17.4h, v2.h[2] 10514bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v6.8h, v2.h[2] 10524bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v7.4h, v2.h[2] 10534bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 117: umlal v14.4s, v17.4h, v2.h[1] /* from label 117 down, every operand is already held in a register (no further loads) */
10544bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v17.8h, v2.h[1] 10554bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v6.4h, v2.h[1] 10564bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v6.8h, v2.h[1] 10574bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 116: umlal2 v14.4s, v17.8h, v2.h[0] 10584bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v18.4h, v2.h[0] 10594bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v5.8h, v2.h[0] 10604bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v6.4h, v2.h[0] 10614bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 115: umlal v14.4s, v18.4h, v1.h[7] 10624bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v18.8h, v1.h[7] 10634bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v5.4h, v1.h[7] 10644bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v5.8h, v1.h[7] 10654bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 114: umlal2 v14.4s, v18.8h, v1.h[6] 10664bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v19.4h, v1.h[6] 1067446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v14.4s, v4.8h, v1.h[6] 10684bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v5.4h, v1.h[6] 10694bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 113: umlal v14.4s, v19.4h, v1.h[5] 10704bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v19.8h, v1.h[5] 1071446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v14.4s, v4.4h, v1.h[5] 1072446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal2 v15.4s, v4.8h, v1.h[5] 10734bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 112: umlal2 v14.4s, v19.8h, v1.h[4] 10744bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v20.4h, v1.h[4] 10754bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v31.8h, v1.h[4] 1076446788007efe0a673d0366284026adfa17b36fedSimon Hosie umlal v15.4s, v4.4h,
v1.h[4] 10774bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 111: umlal v14.4s, v20.4h, v1.h[3] 10784bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v20.8h, v1.h[3] 10794bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v31.4h, v1.h[3] 10804bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v31.8h, v1.h[3] 10814bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 110: umlal2 v14.4s, v20.8h, v1.h[2] 10824bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v21.4h, v1.h[2] 10834bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v30.8h, v1.h[2] 10844bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v31.4h, v1.h[2] 10854bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 109: umlal v14.4s, v21.4h, v1.h[1] 10864bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v21.8h, v1.h[1] 10874bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v30.4h, v1.h[1] 10884bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v30.8h, v1.h[1] 10894bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 108: umlal2 v14.4s, v21.8h, v1.h[0] 10904bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v22.4h, v1.h[0] 10914bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v29.8h, v1.h[0] 10924bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v30.4h, v1.h[0] 10934bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 107: umlal v14.4s, v22.4h, v0.h[7] 10944bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v22.8h, v0.h[7] 10954bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v29.4h, v0.h[7] 10964bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v29.8h, v0.h[7] 10974bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 106: umlal2 v14.4s, v22.8h, v0.h[6] 10984bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v23.4h, v0.h[6] 10994bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie
umlal2 v14.4s, v28.8h, v0.h[6] 11004bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v29.4h, v0.h[6] 11014bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 105: umlal v14.4s, v23.4h, v0.h[5] 11024bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v23.8h, v0.h[5] 11034bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v28.4h, v0.h[5] 11044bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v28.8h, v0.h[5] 11054bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 104: umlal2 v14.4s, v23.8h, v0.h[4] 11064bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v24.4h, v0.h[4] 11074bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v27.8h, v0.h[4] 11084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v28.4h, v0.h[4] 11094bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 103: umlal v14.4s, v24.4h, v0.h[3] 11104bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v24.8h, v0.h[3] 11114bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v27.4h, v0.h[3] 11124bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v27.8h, v0.h[3] 11134bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 102: umlal2 v14.4s, v24.8h, v0.h[2] 11144bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v25.4h, v0.h[2] 11154bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v14.4s, v26.8h, v0.h[2] 11164bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v15.4s, v27.4h, v0.h[2] 11174bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie 101: umlal v14.4s, v25.4h, v0.h[1] 11184bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v25.8h, v0.h[1] 11194bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal v14.4s, v26.4h, v0.h[1] 11204bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie umlal2 v15.4s, v26.8h, v0.h[1] 1121446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1122446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v14.4h,
v14.4s, #16 1123446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn2 v14.8h, v15.4s, #16 1124446788007efe0a673d0366284026adfa17b36fedSimon Hosie uqrshrn v15.8b, v14.8h, #FRACTION_BITS /* the 8-bit results land in v15.8b */ 1125446788007efe0a673d0366284026adfa17b36fedSimon Hosie 11264bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie st1 {v17.16b}, [x9], #16 /* v17 is written to [x9] before the window shift overwrites it; x9 then advances by 16 */ 11274bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie bic x9, x9, #0x40 /* same bit-6 clear as is applied to the load addresses in labels 118-124 */ 11284bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v17.16b, v18.16b 11294bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v18.16b, v19.16b 11304bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v19.16b, v20.16b 11314bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v20.16b, v21.16b 11324bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v21.16b, v22.16b 11334bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v22.16b, v23.16b 11344bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v23.16b, v24.16b 11354bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v24.16b, v25.16b 11364bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v25.16b, v26.16b 11374bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v26.16b, v27.16b 11384bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v27.16b, v28.16b 11394bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v28.16b, v29.16b 11404bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v29.16b, v30.16b 11414bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v30.16b, v31.16b 11424bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie mov v31.16b, v4.16b 1143446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v4.16b, v5.16b 1144446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v5.16b, v6.16b 1145446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v6.16b, v7.16b 1146446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v7.16b, v8.16b 1147446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v8.16b, v9.16b 1148446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v9.16b,
v10.16b 1149446788007efe0a673d0366284026adfa17b36fedSimon Hosie mov v10.16b, v11.16b 1150446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/ 1151446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1152446788007efe0a673d0366284026adfa17b36fedSimon Hosie/* Dedicated function wrapper for the fetch macro, for the cases where 1153446788007efe0a673d0366284026adfa17b36fedSimon Hosie * performance isn't that important, to keep code size down. 1154446788007efe0a673d0366284026adfa17b36fedSimon Hosie */ 1155ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon HosiePRIVATE(fetch_generic_asm) 1156446788007efe0a673d0366284026adfa17b36fedSimon Hosie stp x10, x11, [sp, #-16]! /* x10/x11 are saved here and restored after the macro expansion - NOTE(review): presumably the fetch macro clobbers them; confirm against its definition */ 1157446788007efe0a673d0366284026adfa17b36fedSimon Hosie fetch 1158446788007efe0a673d0366284026adfa17b36fedSimon Hosie ldp x10, x11, [sp], #16 1159446788007efe0a673d0366284026adfa17b36fedSimon Hosie ret 1160446788007efe0a673d0366284026adfa17b36fedSimon HosieEND(fetch_generic_asm) 1161446788007efe0a673d0366284026adfa17b36fedSimon Hosie 11625a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 11635a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory 11645a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * beyond that limit, and filling the rest of the vector with the first legal 11655a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * pixel. 11665a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Result is in v10 and v11. v8 and v9 are filled with the first legal pixel. 11675a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Note: This function can read beyond the right edge of input if the image is 11685a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * narrower than 16 bytes. 11695a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 11705a1f196d68d54513c081958adf4ce3dcafed9ea2Simon HosiePRIVATE(fetch_clampleft1) 11715a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie stp x29, x30, [sp, #-16]!
11725a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie bl fetch_generic_asm 11735a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie dup v8.8h, v10.h[0] /* v8 and v9 = the first fetched halfword replicated into every lane */ 11745a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie dup v9.8h, v10.h[0] 11755a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ands x12, x10, #15 /* x12 = x10 & 15; when it is zero the fetched v10/v11 are returned unchanged */ 11765a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie beq 1f 11775a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x1, x1, x12 /* x1, x15, x19 and x10 each step back by x12 - NOTE(review): presumably these track the input position so the columns dropped below are fetched again; confirm against the fetch macro and callers */ 11785a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x15, x15, x12 11795a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x19, x19, x12 11805a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x10, x10, x12 11815a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x12, sp, x12, LSL #1 11825a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub sp, sp, #64 11835a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x12, x12, #32 /* x12 = new sp + 32 - 2 * (padding count): that many halfwords before the slot where v10 is stored */ 11845a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie st1 {v8.8h, v9.8h, v10.8h,v11.8h}, [sp] /* 64-byte scratch at sp: v8, v9, v10, v11 in that order */ 11855a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ld1 {v10.8h,v11.8h}, [x12] /* reload 32 bytes from the shifted address, so v10:v11 start with padding halfwords taken from v9 followed by the fetched data */ 11865a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie add sp, sp, #64 11875a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie1: ldp x29, x30, [sp], #16 11885a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ret 11895a1f196d68d54513c081958adf4ce3dcafed9ea2Simon HosieEND(fetch_clampleft1) 11905a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 11915a1f196d68d54513c081958adf4ce3dcafed9ea2Simon HosiePRIVATE(fetch_clampleft4) /* Same sequence as fetch_clampleft1, except that v8 and v9 are filled with copies of v10.d[0] (the first 64 bits fetched) instead of v10.h[0]. */ 11925a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie stp x29, x30, [sp, #-16]!
11935a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie bl fetch_generic_asm 11945a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie dup v8.2d, v10.d[0] /* replicate the first four halfwords as one 64-bit unit */ 11955a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie dup v9.2d, v10.d[0] 11965a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ands x12, x10, #15 11975a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie beq 1f 11985a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x1, x1, x12 11995a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x15, x15, x12 12005a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x19, x19, x12 12015a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x10, x10, x12 12025a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x12, sp, x12, LSL #1 12035a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub sp, sp, #64 12045a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x12, x12, #32 12055a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie st1 {v8.8h, v9.8h, v10.8h,v11.8h}, [sp] /* spill padding and data to a 64-byte stack scratch, then reload v10:v11 from an address shifted back by 2 * (x10 & 15) bytes */ 12065a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ld1 {v10.8h,v11.8h}, [x12] 12075a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie add sp, sp, #64 12085a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie1: ldp x29, x30, [sp], #16 12095a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ret 12105a1f196d68d54513c081958adf4ce3dcafed9ea2Simon HosieEND(fetch_clampleft4) 12115a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 12125a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding 12135a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * reading memory beyond that limit, and filling the rest of the vector with 12145a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the last legal pixel. 12155a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Result is in v10 and v11. v12 and v13 are filled with the last legal pixel.
* Note: This function can read beyond the left edge of input if the image is
 * narrower than 16 bytes.
 *
 *   In:       x11 = fill stop index; x1, x15, x19 = row pointers
 *   Clobbers: x12, flags, plus whatever fetch_generic_asm clobbers.
 */
PRIVATE(fetch_clampright1)
            stp         x29, x30, [sp, #-16]!

            /* k = (-x11) & 15: number of columns by which a full 16-column
             * fetch would overshoot.  k == 0 takes the short path at 1:.
             */
            neg         x12, x11
            ands        x12, x12, #15
            beq         1f

            /* Rewind the row pointers by k so the fetch ends on the last
             * legal column.
             */
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm

            /* The last lane fetched is the last legal value. */
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]

            /* k is recomputed here rather than kept live across the call. */
            neg         x12, x11
            and         x12, x12, #15

            /* Store [v10 v11 v12 v13] to stack scratch and reload v10/v11
             * from 2*k bytes further on: this drops the k leading lanes and
             * pulls k lanes of padding in at the end.
             */
            sub         sp, sp, #64
            add         x12, sp, x12, LSL #1
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret

            /* Short path: a whole chunk is legal, only the padding value
             * needs to be prepared.
             */
1:          bl          fetch_generic_asm
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright1)

/* fetch_clampright4: same sequence as fetch_clampright1, except that the
 * padding value is the last 64-bit element of v11 (four 16-bit lanes --
 * presumably one four-channel pixel) rather than a single 16-bit lane.
 */
PRIVATE(fetch_clampright4)
            stp         x29, x30, [sp, #-16]!

            /* k = (-x11) & 15; k == 0 takes the short path at 1:. */
            neg         x12, x11
            ands        x12, x12, #15
            beq         1f

            /* Rewind the row pointers by k before fetching. */
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]

            /* Recompute k and shift the chunk along by 2*k bytes via the
             * stack, pulling padding in at the end.
             */
            neg         x12, x11
            and         x12, x12, #15
            sub         sp, sp, #64
            add         x12, sp, x12, LSL #1
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret

            /* Short path: whole chunk legal, just prepare the padding. */
1:          bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright4)

/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
 * value across to fill the rest of the register pair.  Used for filling the
 * right hand edge of the window when reading too close to the right hand edge
 * of the image.
 * Also returns a dup-ed copy of the last element in v12 for the tail-fill
 * case (this happens incidentally in common path, but must be done
 * deliberately in the fast-out path).
*/

/* prefill_sweepright1: single-channel variant; the swept value is one 16-bit
 * lane.
 *   In:       v10, v11 = chunk, x11 = fill stop index
 *   Out:      v10, v11 = chunk with every lane from the last legal one
 *                        onward replaced by the last legal value
 *             v12, v13 = the last legal value in every lane
 *   Clobbers: x12, flags; uses 64 bytes of stack scratch.
 * When (x11 & 15) == 0 the chunk is left untouched and v12/v13 are filled
 * from the last lane of v11.
 */
PRIVATE(prefill_sweepright1)
            ands        x12, x11, #15
            beq         1f
            sub         x12, x12, #1            /* index of last legal lane */
            sub         sp, sp, #64
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1    /* address of that lane */
            ld1r        {v12.8h}, [x12]
            ld1r        {v13.8h}, [x12]
            /* 32 bytes of the replicated value, starting at the last legal
             * lane, always reach past the end of the stored v10/v11 pair.
             */
            st1         {v12.8h,v13.8h}, [x12]
            ld1         {v10.8h,v11.8h}, [sp]
            add         sp, sp, #64
            ret
1:          dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ret
END(prefill_sweepright1)

/* prefill_sweepright4: four-channel variant; the swept value is one 64-bit
 * element (four 16-bit lanes).  Same interface as prefill_sweepright1.
 *
 * Both v12 and v13 have to be loaded with the replicated element and both
 * have to be stored: a single 16-byte store starting at the last legal
 * element does not reach the end of the 32-byte v10/v11 image on the stack,
 * and prefill_rightfill pads the window from v12 and v13 alike.
 */
PRIVATE(prefill_sweepright4)
            ands        x12, x11, #15
            beq         1f
            sub         x12, x12, #4            /* first lane of last legal element */
            sub         sp, sp, #64
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1    /* address of that element */
            ld1r        {v12.2d}, [x12]
            ld1r        {v13.2d}, [x12]
            /* Highest start offset is 16 bytes, so the 32-byte store ends at
             * or before byte 48 of the 64-byte scratch area.
             */
            st1         {v12.8h,v13.8h}, [x12]
            ld1         {v10.8h,v11.8h}, [sp]
            add         sp, sp, #64
            ret
1:          dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            ret
END(prefill_sweepright4)

/* The main loop keeps a sliding window of data that has already been convolved
 * in the vertical axis for the current line.  This usually stays in the
 * register file, but spills to memory for large windows.  The first thing that
 * needs to be done at start-up is to fill this window with image data, taking
 * into account the padding needed if the left or right edges of the image fall
 * within this window.
 */

/* Because the window is in the register file writes to it cannot be indexed
 * by another register.  Consequently the fill loops are unrolled to address
 * the registers directly.
This macro distinguishes between writes to the
 * register file and writes to the spill buffer (indicated by a destination
 * register named xx).
 *
 *   ra, rb   -- destination window registers, or xx for the spill buffer
 *   sra, srb -- source registers holding the 32 bytes to be written
 *
 * Three cases are generated:
 *   ra == xx, rb == xx : both halves are stored through x9 (post-incremented
 *                        by 32).
 *   ra == xx only      : bit 6 of x9 is cleared, sra is stored through x9
 *                        (post-incremented by 16) and srb is moved to rb.
 *                        NOTE(review): the bic presumably wraps the spill
 *                        pointer -- confirm against the buffer layout.
 *   neither is xx      : plain register moves, omitted when source and
 *                        destination are spelled identically.
 */
.macro prefill_out ra, rb, sra, srb
  .ifc \ra,xx
    .ifc \rb,xx
            st1         {\sra,\srb}, [x9], #32
    .else
            bic         x9, x9, #0x40
            st1         {\sra}, [x9], #16
            mov         \rb, \srb
    .endif
  .else
    .ifnc \ra,\sra
            mov         \ra, \sra
    .endif
    .ifnc \rb,\srb
            mov         \rb, \srb
    .endif
  .endif
.endm

/* This macro provides the list of registers representing the window, and the
 * cases where the register file is too small and a spill buffer is used
 * instead.
* Since several specialisations of each function are generated, this also
 * culls superfluous iterations, and sets the variable `i` for subsequent
 * macros indicating the current index into the window.
 *
 *   macro     -- stage to expand (prefill_<macro> is invoked per chunk)
 *   nextmacro -- stage whose labels are handed to the stage as its next and
 *                after targets
 *   max_r     -- accepted for symmetry with the callers; the culling below
 *                is driven by the assembler symbol windowsize instead
 *   step      -- forwarded to the stage macro
 *   label     -- prefix that makes the generated labels unique
 *
 * For each chunk number N from 13 down to 1 that fits in windowsize, `i` is
 * set to windowsize - N * 16, the label <label><macro>N is emitted, and the
 * stage is invoked with next = <label><nextmacro>N and
 * after = <label><nextmacro>(N-1).  Chunks 13 and 12 live entirely in the
 * spill buffer, chunk 11 is half spill buffer and half v17, and the remaining
 * chunks are register pairs.  The trailing <label><macro>0 label branches to
 * <label>_end.
 */
.macro prefill_list, macro, nextmacro, max_r, step, label
  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
    .if windowsize >= (\line * 16)
      .set i, windowsize - (\line * 16)
\label\macro\line:
            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
    .endif
  .endm
            ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label
            ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label
            ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label
            ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
            ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
            ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
            ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
            ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
            ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
            ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
            ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
            ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
            ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label
\label\macro\()0:
            b           \label\()_end
  .purgem ifneeded
.endm

/* These macros represent the possible stages of filling the window.
 * Each macro is unrolled enough times that it can fill the entire window
 * itself, but normally it will have to hand control to subsequent macros
 * part-way through and this is done using labels named \next and \after, where
 * \next is the next macro starting at the same window position and \after is
 * the next macro starting after the current window position.
*/

/* leftfill: v8 and v9 contain the left padding value.  While the window
 * extends outside of the image on the left-hand side, and at least 16 more
 * padding values are needed in the window, store v8 and v9 into the window.
 * Otherwise skip forward to storing image data.
 *
 * Concretely: branch to next when x10 (fill start index, unsigned) is below
 * i + 16, i.e. when legal image data begins inside the chunk at window
 * position i; otherwise write the padding pair and fall through to the
 * following chunk.  The after and step arguments are not used by this stage.
 */
.macro prefill_leftfill, next, after, ra, rb, step
            cmp         x10, #i+16
            blo         \next
            prefill_out \ra, \rb, v8.16b, v9.16b
.endm

/* leftedge: The very first non-fill or partial-fill chunk from the image is
 * already loaded (as it was used to calculate the left padding value), so
 * store it here, and then drop into the regular load/store cycle in the next
 * macro.
*/
.macro prefill_leftedge, next, after, ra, rb, step
            /* Unconditional: write the chunk already held in v10/v11 and
             * continue with the following stage at the next window position.
             */
1:          prefill_out \ra, \rb, v10.16b, v11.16b
            b           \after
.endm

/* dofetch: Copy chunks of the image into the window without any complications
 * from edge conditions.
 *
 * Hands over to next as soon as x11 (fill stop index, unsigned) is at or
 * below i + 16, i.e. when the chunk at window position i is the last one
 * that holds legal data; otherwise fetches a full chunk, writes it and falls
 * through to the following chunk.
 */
.macro prefill_dofetch, next, after, ra, rb, step
            cmp         x11, #i+16
            bls         \next
            bl          fetch_generic_asm
            prefill_out \ra, \rb, v10.16b, v11.16b
.endm

/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
 * the right-hand edge of the image.  In that case sweep the last valid pixel
 * across the rest of the chunk, and in either case prepare padding data in v12
 * and v13 for the next macro.  This is done in fetch_clampright.
 * This only happens once before going on to the next macro.
* Sometimes leftedge also covers the rightedge case, in which case this has
 * to be skipped altogether.
 *
 * The skip is the first test: when x11 (fill stop index, unsigned) is at or
 * below i there is no legal data left for this chunk, so control goes
 * straight to next.  Otherwise the clamped fetch for this step size is
 * called, the chunk is written, and control continues at after.
 */
.macro prefill_rightedge, next, after, ra, rb, step
            cmp         x11, #i
            bls         \next
            bl          fetch_clampright\step
            prefill_out \ra, \rb, v10.16b, v11.16b
            b           \after
.endm

/* rightfill: The rest of the window is simply filled with right padding from
 * v12 and v13.
 *
 * There is no test and no branch: each expansion writes one chunk of padding
 * and falls through to the expansion for the following chunk.
 */
.macro prefill_rightfill, next, after, ra, rb, step
            prefill_out \ra, \rb, v12.16b, v13.16b
.endm

/* Here all of the macros above are unrolled and laid out in the proper order.
*/

/* prefill_body: emit the five fill stages back to back, each one paired with
 * the stage it hands over to, and close with the <label>_end label that every
 * stage's final chunk branches to.  The hand-over name given for rightfill
 * (oops) is never used as a branch target, because prefill_rightfill ignores
 * its next and after arguments.
 */
.macro prefill_body, max_r, step, label
            prefill_list leftfill,  leftedge,  \max_r, \step, \label
            prefill_list leftedge,  dofetch,   \max_r, \step, \label
            prefill_list dofetch,   rightedge, \max_r, \step, \label
            prefill_list rightedge, rightfill, \max_r, \step, \label
            prefill_list rightfill, oops,      \max_r, \step, \label
\label\()_end:
.endm


/* Fill the convolution window with context data.  The aim here is to load
 * exactly 2*r columns, and in the main loop to read as many columns as will be
 * written.  This is complicated by the window being divided into chunks at
 * register boundaries, and the need to handle cases when the input starts very
 * close to the left or right (or both) edges of the image and the need to fill
 * the spaces that leaves with left and right edge padding values.
*
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x3 -- count
 *      x4 -- available image data right of src pointer
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x8 -- available image data left of src pointer
 *      x9 -- buffer (if needed)
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x4 -= min(inlen, count + windowsize - centertap)
 *      x1 += min(inlen, count + windowsize - centertap)
 *      x15 += min(inlen, count + windowsize - centertap)
 *      x19 += min(inlen, count + windowsize - centertap)
 * Modifies:
 *      x10 -- fill start index in the window
 *      x11 -- fill stop index in the window
 *      x12 -- scratch
 *
 * Assembler symbols set here and used by the stage macros:
 *      windowsize -- 2 * max_r * step, rounded up to a multiple of 16
 *      centertap  -- windowsize - max_r * step
 */
.macro prefill step=1, max_r=25, label=xx
.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
.set centertap, (windowsize - \max_r * \step)
            /* x10 = centertap - x8, clamped to zero when x8 > centertap. */
            mov         x10, #centertap
            subs        x10, x10, x8
            csel        x10, xzr, x10, lo

            /* x11 = windowsize when x4 covers the right half of the window,
             * otherwise x4 + centertap.
             */
            subs        x11, x4, #windowsize - centertap
            csel        x11, xzr, x11, hs
            add         x11, x11, #windowsize

            /* x10 indicates where in the window legal image data begins.
             * x11 indicates where in the window legal image data ends.
             * When starting near the centre of a large image these would be
             * zero and windowsize respectively, but when starting near the
             * edges this can change.
             * When starting on the leftmost pixel, x10 will be centertap.
             * When starting on the rightmost pixel, x11 will be centertap+1.
             */

            /* x4 indicates how much data there is between the current pointers
             * and the right edge of the image.  The pointers currently point
             * to the data needed at centertap.  The subsequent code will
             * consume (windowsize - x10) data, but only the data from
             * centertap to windowsize comes out of x4's budget.
             */
1:          subs        x4, x4, #windowsize - centertap
            csel        x4, xzr, x4, lo

            /* And the pointers need to rewind to the start of the window.
             */
            sub         x1, x1, #centertap
            sub         x15, x15, #centertap
            sub         x19, x19, #centertap

            /* Unless x8 indicated that there wasn't that much data available.
             */
            add         x1, x1, x10
            add         x15, x15, x10
            add         x19, x19, x10

            /* Get the first chunk, and add padding to align it to the window
             * if necessary.
             */
            bl          fetch_clampleft\step

            /* Sometimes the start and the end of the window are in the same
             * chunk.  In that case both ends need filler at the outset.
             * (x10 and x11 - 1 lie in the same 16-column chunk exactly when
             * their exclusive-or is below 16.)
             */
            sub         x12, x11, #1
            eor         x12, x10, x12
            cmp         x12, #16
            bhs         1f
            bl          prefill_sweepright\step

            /* Iterate through all the points in the window and fill them in
             * with padding or image data as needed.
             */
1:          prefill_body \max_r, \step, \label
.endm

/* The main body of the convolve functions.  Having already pre-filled the
 * convolution window with 2*r input values, the logic settles into a regular
 * pattern of reading and writing at a 1:1 rate until either input or output
 * expires.
The input leads the output by r values, so when processing all the 15515a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * way to the right-hand edge, or within r pixels of that edge, the input will 15525a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * run out first. In the case of very narrow images, or sub-windows starting 15535a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * near the right edge, the input may already have run out while the 15545a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * convolution window was being filled and this loop will start with a 15555a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * zero-length input. 15565a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * 15575a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Once the input runs out, the rest of the output must be processed by padding 15585a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the remainder of the window with pad value from the last valid pixel from 15595a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the source. 
1560446788007efe0a673d0366284026adfa17b36fedSimon Hosie * 1561446788007efe0a673d0366284026adfa17b36fedSimon Hosie * Input: 1562446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x0 = dst 1563446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x1 = src 1564446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x2 = pitch 1565446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x3 = count 1566446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x4 = inlen 1567446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x5 = r 1568446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x6 = rup 1569446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x7 = rdn 1570446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x9 = buffer 1571446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x13 = -pitch 1572446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x15 = top-row in 1573ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie * x19 = bottom-row in 1574446788007efe0a673d0366284026adfa17b36fedSimon Hosie * Modifies 1575446788007efe0a673d0366284026adfa17b36fedSimon Hosie * x8 = fetch code pointer 1576446788007efe0a673d0366284026adfa17b36fedSimon Hosie */ 15775a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro conv_body core, step=1, max_r=25, labelc="", labelnc="" 15785a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 15795a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* If x4 >= x3 then there's no need for clipping. The main loop 15805a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * needs to exit when either x3 or x4 runs out, so clamp x4 to be 15815a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * no greater than x3 and use x4 for the loop. 
15825a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * However, if x4 comes out of the loop with less than 16 bytes 15835a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * left, a partial read would be necessary to avoid reading beyond 15845a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the end of the image. To avoid this, clamp x4 to the next 15855a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * multiple of 16, which is still sufficient to force it out of the 15865a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * loop but doesn't imply a rewind. 15875a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 15885a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie add x12, x3, #15 15895a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie bic x12, x12, #15 15905a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie cmp x4, x12 15915a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie csel x4, x12, x4, hi 15925a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 15935a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* First calculate the entry-point into the internal fetch logic. 15945a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * This is done so the same function can service several kernel 15955a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * sizes. 
15965a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 1597446788007efe0a673d0366284026adfa17b36fedSimon Hosie adrp x8, \labelnc 1598446788007efe0a673d0366284026adfa17b36fedSimon Hosie add x8, x8, #:lo12:\labelnc 1599446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub x8, x8, x5, LSL #5 1600446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub x8, x8, x5, LSL #3 1601446788007efe0a673d0366284026adfa17b36fedSimon Hosie cmp x5, x6 1602446788007efe0a673d0366284026adfa17b36fedSimon Hosie ccmp x5, x7, #0, eq 1603446788007efe0a673d0366284026adfa17b36fedSimon Hosie beq 5f 1604446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1605446788007efe0a673d0366284026adfa17b36fedSimon Hosie /* if (r != rup || r != rdn) then the address-clamping table should 1606446788007efe0a673d0366284026adfa17b36fedSimon Hosie * be used rather than the short-cut version. 1607446788007efe0a673d0366284026adfa17b36fedSimon Hosie */ 1608446788007efe0a673d0366284026adfa17b36fedSimon Hosie adrp x8, \labelc 1609446788007efe0a673d0366284026adfa17b36fedSimon Hosie add x8, x8, #:lo12:\labelc 1610446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub x8, x8, x5, LSL #6 1611446788007efe0a673d0366284026adfa17b36fedSimon Hosie add x8, x8, x5, LSL #3 1612446788007efe0a673d0366284026adfa17b36fedSimon Hosie b 5f 1613446788007efe0a673d0366284026adfa17b36fedSimon Hosie 16145a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* Main loop: ... */ 16155a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie .align 4 16165a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie3: /* first perform a vertical convolution from memory to get the next 16175a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * 16 taps of the horizontal window into the register file... 
16185a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 16195a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8 16205a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 16215a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* ...then perform a horizontal convolution on that window to 16225a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * produce eight output bytes, and slide the window along. 16235a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * This has to be done twice to match the 16-way vertical pass. 16245a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * It would be preferable to have twice the work done in \core, but 16255a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * that would demand yet another variant on those macros and would 16265a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * perturb the register allocation severely. 1627446788007efe0a673d0366284026adfa17b36fedSimon Hosie */ 1628446788007efe0a673d0366284026adfa17b36fedSimon Hosie \core 1629446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v15.8b}, [x0], #8 1630446788007efe0a673d0366284026adfa17b36fedSimon Hosie \core 1631446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v15.8b}, [x0], #8 1632446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1633446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub x3, x3, #16 1634446788007efe0a673d0366284026adfa17b36fedSimon Hosie5: subs x4, x4, #16 16355a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie bhi 3b 16365a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* Here there's 16 or fewer bytes available before the edge of the 16375a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * source image. x4 holds that count minus 16 (because it was 16385a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * decremented before the first iteration ran). 
The last read may 16395a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * not be a whole chunk, and beyond that a fill value must be used. 16405a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * 16415a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Of course, none of that matters if there's no more output to 16425a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * produce... 16435a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 16445a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie cbz x3, 5f 16455a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 16465a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* Oh well. */ 1647446788007efe0a673d0366284026adfa17b36fedSimon Hosie adds x4, x4, #16 1648446788007efe0a673d0366284026adfa17b36fedSimon Hosie bne 1f 1649446788007efe0a673d0366284026adfa17b36fedSimon Hosie .if \step==1 1650446788007efe0a673d0366284026adfa17b36fedSimon Hosie dup v10.8h, v9.h[7] 1651446788007efe0a673d0366284026adfa17b36fedSimon Hosie dup v11.8h, v9.h[7] 1652446788007efe0a673d0366284026adfa17b36fedSimon Hosie .else 1653446788007efe0a673d0366284026adfa17b36fedSimon Hosie dup v10.2d, v9.d[1] 1654446788007efe0a673d0366284026adfa17b36fedSimon Hosie dup v11.2d, v9.d[1] 1655446788007efe0a673d0366284026adfa17b36fedSimon Hosie .endif 16565a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie b 3f 1657446788007efe0a673d0366284026adfa17b36fedSimon Hosie 16585a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* To avoid reading past end of input, rewind pointers by (16-x4) 16595a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * to ensure that they're exactly 16 bytes from the edge. 
16605a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 16615a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie1: mov x11, x4 16625a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie bl fetch_clampright\step 16635a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* Now to put this padding to use, perform any remaining 16645a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * iterations. This is done at half the rate of the main loop, 16655a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * because there's no longer pressure from a 16-lane window filler. 16665a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 1667446788007efe0a673d0366284026adfa17b36fedSimon Hosie3: \core 1668446788007efe0a673d0366284026adfa17b36fedSimon Hosie .if \step==1 1669446788007efe0a673d0366284026adfa17b36fedSimon Hosie dup v11.8h, v11.h[7] 1670446788007efe0a673d0366284026adfa17b36fedSimon Hosie .else 1671446788007efe0a673d0366284026adfa17b36fedSimon Hosie dup v11.2d, v11.d[1] 1672446788007efe0a673d0366284026adfa17b36fedSimon Hosie .endif 1673446788007efe0a673d0366284026adfa17b36fedSimon Hosie subs x3, x3, #8 1674446788007efe0a673d0366284026adfa17b36fedSimon Hosie blo 4f 1675446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v15.8b}, [x0], #8 16765a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie bne 3b 16775a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie b 5f 16785a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 16795a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* If the final iteration contained 0 < l < 8 values, then perform 16805a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * a piecewise store of the final vector. 
16815a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 1682446788007efe0a673d0366284026adfa17b36fedSimon Hosie4: tbz x3, #2, 1f 1683446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v15.s}[0], [x0], #4 168428c034238b8891398da625b070279c34185b3494Simon Hosie ext v15.8b, v15.8b, v15.8b, #4 1685446788007efe0a673d0366284026adfa17b36fedSimon Hosie1: tbz x3, #1, 1f 1686446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v15.h}[0], [x0], #2 168728c034238b8891398da625b070279c34185b3494Simon Hosie ext v15.8b, v15.8b, v15.8b, #2 1688446788007efe0a673d0366284026adfa17b36fedSimon Hosie1: tbz x3, #0, 5f 1689446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v15.b}[0], [x0], #1 169028c034238b8891398da625b070279c34185b3494Simon Hosie ext v15.8b, v15.8b, v15.8b, #1 16915a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie5: mov x0, #0 1692446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm 1693446788007efe0a673d0366284026adfa17b36fedSimon Hosie 16945a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie 1695e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh.irp r, TUNED_LIST1, 25 1696ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon HosiePRIVATE(convolve1_\r) 1697446788007efe0a673d0366284026adfa17b36fedSimon Hosie stp x29,x30, [sp, #-16]! 
1698446788007efe0a673d0366284026adfa17b36fedSimon Hosie 16995a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie prefill step=1, max_r=\r, label=.Lcnv1_\r 1700446788007efe0a673d0366284026adfa17b36fedSimon Hosie 17015a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r 1702446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1703446788007efe0a673d0366284026adfa17b36fedSimon Hosie ldp x29,x30, [sp], #16 1704446788007efe0a673d0366284026adfa17b36fedSimon Hosie ret 1705446788007efe0a673d0366284026adfa17b36fedSimon HosieEND(convolve1_\r) 1706446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endr 1707446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1708e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh.irp r, TUNED_LIST4, 25 1709ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon HosiePRIVATE(convolve4_\r) 17105a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x9, sp, #0x40 17115a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie stp x29,x30, [sp, #-(16 + 0x40 + 0x80)]! 17125a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie bic x9, x9, #0x7f 1713446788007efe0a673d0366284026adfa17b36fedSimon Hosie 17145a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie /* x9 now points to a 0x40 byte buffer on the stack whose address 17155a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * has the low 7 bits clear. This allows easy address calculation 17165a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * in the wrap-around cases. 
17175a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */ 1718446788007efe0a673d0366284026adfa17b36fedSimon Hosie 17195a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie prefill step=4, max_r=\r, label=.Lcnv4_\r 1720446788007efe0a673d0366284026adfa17b36fedSimon Hosie 17215a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r 1722446788007efe0a673d0366284026adfa17b36fedSimon Hosie 17235a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ldp x29,x30, [sp], #(16 + 0x40 + 0x80) 1724446788007efe0a673d0366284026adfa17b36fedSimon Hosie ret 1725446788007efe0a673d0366284026adfa17b36fedSimon HosieEND(convolve4_\r) 1726446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endr 1727446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1728446788007efe0a673d0366284026adfa17b36fedSimon Hosie/* void rsdIntrinsicBlurU1_K( 1729446788007efe0a673d0366284026adfa17b36fedSimon Hosie * void *out, // x0 1730446788007efe0a673d0366284026adfa17b36fedSimon Hosie * void *in, // x1 1731446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t w, // x2 1732446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t h, // x3 1733446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t p, // x4 1734446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t x, // x5 1735446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t y, // x6 1736446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t count, // x7 1737446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t r, // [sp] 1738446788007efe0a673d0366284026adfa17b36fedSimon Hosie * uint16_t *tab); // [sp,#8] 1739446788007efe0a673d0366284026adfa17b36fedSimon Hosie */ 1740446788007efe0a673d0366284026adfa17b36fedSimon HosieENTRY(rsdIntrinsicBlurU1_K) 1741ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie stp x19,x30, [sp, #-16]! 
1742446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub x8, sp, #32 1743446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub sp, sp, #64 1744446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v8.1d - v11.1d}, [sp] 1745446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v12.1d - v15.1d}, [x8] 17465a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie mov x8, x5 // x 17475a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ldr w5, [sp,#80] // r 17485a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x9, x2, x8 // w - x 17495a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x10, x3, x6 // h - y 17505a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie mov x2, x4 // pitch 17515a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie mov x3, x7 // count 17525a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x7, x10, #1 // h - y - 1 17535a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie mov x4, x9 // inlen = (w - x) 1754446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1755ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie ldr x12, [sp, #88] // tab 1756446788007efe0a673d0366284026adfa17b36fedSimon Hosie 17575a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie add x1, x1, x8 // src += x 1758446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1759446788007efe0a673d0366284026adfa17b36fedSimon Hosie cmp x6, x5 17605a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie csel x6, x5, x6, hs // rup = min(r, y) 1761446788007efe0a673d0366284026adfa17b36fedSimon Hosie cmp x7, x5 17625a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie csel x7, x5, x7, hs // rdn = min(r, h - y - 1) 1763446788007efe0a673d0366284026adfa17b36fedSimon Hosie 17645a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x13, xzr, x2 // -pitch 1765446788007efe0a673d0366284026adfa17b36fedSimon Hosie msub x15, x2, x6, x1 1766ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie madd x19, x2, x7, x1 1767446788007efe0a673d0366284026adfa17b36fedSimon Hosie 
1768446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v0.8h,v1.8h}, [x12], #32 1769446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v2.8h,v3.8h}, [x12], #32 1770446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1771446788007efe0a673d0366284026adfa17b36fedSimon Hosie adr x30, 1f 1772e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh .irp r, TUNED_LIST1 1773446788007efe0a673d0366284026adfa17b36fedSimon Hosie cmp x5, #\r 1774446788007efe0a673d0366284026adfa17b36fedSimon Hosie bls convolve1_\r 1775446788007efe0a673d0366284026adfa17b36fedSimon Hosie .endr 1776446788007efe0a673d0366284026adfa17b36fedSimon Hosie b convolve1_25 1777446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1778446788007efe0a673d0366284026adfa17b36fedSimon Hosie1: ld1 {v8.1d - v11.1d}, [sp], #32 1779446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v12.1d - v15.1d}, [sp], #32 1780ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie ldp x19,x30, [sp], #16 1781446788007efe0a673d0366284026adfa17b36fedSimon Hosie ret 1782446788007efe0a673d0366284026adfa17b36fedSimon HosieEND(rsdIntrinsicBlurU1_K) 1783446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1784446788007efe0a673d0366284026adfa17b36fedSimon Hosie/* void rsdIntrinsicBlurU4_K( 1785446788007efe0a673d0366284026adfa17b36fedSimon Hosie * void *out, // x0 1786446788007efe0a673d0366284026adfa17b36fedSimon Hosie * void *in, // x1 1787446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t w, // x2 1788446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t h, // x3 1789446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t p, // x4 1790446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t x, // x5 1791446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t y, // x6 1792446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t count, // x7 1793446788007efe0a673d0366284026adfa17b36fedSimon Hosie * size_t r, // [sp] 1794446788007efe0a673d0366284026adfa17b36fedSimon Hosie * uint16_t *tab); // 
[sp,#8] 1795446788007efe0a673d0366284026adfa17b36fedSimon Hosie */ 1796446788007efe0a673d0366284026adfa17b36fedSimon HosieENTRY(rsdIntrinsicBlurU4_K) 1797ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie stp x19,x30, [sp, #-16]! 1798446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub x8, sp, #32 1799446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub sp, sp, #64 1800446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v8.1d - v11.1d}, [sp] 1801446788007efe0a673d0366284026adfa17b36fedSimon Hosie st1 {v12.1d - v15.1d}, [x8] 18025a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie lsl x8, x5, #2 // x 18035a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie lsl x2, x2, #2 18045a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie ldr w5, [sp,#80] // r 18055a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x9, x2, x8 // w - x 18065a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x10, x3, x6 // h - y 18075a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie mov x2, x4 // pitch 18085a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie lsl x3, x7, #2 // count 18095a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie sub x7, x10, #1 // h - y - 1 18105a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie mov x4, x9 // inlen = (w - x) 1811446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1812ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie ldr x12, [sp, #88] 1813446788007efe0a673d0366284026adfa17b36fedSimon Hosie 18145a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie add x1, x1, x8 // in += x 1815446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1816446788007efe0a673d0366284026adfa17b36fedSimon Hosie cmp x6, x5 18175a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie csel x6, x5, x6, hs // rup = min(r, y) 1818446788007efe0a673d0366284026adfa17b36fedSimon Hosie cmp x7, x5 18195a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie csel x7, x5, x7, hs // rdn = min(r, h - y - 1) 1820446788007efe0a673d0366284026adfa17b36fedSimon Hosie 
1821446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1822446788007efe0a673d0366284026adfa17b36fedSimon Hosie sub x13, xzr, x2 1823446788007efe0a673d0366284026adfa17b36fedSimon Hosie msub x15, x2, x6, x1 1824ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie madd x19, x2, x7, x1 1825446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1826446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v0.8h,v1.8h}, [x12], #32 1827446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v2.8h,v3.8h}, [x12], #32 1828446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1829446788007efe0a673d0366284026adfa17b36fedSimon Hosie adr x30, 1f 1830e2af295f94c8fb17ba51d0e6a199d5ca265f92daChih-Hung Hsieh .irp r, TUNED_LIST4 1831446788007efe0a673d0366284026adfa17b36fedSimon Hosie cmp x5, #\r 1832446788007efe0a673d0366284026adfa17b36fedSimon Hosie bls convolve4_\r 1833446788007efe0a673d0366284026adfa17b36fedSimon Hosie .endr 1834446788007efe0a673d0366284026adfa17b36fedSimon Hosie b convolve4_25 1835446788007efe0a673d0366284026adfa17b36fedSimon Hosie 1836446788007efe0a673d0366284026adfa17b36fedSimon Hosie1: ld1 {v8.1d - v11.1d}, [sp], #32 1837446788007efe0a673d0366284026adfa17b36fedSimon Hosie ld1 {v12.1d - v15.1d}, [sp], #32 1838ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie ldp x19,x30, [sp], #16 1839446788007efe0a673d0366284026adfa17b36fedSimon Hosie ret 1840446788007efe0a673d0366284026adfa17b36fedSimon HosieEND(rsdIntrinsicBlurU4_K) 1841