/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* Function-framing helpers, expanded by the C preprocessor:
 *      ENTRY(f)   -- open a global function f in .text, 16-byte aligned
 *      PRIVATE(f) -- as ENTRY but without .globl, so f stays file-local
 *      END(f)     -- record the size of f in the symbol table
 * The symbol type is spelled %function rather than #function: inside a
 * function-like macro '#' is the preprocessor's stringify operator, and '#'
 * is also a comment character on some assembler targets.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,%function; f:
#define PRIVATE(f) .text; .align 4; .type f,%function; f:
#define END(f) .size f, .-f;
20446788007efe0a673d0366284026adfa17b36fedSimon Hosie
//#define ARCH_ARM64_USE_BLUR_PRELOAD
22e0bb9e833075eb665ac10b70c8b5bc8edf0e93a9Simon Hosie
/* Number of fractional bits to preserve in intermediate results.  The
 * intermediate storage is 16-bit, and we started with 8 bit data (the integer
 * part), so this should be between 0 and 8.
 */
.set FRACTION_BITS, 7
/* Default for the max_r argument of the fetch macro: the largest radius for
 * which unrolled vertical taps are generated.
 */
.set MAX_R, 25
29446788007efe0a673d0366284026adfa17b36fedSimon Hosie
30446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`.
 *
 * Everything after `ifcc` on the line is captured as one vararg and emitted
 * verbatim only if the assembler symbol `cc` is non-zero where the line is
 * assembled; otherwise the line produces no output at all.
 */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm
40446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* It's not always clear that prefetching is beneficial and this needs further
 * testing on different cores, so it's made switchable here.
 *
 * VERTPLD(base, #offset) always expands to exactly one instruction: a
 * PLDL1KEEP prefetch of [base, #offset] when ARCH_ARM64_USE_BLUR_PRELOAD is
 * defined, and a nop otherwise.  Keeping the size constant matters because
 * the unrolled loops that use it are entered by computed address.
 */
#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
#else
#define VERTPLD(...) nop
#endif
49e0bb9e833075eb665ac10b70c8b5bc8edf0e93a9Simon Hosie
/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top or
 * bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch directly
 * into the relevant part of the code for an arbitrary convolution radius.  Two
 * variants of the loop are produced; one eliminates the clamping code for a
 * slight speed advantage.
 *
 * Where the macro is called with reg=x, the specified register is taken to
 * contain a pre-calculated pointer into one of the two loops.  With the
 * default reg=x12 the macro computes the entry point into the clamped loop
 * itself, from the radius in x5.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup (r, unless clipped to top of source image)
 *      x7 -- rdn (r, unless clipped to bottom of source image)
 *      x12 -- switch index
 *      v0-v3 -- coefficient table
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x1 += 16
 *      x15 += 16
 *      x19 += 16
 *      v10,v11 -- 16 convolved columns
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      x12 = entry address into the clamped loop (only when reg=x12)
 *      v12-v15 = temporary sums
 *      v16 = temporary (widened sum of the upper and lower row bytes)
 *      condition flags (clamped loop only)
 * NOTE(review): v16 is also named as part of the horizontal convolution
 * window elsewhere in this file; confirm no caller keeps live data in v16
 * across an invocation of this macro.
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  /* cc gates the `ifcc` lines below: they are only assembled when the entry
   * address has to be computed here (reg=x12).
   */
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            /* Load the centre row and widen it to 16 bits per column. */
            ld1         {v15.16b}, [x1], #16
            mov         x10, x15

            uxtl        v14.8h, v15.8b
            VERTPLD(x1, #16)
            uxtl2       v15.8h, v15.16b
  /* Take the address of the end of the clamped loop (label 1 below).  A plain
   * adr is used for small max_r; otherwise adrp/add is used.
   */
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

            /* Start the four 32-bit accumulators with the centre tap, and in
             * between step \reg back by r * 56 bytes (r * 64 - r * 8), where
             * 56 bytes is the size of one clamped iteration (14 instructions).
             */
            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  /* This version of the vertical fetch loop body is used away from the edges
   * of the source image.  The pointers start at the top and bottom source rows
   * and work their way towards the centre on each iteration.  This way the
   * number of taps used can be controlled by jumping directly into the middle
   * of the loop and running to completion.
   * If the loop body changes size then the code which calculates the address
   * of the initial iteration must be updated accordingly.  The body is
   * currently 10 instructions (40 bytes) per iteration.
   */
  .macro vertfetch_noclamp i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            ld1         {v11.16b}, [x11], x13
            uaddl       v16.8h, v10.8b, v11.8b
            uaddl2      v11.8h, v10.16b, v11.16b
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* This version of the vertical fetch loop body is used near the edges of the
   * source image, where one or both of the accesses may start with a clamped
   * value, and the row addresses only begin to change after some number of
   * iterations before the end.
   * After each load the pointer is reset to its starting row (x15 or x19)
   * while the clipped radius (x6 or x7) is still below the tap index \i.
   * If the loop body changes size then the code which calculates the address
   * of the initial iteration must be updated accordingly.  The body is
   * currently 14 instructions (56 bytes) per iteration.
   */
  .macro vertfetch_clamped i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            cmp         x6, #\i
            ld1         {v11.16b}, [x11], x13
            csel        x10, x15, x10, lo
            uaddl       v16.8h, v10.8b, v11.8b
            cmp         x7, #\i
            uaddl2      v11.8h, v10.16b, v11.16b
            csel        x11, x19, x11, lo
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelc at the end of the block.  Taps whose index exceeds max_r (and
   * tap 0, which was handled above) assemble to nothing.
   */
  .align 4
  vertfetch_clamped 27, v3.h[3]
  vertfetch_clamped 26, v3.h[2]
  vertfetch_clamped 25, v3.h[1]
  vertfetch_clamped 24, v3.h[0]
  vertfetch_clamped 23, v2.h[7]
  vertfetch_clamped 22, v2.h[6]
  vertfetch_clamped 21, v2.h[5]
  vertfetch_clamped 20, v2.h[4]
  vertfetch_clamped 19, v2.h[3]
  vertfetch_clamped 18, v2.h[2]
  vertfetch_clamped 17, v2.h[1]
  vertfetch_clamped 16, v2.h[0]
  vertfetch_clamped 15, v1.h[7]
  vertfetch_clamped 14, v1.h[6]
  vertfetch_clamped 13, v1.h[5]
  vertfetch_clamped 12, v1.h[4]
  vertfetch_clamped 11, v1.h[3]
  vertfetch_clamped 10, v1.h[2]
  vertfetch_clamped  9, v1.h[1]
  vertfetch_clamped  8, v1.h[0]
  vertfetch_clamped  7, v0.h[7]
  vertfetch_clamped  6, v0.h[6]
  vertfetch_clamped  5, v0.h[5]
  vertfetch_clamped  4, v0.h[4]
  vertfetch_clamped  3, v0.h[3]
  vertfetch_clamped  2, v0.h[2]
  vertfetch_clamped  1, v0.h[1]
  vertfetch_clamped  0, v0.h[0]
  1:
  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelnc at the end of the block.
   */
  .align 4
  vertfetch_noclamp 27, v3.h[3]
  vertfetch_noclamp 26, v3.h[2]
  vertfetch_noclamp 25, v3.h[1]
  vertfetch_noclamp 24, v3.h[0]
  vertfetch_noclamp 23, v2.h[7]
  vertfetch_noclamp 22, v2.h[6]
  vertfetch_noclamp 21, v2.h[5]
  vertfetch_noclamp 20, v2.h[4]
  vertfetch_noclamp 19, v2.h[3]
  vertfetch_noclamp 18, v2.h[2]
  vertfetch_noclamp 17, v2.h[1]
  vertfetch_noclamp 16, v2.h[0]
  vertfetch_noclamp 15, v1.h[7]
  vertfetch_noclamp 14, v1.h[6]
  vertfetch_noclamp 13, v1.h[5]
  vertfetch_noclamp 12, v1.h[4]
  vertfetch_noclamp 11, v1.h[3]
  vertfetch_noclamp 10, v1.h[2]
  vertfetch_noclamp  9, v1.h[1]
  vertfetch_noclamp  8, v1.h[0]
  vertfetch_noclamp  7, v0.h[7]
  vertfetch_noclamp  6, v0.h[6]
  vertfetch_noclamp  5, v0.h[5]
  vertfetch_noclamp  4, v0.h[4]
  vertfetch_noclamp  3, v0.h[3]
  vertfetch_noclamp  2, v0.h[2]
  vertfetch_noclamp  1, v0.h[1]
  vertfetch_noclamp  0, v0.h[0]
  \labelnc :

  /* The helper macros capture this invocation's max_r, so drop them to let
   * the next invocation of fetch define them afresh.
   */
  .purgem vertfetch_clamped
  .purgem vertfetch_noclamp

  /* Narrow the 32-bit sums to 16 bits with rounding and saturation, keeping
   * FRACTION_BITS fractional bits, and advance the row pointers to the next
   * 16 columns.
   */
  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/
235446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * more data that won't be used and it means that rotating the window involves
 * more mov operations.
 *
 * When the buffer gets too big the buffer at [x9] is used.
 *
 * Input:
 *      v16-v31,v4-v11 -- convolution window
 *      x9 -- pointer to additional convolution window data
 * Output:
 *      x9 -- updated buffer pointer (if used)
 *      d31 -- result to be stored
 * Modifies:
 *      x12 -- temp buffer pointer
 *      v12-v13 -- temporaries for load and vext operations.
 *      v14-v15 -- intermediate sums
 *
 * NOTE(review): hconv1_8 below leaves its eight result bytes in v15.8b, not
 * d31, and also uses x16 as a scratch register; confirm whether the register
 * lists above are still accurate for the other variants.
 */
#define TUNED_LIST1 8, 16
/* Horizontal convolution of eight single-channel columns for radii 1 to 8.
 *
 * Input:
 *      x5 -- r, used to index the branch table
 *      v8,v9,v10 -- window of 16-bit columns; v9 holds the centre
 *      v11 -- newest columns, rotated into the window on exit
 *      v0,v1 -- coefficients: v0.h[0] is the centre tap, v0.h[1..7] are taps
 *               1..7 and v1.h[0] is tap 8
 * Output:
 *      v15.8b -- eight result bytes
 *      v8,v9,v10 -- window advanced by eight columns (v9, v10, v11)
 * Modifies:
 *      x12, x16, v12-v14
 */
.macro hconv1_8/*{{{*/
            /* Centre tap starts the two 32-bit accumulators. */
            umull       v14.4s, v9.4h, v0.h[0]
            umull2      v15.4s, v9.8h, v0.h[0]

            /* Branch to label 100+r through a table of signed 16-bit offsets
             * relative to the table base.
             */
            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   /* Entry 0 (-4) resolves to the br instruction just above the table. */
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .align      4
    /* Tap 8 is a whole register away from the centre, so no ext is needed. */
    108:    umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
            umlal       v14.4s, v10.4h, v1.h[0]
            umlal2      v15.4s, v10.8h, v1.h[0]
    /* Taps 7..1: ext builds the centre shifted left (v12) and right (v13) by
     * the tap distance, and both are accumulated with the same coefficient.
     */
    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    /* Tap 4 is exactly half a register away, so the ext pair is replaced by
     * using the high half of one register and the low half of the next.
     */
    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
            umlal2      v14.4s, v9.8h, v0.h[4]
            umlal       v15.4s, v10.4h, v0.h[4]
    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow 32 -> 16 -> 8 bits with rounding and saturation; the
             * second step removes the FRACTION_BITS fractional bits.
             */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by eight columns. */
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
337446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* hconv1_16: accumulate a symmetric weighted sum over a window of 16-bit
 * elements held in v6..v10 and narrow the result to eight bytes in v15.
 * (The name suggests the single-channel horizontal pass for radii up to 16
 * -- TODO confirm against the callers.)
 *
 * Inputs:
 *   x5       index into the offset table below; it selects how many taps on
 *            each side of the centre are accumulated (1..16).  Presumably
 *            the blur radius -- confirm against the caller.
 *   v0..v2   coefficients as 16-bit lanes: tap k uses lane k%8 of v(k/8);
 *            lane 0 of v0 weights the centre.
 *   v6..v10  data window, v8 being the centre vector; v11 is copied into
 *            v10 on exit.
 * Output:
 *   v15.8b   the sum after two saturating rounding narrowing shifts (by 16,
 *            then by FRACTION_BITS).
 * Clobbers:  x12, x16, v12, v13, v14, v15; v6..v10 are each replaced by the
 *            next register up.
 */
338446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv1_16/*{{{*/
            /* Centre term: v14/v15 = low/high halves of v8 times v0.h[0]. */
339446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umull       v14.4s, v8.4h, v0.h[0]
340446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umull2      v15.4s, v8.8h, v0.h[0]
341446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Computed branch: fetch the signed halfword at index x5 from
             * the table at 100 and add it to the table address.  Execution
             * enters at the label for tap x5 and falls through the remaining
             * taps down to 101.
             */
342ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            adr         x16, 100f
343ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            ldrsh       x12, [x16, x5, LSL #1]
344ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            add         x12, x12, x16
345446788007efe0a673d0366284026adfa17b36fedSimon Hosie            br          x12
            /* Entry 0 is -4, i.e. the address of the br above (one 4-byte
             * instruction before the table); x5 == 0 is presumably never
             * passed in -- confirm against the caller.
             */
346ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie   100:     .hword -4
347ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 101f-100b
348ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 102f-100b
349ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 103f-100b
350ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 104f-100b
351ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 105f-100b
352ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 106f-100b
353ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 107f-100b
354ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 108f-100b
355ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 109f-100b
356ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 110f-100b
357ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 111f-100b
358ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 112f-100b
359ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 113f-100b
360ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 114f-100b
361ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 115f-100b
362ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 116f-100b
363446788007efe0a673d0366284026adfa17b36fedSimon Hosie            .align 4
            /* Each tap k adds coefficient k times the two vectors that start
             * k elements before and k elements after the centre vector v8.
             * Where that offset is a multiple of four elements no ext is
             * needed: whole registers (taps 16 and 8) or register halves
             * (taps 12 and 4, with umlal2 feeding v14 and umlal feeding v15)
             * are used directly.  The ext lines are kept as comments there
             * to show the pattern.
             */
364446788007efe0a673d0366284026adfa17b36fedSimon Hosie    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
365446788007efe0a673d0366284026adfa17b36fedSimon Hosie            //ext         v13.16b, v10.16b, v11.16b, #0*2
366446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v6.4h, v2.h[0]
367446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v6.8h, v2.h[0]
368446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v10.4h, v2.h[0]
369446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v10.8h, v2.h[0]
370446788007efe0a673d0366284026adfa17b36fedSimon Hosie    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
371446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #7*2
372446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[7]
373446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[7]
374446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[7]
375446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[7]
376446788007efe0a673d0366284026adfa17b36fedSimon Hosie    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
377446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #6*2
378446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[6]
379446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[6]
380446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[6]
381446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[6]
382446788007efe0a673d0366284026adfa17b36fedSimon Hosie    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
383446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #5*2
384446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[5]
385446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[5]
386446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[5]
387446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[5]
388446788007efe0a673d0366284026adfa17b36fedSimon Hosie    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
389446788007efe0a673d0366284026adfa17b36fedSimon Hosie            //ext         v13.16b, v9.16b, v10.16b, #4*2
390446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v6.8h, v1.h[4]
391446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v7.4h, v1.h[4]
392446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v9.8h, v1.h[4]
393446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v10.4h, v1.h[4]
394446788007efe0a673d0366284026adfa17b36fedSimon Hosie    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
395446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #3*2
396446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[3]
397446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[3]
398446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[3]
399446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[3]
400446788007efe0a673d0366284026adfa17b36fedSimon Hosie    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
401446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #2*2
402446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[2]
403446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[2]
404446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[2]
405446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[2]
406446788007efe0a673d0366284026adfa17b36fedSimon Hosie    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
407446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #1*2
408446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[1]
409446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[1]
410446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[1]
411446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[1]
412446788007efe0a673d0366284026adfa17b36fedSimon Hosie    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
413446788007efe0a673d0366284026adfa17b36fedSimon Hosie            //ext         v13.16b, v9.16b, v10.16b, #0*2
414446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v7.4h, v1.h[0]
415446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v7.8h, v1.h[0]
416446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v9.4h, v1.h[0]
417446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v9.8h, v1.h[0]
418446788007efe0a673d0366284026adfa17b36fedSimon Hosie    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
419446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #7*2
420446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[7]
421446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[7]
422446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[7]
423446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[7]
424446788007efe0a673d0366284026adfa17b36fedSimon Hosie    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
425446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #6*2
426446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[6]
427446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[6]
428446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[6]
429446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[6]
430446788007efe0a673d0366284026adfa17b36fedSimon Hosie    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
431446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #5*2
432446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[5]
433446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[5]
434446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[5]
435446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[5]
436446788007efe0a673d0366284026adfa17b36fedSimon Hosie    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
437446788007efe0a673d0366284026adfa17b36fedSimon Hosie            //ext         v13.16b, v8.16b, v9.16b, #4*2
438446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v7.8h, v0.h[4]
439446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v8.4h, v0.h[4]
440446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v8.8h, v0.h[4]
441446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v9.4h, v0.h[4]
442446788007efe0a673d0366284026adfa17b36fedSimon Hosie    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
443446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #3*2
444446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[3]
445446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[3]
446446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[3]
447446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[3]
448446788007efe0a673d0366284026adfa17b36fedSimon Hosie    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
449446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #2*2
450446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[2]
451446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[2]
452446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[2]
453446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[2]
454446788007efe0a673d0366284026adfa17b36fedSimon Hosie    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
455446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #1*2
456446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[1]
457446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[1]
458446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[1]
459446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[1]
460446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Narrow the 32-bit sums to 16 bits, then to 8 bits in v15,
             * rounding and saturating (unsigned) at each step.
             */
461446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v14.4h, v14.4s, #16
462446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn2    v14.8h, v15.4s, #16
463446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
464446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Shift the window: each register takes the value of the next
             * one up, and v11 moves into v10.
             */
465446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v6.16b, v7.16b
466446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v7.16b, v8.16b
467446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v8.16b, v9.16b
468446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v9.16b, v10.16b
469446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v10.16b, v11.16b
470446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/
471446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* hconv1_25: accumulate a symmetric weighted sum over a window of 16-bit
 * elements held in v31 and v4..v10 and narrow the result to eight bytes in
 * v15.  (The name suggests the single-channel horizontal pass for radii up
 * to 25 -- TODO confirm against the callers.)
 *
 * Inputs:
 *   x5       index into the offset table below; it selects how many taps on
 *            each side of the centre are accumulated (1..25).  Presumably
 *            the blur radius -- confirm against the caller.
 *   v0..v3   coefficients as 16-bit lanes: tap k uses lane k%8 of v(k/8),
 *            up to lane 1 of v3 for tap 25; lane 0 of v0 weights the centre.
 *   v31, v4..v10
 *            data window.  The centre vector is the eight elements starting
 *            at element 7 of v6.  v11 is copied into v10 on exit.
 * Output:
 *   v15.8b   the sum after two saturating rounding narrowing shifts (by 16,
 *            then by FRACTION_BITS).
 * Clobbers:  x12, x16, v12, v13, v14, v15; v31 and v4..v10 are each replaced
 *            by the next register in the window.
 */
472446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv1_25/*{{{*/
            /* Centre term: the vector starting at element 7 of v6 (spilling
             * into v7), times v0.h[0].
             */
473446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v12.16b, v6.16b, v7.16b, #7*2
474446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umull       v14.4s, v12.4h, v0.h[0]
475446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umull2      v15.4s, v12.8h, v0.h[0]
476446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Computed branch: fetch the signed halfword at index x5 from
             * the table at 100 and add it to the table address.  Execution
             * enters at the label for tap x5 and falls through the remaining
             * taps down to 101.
             */
477ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            adr         x16, 100f
478ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            ldrsh       x12, [x16, x5, LSL #1]
479ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            add         x12, x12, x16
480446788007efe0a673d0366284026adfa17b36fedSimon Hosie            br          x12
            /* Entry 0 is -4, i.e. the address of the br above (one 4-byte
             * instruction before the table); x5 == 0 is presumably never
             * passed in -- confirm against the caller.
             */
481ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie   100:     .hword -4
482ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 101f-100b
483ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 102f-100b
484ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 103f-100b
485ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 104f-100b
486ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 105f-100b
487ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 106f-100b
488ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 107f-100b
489ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 108f-100b
490ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 109f-100b
491ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 110f-100b
492ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 111f-100b
493ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 112f-100b
494ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 113f-100b
495ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 114f-100b
496ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 115f-100b
497ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 116f-100b
498ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 117f-100b
499ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 118f-100b
500ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 119f-100b
501ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 120f-100b
502ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 121f-100b
503ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 122f-100b
504ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 123f-100b
505ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 124f-100b
506ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 125f-100b
507446788007efe0a673d0366284026adfa17b36fedSimon Hosie            .align 4
            /* Each tap k extracts into v12 the vector starting k elements
             * before the centre and into v13 the vector starting k elements
             * after it, then adds both, weighted by coefficient k, into the
             * v14 (low half) and v15 (high half) accumulators.
             */
5084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    125:    ext         v12.16b, v31.16b, v4.16b, #6*2
509446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v10.16b, v11.16b, #0*2
510446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v3.h[1]
511446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v3.h[1]
512446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v3.h[1]
513446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v3.h[1]
5146267c335745f32fb0d898335930da6b0904be577Simon Hosie    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
515446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #7*2
516446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v3.h[0]
517446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v3.h[0]
518446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v3.h[0]
519446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v3.h[0]
520446788007efe0a673d0366284026adfa17b36fedSimon Hosie    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
521446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #6*2
522446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[7]
523446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[7]
524446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v2.h[7]
525446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v2.h[7]
526446788007efe0a673d0366284026adfa17b36fedSimon Hosie    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
527446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #5*2
528446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[6]
529446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[6]
530446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v2.h[6]
531446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v2.h[6]
532446788007efe0a673d0366284026adfa17b36fedSimon Hosie    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
533446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #4*2
534446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[5]
535446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[5]
536446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v2.h[5]
537446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v2.h[5]
538446788007efe0a673d0366284026adfa17b36fedSimon Hosie    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
539446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #3*2
540446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[4]
541446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[4]
542446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v2.h[4]
543446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v2.h[4]
544446788007efe0a673d0366284026adfa17b36fedSimon Hosie    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
545446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #2*2
546446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[3]
547446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[3]
548446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v2.h[3]
549446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v2.h[3]
550446788007efe0a673d0366284026adfa17b36fedSimon Hosie    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
551446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #1*2
552446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[2]
553446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[2]
554446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v2.h[2]
555446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v2.h[2]
556446788007efe0a673d0366284026adfa17b36fedSimon Hosie    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
557446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v9.16b, v10.16b, #0*2
558446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[1]
559446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[1]
560446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v2.h[1]
561446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v2.h[1]
562446788007efe0a673d0366284026adfa17b36fedSimon Hosie    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
563446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #7*2
564446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[0]
565446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[0]
566446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v2.h[0]
567446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v2.h[0]
568446788007efe0a673d0366284026adfa17b36fedSimon Hosie    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
569446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #6*2
570446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[7]
571446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[7]
572446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[7]
573446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[7]
574446788007efe0a673d0366284026adfa17b36fedSimon Hosie    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
575446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #5*2
576446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[6]
577446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[6]
578446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[6]
579446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[6]
580446788007efe0a673d0366284026adfa17b36fedSimon Hosie    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
581446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #4*2
582446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[5]
583446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[5]
584446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[5]
585446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[5]
586446788007efe0a673d0366284026adfa17b36fedSimon Hosie    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
587446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #3*2
588446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[4]
589446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[4]
590446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[4]
591446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[4]
592446788007efe0a673d0366284026adfa17b36fedSimon Hosie    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
593446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #2*2
594446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[3]
595446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[3]
596446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[3]
597446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[3]
598446788007efe0a673d0366284026adfa17b36fedSimon Hosie    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
599446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #1*2
600446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[2]
601446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[2]
602446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[2]
603446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[2]
604446788007efe0a673d0366284026adfa17b36fedSimon Hosie    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
605446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v8.16b, v9.16b, #0*2
606446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[1]
607446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[1]
608446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[1]
609446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[1]
610446788007efe0a673d0366284026adfa17b36fedSimon Hosie    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
611446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v7.16b, v8.16b, #7*2
612446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v1.h[0]
613446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v1.h[0]
614446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v1.h[0]
615446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v1.h[0]
616446788007efe0a673d0366284026adfa17b36fedSimon Hosie    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
617446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v7.16b, v8.16b, #6*2
618446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[7]
619446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[7]
620446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[7]
621446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[7]
622446788007efe0a673d0366284026adfa17b36fedSimon Hosie    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
623446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v7.16b, v8.16b, #5*2
624446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[6]
625446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[6]
626446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[6]
627446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[6]
628446788007efe0a673d0366284026adfa17b36fedSimon Hosie    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
629446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v7.16b, v8.16b, #4*2
630446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[5]
631446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[5]
632446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[5]
633446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[5]
634446788007efe0a673d0366284026adfa17b36fedSimon Hosie    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
635446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v7.16b, v8.16b, #3*2
636446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[4]
637446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[4]
638446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[4]
639446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[4]
640446788007efe0a673d0366284026adfa17b36fedSimon Hosie    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
641446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v7.16b, v8.16b, #2*2
642446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[3]
643446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[3]
644446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[3]
645446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[3]
646446788007efe0a673d0366284026adfa17b36fedSimon Hosie    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
647446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v7.16b, v8.16b, #1*2
648446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[2]
649446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[2]
650446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[2]
651446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[2]
652446788007efe0a673d0366284026adfa17b36fedSimon Hosie    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
653446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ext         v13.16b, v7.16b, v8.16b, #0*2
654446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v0.h[1]
655446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v0.h[1]
656446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v13.4h, v0.h[1]
657446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v13.8h, v0.h[1]
658446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Narrow the 32-bit sums to 16 bits, then to 8 bits in v15,
             * rounding and saturating (unsigned) at each step.
             */
659446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v14.4h, v14.4s, #16
660446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn2    v14.8h, v15.4s, #16
661446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
662446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Shift the window: v31 takes v4, each of v4..v9 takes the next
             * register up, and v11 moves into v10.
             */
6634bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v31.16b, v4.16b
664446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v4.16b, v5.16b
665446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v5.16b, v6.16b
666446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v6.16b, v7.16b
667446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v7.16b, v8.16b
668446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v8.16b, v9.16b
669446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v9.16b, v10.16b
670446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v10.16b, v11.16b
671446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/
672446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* Comma-separated list 6, 12, 20.  These presumably name the variants for
 * which a tuned hconv4_* macro exists (hconv4_6 follows directly below) --
 * TODO confirm at the place where TUNED_LIST4 is expanded.
 */
6734bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie#define TUNED_LIST4 6, 12, 20
/* hconv4_6: one step of a symmetric horizontal filter with up to 6 taps on
 * each side of the centre, over the register window v4..v10 (centre v7).
 *
 * Reads:    x5          jump-table index selecting how many taps are applied
 *                       on each side (1..6; see the note on entry 0 below)
 *           v0.h[0..6]  coefficients: lane 0 weights the centre, lane k
 *                       weights the two samples k steps either side of it
 *           v4..v11     16-bit samples; one tap step is four 16-bit lanes,
 *                       i.e. half a register.
 *                       NOTE(review): presumably one 4-channel pixel per half
 *                       register -- confirm against the caller.
 * Writes:   v15.8b      eight 8-bit results
 *           v4..v10     each takes the value of the next register up
 *                       (v10 takes v11)
 * Clobbers: x12, x16, v14, v15
 */
674446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv4_6/*{{{*/
            /* Seed the two 32-bit accumulators with centre * coefficient 0:
             * v14 from the low half of v7, v15 from the high half.
             */
675446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umull       v14.4s, v7.4h, v0.h[0]
676446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umull2      v15.4s, v7.8h, v0.h[0]
677446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Computed branch: load the signed halfword at table[x5] and jump
             * to (address of label 100) + that offset.  Entering at label
             * 10k applies taps k down to 1 by fall-through.
             */
678ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            adr         x16, 100f
679ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            ldrsh       x12, [x16, x5, LSL #1]
680ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            add         x12, x12, x16
681446788007efe0a673d0366284026adfa17b36fedSimon Hosie            br          x12
            /* Entry 0 is -4, which resolves to the br instruction above.
             * NOTE(review): x5 == 0 is presumably never used with this macro
             * -- confirm.
             */
682ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie   100:     .hword -4
683ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 101f-100b
684ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 102f-100b
685ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 103f-100b
686ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 104f-100b
687ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 105f-100b
688ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 106f-100b
            /* Pad after the 14-byte table so the code below is aligned. */
689446788007efe0a673d0366284026adfa17b36fedSimon Hosie            .align      4
            /* Tap ladder.  Even taps add the same half of one register from
             * each side (umlal -> v14, umlal2 -> v15).  Odd taps are offset by
             * half a register, so v14 takes high halves (umlal2) and v15
             * takes the low halves of the following registers (umlal).
             */
690446788007efe0a673d0366284026adfa17b36fedSimon Hosie    106:    umlal       v14.4s, v4.4h,  v0.h[6]
691446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v4.8h,  v0.h[6]
692446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v10.4h, v0.h[6]
693446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v10.8h, v0.h[6]
694446788007efe0a673d0366284026adfa17b36fedSimon Hosie    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
695446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v5.4h, v0.h[5]
696446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v9.8h, v0.h[5]
697446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v10.4h, v0.h[5]
698446788007efe0a673d0366284026adfa17b36fedSimon Hosie    104:    umlal       v14.4s, v5.4h, v0.h[4]
699446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v5.8h, v0.h[4]
700446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v9.4h, v0.h[4]
701446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v9.8h, v0.h[4]
702446788007efe0a673d0366284026adfa17b36fedSimon Hosie    103:    umlal2      v14.4s, v5.8h, v0.h[3]
703446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v6.4h, v0.h[3]
704446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v8.8h, v0.h[3]
705446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v9.4h, v0.h[3]
706446788007efe0a673d0366284026adfa17b36fedSimon Hosie    102:    umlal       v14.4s, v6.4h, v0.h[2]
707446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v6.8h, v0.h[2]
708446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v8.4h, v0.h[2]
709446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v8.8h, v0.h[2]
710446788007efe0a673d0366284026adfa17b36fedSimon Hosie    101:    umlal2      v14.4s, v6.8h, v0.h[1]
711446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v7.4h, v0.h[1]
712446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v7.8h, v0.h[1]
713446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v8.4h, v0.h[1]
714446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Narrow with rounding and unsigned saturation: 32 -> 16 bits
             * shifting right by 16 (both accumulators packed into v14), then
             * 16 -> 8 bits shifting right by FRACTION_BITS into v15.8b.
             */
715446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v14.4h, v14.4s, #16
716446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn2    v14.8h, v15.4s, #16
717446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
718446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Slide the window along by one register; v11 becomes the new
             * v10.  NOTE(review): v11 is presumably refilled by the caller
             * before the next step -- confirm.
             */
719446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v4.16b, v5.16b
720446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v5.16b, v6.16b
721446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v6.16b, v7.16b
722446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v7.16b, v8.16b
723446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v8.16b, v9.16b
724446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v9.16b, v10.16b
725446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v10.16b, v11.16b
726446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/
727446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* hconv4_12: one step of a symmetric horizontal filter with up to 12 taps on
 * each side of the centre.  The register window is v26..v31 followed by
 * v4..v10, with v4 as the centre.
 *
 * Reads:    x5          jump-table index selecting how many taps are applied
 *                       on each side (1..12; see the note on entry 0 below)
 *           v0.h[0..7], v1.h[0..4]
 *                       coefficients 0..12: coefficient 0 weights the centre,
 *                       coefficient k weights the two samples k steps either
 *                       side of it
 *           v26..v31, v4..v11
 *                       16-bit samples; one tap step is four 16-bit lanes,
 *                       i.e. half a register.
 *                       NOTE(review): presumably one 4-channel pixel per half
 *                       register -- confirm against the caller.
 * Writes:   v15.8b      eight 8-bit results
 *           v26..v31, v4..v10
 *                       each takes the value of the next register in the
 *                       window (v31 takes v4, v10 takes v11)
 * Clobbers: x12, x16, v14, v15
 */
728446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv4_12/*{{{*/
            /* Seed the two 32-bit accumulators with centre * coefficient 0:
             * v14 from the low half of v4, v15 from the high half.
             */
729446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umull       v14.4s, v4.4h, v0.h[0]
730446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umull2      v15.4s, v4.8h, v0.h[0]
731446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Computed branch: load the signed halfword at table[x5] and jump
             * to (address of label 100) + that offset.  Entering at label
             * 1kk applies taps kk down to 1 by fall-through.
             */
732ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            adr         x16, 100f
733ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            ldrsh       x12, [x16, x5, LSL #1]
734ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            add         x12, x12, x16
735446788007efe0a673d0366284026adfa17b36fedSimon Hosie            br          x12
            /* Entry 0 is -4, which resolves to the br instruction above.
             * NOTE(review): x5 == 0 is presumably never used with this macro
             * -- confirm.
             */
736ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie   100:     .hword -4
737ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 101f-100b
738ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 102f-100b
739ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 103f-100b
740ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 104f-100b
741ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 105f-100b
742ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 106f-100b
743ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 107f-100b
744ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 108f-100b
745ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 109f-100b
746ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 110f-100b
747ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 111f-100b
748ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 112f-100b
            /* Pad after the 26-byte table so the code below is aligned. */
749446788007efe0a673d0366284026adfa17b36fedSimon Hosie            .align 4
            /* Tap ladder.  Even taps add the same half of one register from
             * each side (umlal -> v14, umlal2 -> v15).  Odd taps are offset by
             * half a register, so v14 takes high halves (umlal2) and v15
             * takes the low halves of the following registers (umlal).
             * Taps 12..8 use coefficients in v1, taps 7..1 those in v0.
             */
7504bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    112:    umlal       v14.4s, v26.4h, v1.h[4]
7514bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v26.8h, v1.h[4]
752446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v10.4h, v1.h[4]
753446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v10.8h, v1.h[4]
7544bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    111:    umlal2      v14.4s, v26.8h, v1.h[3]
7554bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v27.4h, v1.h[3]
756446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v9.8h, v1.h[3]
757446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v10.4h, v1.h[3]
7584bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    110:    umlal       v14.4s, v27.4h, v1.h[2]
7594bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v27.8h, v1.h[2]
760446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v9.4h, v1.h[2]
761446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v9.8h, v1.h[2]
7624bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    109:    umlal2      v14.4s, v27.8h, v1.h[1]
7634bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v28.4h, v1.h[1]
764446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v8.8h, v1.h[1]
765446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v9.4h, v1.h[1]
7664bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    108:    umlal       v14.4s, v28.4h, v1.h[0]
7674bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v28.8h, v1.h[0]
768446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v8.4h, v1.h[0]
769446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v8.8h, v1.h[0]
7704bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    107:    umlal2      v14.4s, v28.8h, v0.h[7]
7714bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v29.4h, v0.h[7]
772446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v7.8h, v0.h[7]
773446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v8.4h, v0.h[7]
7744bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    106:    umlal       v14.4s, v29.4h, v0.h[6]
7754bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v29.8h, v0.h[6]
776446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v7.4h, v0.h[6]
777446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v7.8h, v0.h[6]
7784bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    105:    umlal2      v14.4s, v29.8h, v0.h[5]
7794bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v30.4h, v0.h[5]
780446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v6.8h, v0.h[5]
781446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v7.4h, v0.h[5]
7824bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    104:    umlal       v14.4s, v30.4h, v0.h[4]
7834bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v30.8h, v0.h[4]
784446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v6.4h, v0.h[4]
785446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v6.8h, v0.h[4]
7864bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    103:    umlal2      v14.4s, v30.8h, v0.h[3]
7874bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v31.4h, v0.h[3]
788446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v5.8h, v0.h[3]
789446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v6.4h, v0.h[3]
7904bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    102:    umlal       v14.4s, v31.4h, v0.h[2]
7914bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v31.8h, v0.h[2]
792446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v5.4h, v0.h[2]
793446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v5.8h, v0.h[2]
7944bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    101:    umlal2      v14.4s, v31.8h, v0.h[1]
795446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v4.4h,  v0.h[1]
796446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v4.8h,  v0.h[1]
797446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v5.4h, v0.h[1]
798446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Narrow with rounding and unsigned saturation: 32 -> 16 bits
             * shifting right by 16 (both accumulators packed into v14), then
             * 16 -> 8 bits shifting right by FRACTION_BITS into v15.8b.
             */
799446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v14.4h, v14.4s, #16
800446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn2    v14.8h, v15.4s, #16
801446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
802446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Slide the window along by one register.  The window wraps from
             * v31 to v4, and v11 becomes the new v10.  NOTE(review): v11 is
             * presumably refilled by the caller before the next step --
             * confirm.
             */
8034bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v26.16b, v27.16b
8044bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v27.16b, v28.16b
8054bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v28.16b, v29.16b
8064bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v29.16b, v30.16b
8074bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v30.16b, v31.16b
8084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v31.16b, v4.16b
8094bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v4.16b, v5.16b
8104bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v5.16b, v6.16b
8114bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v6.16b, v7.16b
8124bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v7.16b, v8.16b
8134bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v8.16b, v9.16b
8144bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v9.16b, v10.16b
8154bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v10.16b, v11.16b
8164bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie.endm/*}}}*/
8174bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie
/* hconv4_20: one step of a symmetric horizontal filter with up to 20 taps on
 * each side of the centre.  The register window is v18..v31 followed by
 * v4..v10, with v28 as the centre.
 *
 * Reads:    x5          jump-table index selecting how many taps are applied
 *                       on each side (1..20; see the note on entry 0 below)
 *           v0.h[0..7], v1.h[0..7], v2.h[0..4]
 *                       coefficients 0..20: coefficient 0 weights the centre,
 *                       coefficient k weights the two samples k steps either
 *                       side of it
 *           v18..v31, v4..v11
 *                       16-bit samples; one tap step is four 16-bit lanes,
 *                       i.e. half a register.
 *                       NOTE(review): presumably one 4-channel pixel per half
 *                       register -- confirm against the caller.
 * Writes:   v15.8b      eight 8-bit results
 *           v18..v31, v4..v10
 *                       each takes the value of the next register in the
 *                       window (v31 takes v4, v10 takes v11)
 * Clobbers: x12, x16, v14, v15
 */
8184bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie.macro hconv4_20/*{{{*/
            /* Seed the two 32-bit accumulators with centre * coefficient 0:
             * v14 from the low half of v28, v15 from the high half.
             */
8194bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umull       v14.4s, v28.4h, v0.h[0]
8204bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umull2      v15.4s, v28.8h, v0.h[0]
8214bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie
            /* Computed branch: load the signed halfword at table[x5] and jump
             * to (address of label 100) + that offset.  Entering at label
             * 1kk applies taps kk down to 1 by fall-through.
             */
8224bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            adr         x16, 100f
8234bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            ldrsh       x12, [x16, x5, LSL #1]
8244bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            add         x12, x12, x16
8254bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            br          x12
            /* Entry 0 is -4, which resolves to the br instruction above.
             * NOTE(review): x5 == 0 is presumably never used with this macro
             * -- confirm.
             */
8264bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie   100:     .hword -4
8274bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 101f-100b
8284bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 102f-100b
8294bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 103f-100b
8304bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 104f-100b
8314bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 105f-100b
8324bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 106f-100b
8334bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 107f-100b
8344bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 108f-100b
8354bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 109f-100b
8364bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 110f-100b
8374bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 111f-100b
8384bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 112f-100b
8394bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 113f-100b
8404bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 114f-100b
8414bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 115f-100b
8424bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 116f-100b
8434bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 117f-100b
8444bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 118f-100b
8454bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 119f-100b
8464bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .hword 120f-100b
            /* Pad after the 42-byte table so the code below is aligned. */
8474bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            .align 4
8484bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie
            /* Tap ladder.  Even taps add the same half of one register from
             * each side (umlal -> v14, umlal2 -> v15).  Odd taps are offset by
             * half a register, so v14 takes high halves (umlal2) and v15
             * takes the low halves of the following registers (umlal).
             * Taps 20..16 use coefficients in v2, taps 15..8 those in v1 and
             * taps 7..1 those in v0.
             */
8494bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    120:    umlal       v14.4s, v18.4h, v2.h[4]
8504bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v18.8h, v2.h[4]
8514bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v10.4h, v2.h[4]
8524bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v10.8h, v2.h[4]
8534bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    119:    umlal2      v14.4s, v18.8h, v2.h[3]
8544bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v19.4h, v2.h[3]
8554bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v9.8h,  v2.h[3]
8564bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v10.4h, v2.h[3]
8574bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    118:    umlal       v14.4s, v19.4h, v2.h[2]
8584bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v19.8h, v2.h[2]
8594bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v9.4h,  v2.h[2]
8604bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v9.8h,  v2.h[2]
8614bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    117:    umlal2      v14.4s, v19.8h, v2.h[1]
8624bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v20.4h, v2.h[1]
8634bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v8.8h,  v2.h[1]
8644bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v9.4h,  v2.h[1]
8654bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    116:    umlal       v14.4s, v20.4h, v2.h[0]
8664bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v20.8h, v2.h[0]
8674bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v8.4h,  v2.h[0]
8684bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v8.8h,  v2.h[0]
8694bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    115:    umlal2      v14.4s, v20.8h, v1.h[7]
8704bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v21.4h, v1.h[7]
8714bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v7.8h,  v1.h[7]
8724bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v8.4h,  v1.h[7]
8734bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    114:    umlal       v14.4s, v21.4h, v1.h[6]
8744bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v21.8h, v1.h[6]
8754bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v7.4h,  v1.h[6]
8764bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v7.8h,  v1.h[6]
8774bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    113:    umlal2      v14.4s, v21.8h, v1.h[5]
8784bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v22.4h, v1.h[5]
8794bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v6.8h,  v1.h[5]
8804bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v7.4h,  v1.h[5]
8814bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    112:    umlal       v14.4s, v22.4h, v1.h[4]
8824bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v22.8h, v1.h[4]
8834bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v6.4h,  v1.h[4]
8844bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v6.8h,  v1.h[4]
8854bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    111:    umlal2      v14.4s, v22.8h, v1.h[3]
8864bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v23.4h, v1.h[3]
8874bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v5.8h,  v1.h[3]
8884bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v6.4h,  v1.h[3]
8894bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    110:    umlal       v14.4s, v23.4h, v1.h[2]
8904bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v23.8h, v1.h[2]
8914bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v5.4h,  v1.h[2]
8924bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v5.8h,  v1.h[2]
8934bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    109:    umlal2      v14.4s, v23.8h, v1.h[1]
8944bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v24.4h, v1.h[1]
8954bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v4.8h,  v1.h[1]
8964bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v5.4h,  v1.h[1]
8974bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    108:    umlal       v14.4s, v24.4h, v1.h[0]
8984bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v24.8h, v1.h[0]
8994bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v4.4h,  v1.h[0]
9004bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v4.8h,  v1.h[0]
9014bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    107:    umlal2      v14.4s, v24.8h, v0.h[7]
9024bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v25.4h, v0.h[7]
9034bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v31.8h, v0.h[7]
9044bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v4.4h,  v0.h[7]
9054bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    106:    umlal       v14.4s, v25.4h, v0.h[6]
9064bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v25.8h, v0.h[6]
9074bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v31.4h, v0.h[6]
9084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v31.8h, v0.h[6]
9094bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    105:    umlal2      v14.4s, v25.8h, v0.h[5]
9104bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v26.4h, v0.h[5]
9114bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v30.8h, v0.h[5]
9124bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v31.4h, v0.h[5]
9134bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    104:    umlal       v14.4s, v26.4h, v0.h[4]
9144bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v26.8h, v0.h[4]
9154bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v30.4h, v0.h[4]
9164bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v30.8h, v0.h[4]
9174bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    103:    umlal2      v14.4s, v26.8h, v0.h[3]
9184bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v27.4h, v0.h[3]
9194bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v29.8h, v0.h[3]
9204bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v30.4h, v0.h[3]
9214bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    102:    umlal       v14.4s, v27.4h, v0.h[2]
9224bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v27.8h, v0.h[2]
9234bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v29.4h, v0.h[2]
9244bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v29.8h, v0.h[2]
9254bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    101:    umlal2      v14.4s, v27.8h, v0.h[1]
9264bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v28.4h, v0.h[1]
9274bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v28.8h, v0.h[1]
9284bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v29.4h, v0.h[1]
9294bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie
            /* Narrow with rounding and unsigned saturation: 32 -> 16 bits
             * shifting right by 16 (both accumulators packed into v14), then
             * 16 -> 8 bits shifting right by FRACTION_BITS into v15.8b.
             */
9304bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            uqrshrn     v14.4h, v14.4s, #16
9314bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            uqrshrn2    v14.8h, v15.4s, #16
9324bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
9334bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie
            /* Slide the window along by one register.  The window wraps from
             * v31 to v4, and v11 becomes the new v10.  NOTE(review): v11 is
             * presumably refilled by the caller before the next step --
             * confirm.
             */
9344bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v18.16b, v19.16b
9354bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v19.16b, v20.16b
9364bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v20.16b, v21.16b
9374bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v21.16b, v22.16b
9384bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v22.16b, v23.16b
9394bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v23.16b, v24.16b
9404bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v24.16b, v25.16b
9414bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v25.16b, v26.16b
9424bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v26.16b, v27.16b
9434bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v27.16b, v28.16b
9444bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v28.16b, v29.16b
9454bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v29.16b, v30.16b
9464bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v30.16b, v31.16b
9474bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v31.16b, v4.16b
948446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v4.16b, v5.16b
949446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v5.16b, v6.16b
950446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v6.16b, v7.16b
951446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v7.16b, v8.16b
952446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v8.16b, v9.16b
953446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v9.16b, v10.16b
954446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v10.16b, v11.16b
955446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/
956446788007efe0a673d0366284026adfa17b36fedSimon Hosie
957446788007efe0a673d0366284026adfa17b36fedSimon Hosie.macro hconv4_25/*{{{*/
9584bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umull2      v14.4s, v25.8h, v0.h[0]
9594bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umull       v15.4s, v26.4h, v0.h[0]
960446788007efe0a673d0366284026adfa17b36fedSimon Hosie
961ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            adr         x16, 100f
962ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            ldrsh       x12, [x16, x5, LSL #1]
963ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            add         x12, x12, x16
964446788007efe0a673d0366284026adfa17b36fedSimon Hosie            br          x12
965ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie   100:     .hword -4
966ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 101f-100b
967ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 102f-100b
968ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 103f-100b
969ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 104f-100b
970ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 105f-100b
971ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 106f-100b
972ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 107f-100b
973ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 108f-100b
974ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 109f-100b
975ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 110f-100b
976ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 111f-100b
977ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 112f-100b
978ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 113f-100b
979ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 114f-100b
980ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 115f-100b
981ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 116f-100b
982ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 117f-100b
983ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 118f-100b
984ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 119f-100b
985ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 120f-100b
986ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 121f-100b
987ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 122f-100b
988ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 123f-100b
989ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 124f-100b
990ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            .hword 125f-100b
991446788007efe0a673d0366284026adfa17b36fedSimon Hosie            .align 4
9924bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie
9934bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    125:    ld1         {v12.8h}, [x9]
994446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v3.h[1]
995446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v3.h[1]
996446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v10.4h, v3.h[1]
997446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v10.8h, v3.h[1]
9984bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    124:    add         x12, x9, #0x08
9994bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1000446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v12.4h}, [x12], #8
10014bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1002446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v13.4h}, [x12]
1003446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v3.h[0]
1004446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v13.4h, v3.h[0]
10054bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v9.8h,  v3.h[0]
1006446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v10.4h, v3.h[0]
10074bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    123:    add         x12, x9, #0x10
10084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1009446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v12.8h}, [x12]
1010446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[7]
1011446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[7]
10124bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v9.4h,  v2.h[7]
10134bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v9.8h,  v2.h[7]
10144bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    122:    add         x12, x9, #0x18
10154bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1016446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v12.4h}, [x12], #8
10174bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1018446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v13.4h}, [x12]
1019446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[6]
1020446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v13.4h, v2.h[6]
10214bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v8.8h,  v2.h[6]
10224bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v9.4h,  v2.h[6]
10234bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    121:    add         x12, x9, #0x20
10244bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1025446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v12.8h}, [x12]
1026446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[5]
1027446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[5]
10284bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v8.4h,  v2.h[5]
10294bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v8.8h,  v2.h[5]
10304bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    120:    add         x12, x9, #0x28
10314bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1032446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v12.4h}, [x12], #8
10334bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1034446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v13.4h}, [x12]
1035446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[4]
1036446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v13.4h, v2.h[4]
10374bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v7.8h,  v2.h[4]
10384bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v8.4h,  v2.h[4]
10394bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    119:    add         x12, x9, #0x30
10404bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
1041446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ld1         {v12.8h}, [x12]
1042446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[3]
1043446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v12.8h, v2.h[3]
10444bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v7.4h,  v2.h[3]
10454bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v7.8h,  v2.h[3]
10464bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    118:    add         x12, x9, #0x38
10474bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x12, x12, #0x40
10484bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            ld1         {v12.4h}, [x12]
1049446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v12.4h, v2.h[2]
10504bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v17.4h, v2.h[2]
10514bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v6.8h,  v2.h[2]
10524bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v7.4h,  v2.h[2]
10534bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    117:    umlal       v14.4s, v17.4h, v2.h[1]
10544bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v17.8h, v2.h[1]
10554bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v6.4h,  v2.h[1]
10564bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v6.8h,  v2.h[1]
10574bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    116:    umlal2      v14.4s, v17.8h, v2.h[0]
10584bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v18.4h, v2.h[0]
10594bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v5.8h,  v2.h[0]
10604bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v6.4h,  v2.h[0]
10614bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    115:    umlal       v14.4s, v18.4h, v1.h[7]
10624bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v18.8h, v1.h[7]
10634bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v5.4h,  v1.h[7]
10644bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v5.8h,  v1.h[7]
10654bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    114:    umlal2      v14.4s, v18.8h, v1.h[6]
10664bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v19.4h, v1.h[6]
1067446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v14.4s, v4.8h,  v1.h[6]
10684bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v5.4h,  v1.h[6]
10694bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    113:    umlal       v14.4s, v19.4h, v1.h[5]
10704bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v19.8h, v1.h[5]
1071446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v14.4s, v4.4h,  v1.h[5]
1072446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal2      v15.4s, v4.8h,  v1.h[5]
10734bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    112:    umlal2      v14.4s, v19.8h, v1.h[4]
10744bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v20.4h, v1.h[4]
10754bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v31.8h, v1.h[4]
1076446788007efe0a673d0366284026adfa17b36fedSimon Hosie            umlal       v15.4s, v4.4h,  v1.h[4]
10774bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    111:    umlal       v14.4s, v20.4h, v1.h[3]
10784bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v20.8h, v1.h[3]
10794bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v31.4h, v1.h[3]
10804bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v31.8h, v1.h[3]
10814bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    110:    umlal2      v14.4s, v20.8h, v1.h[2]
10824bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v21.4h, v1.h[2]
10834bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v30.8h, v1.h[2]
10844bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v31.4h, v1.h[2]
10854bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    109:    umlal       v14.4s, v21.4h, v1.h[1]
10864bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v21.8h, v1.h[1]
10874bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v30.4h, v1.h[1]
10884bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v30.8h, v1.h[1]
10894bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    108:    umlal2      v14.4s, v21.8h, v1.h[0]
10904bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v22.4h, v1.h[0]
10914bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v29.8h, v1.h[0]
10924bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v30.4h, v1.h[0]
10934bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    107:    umlal       v14.4s, v22.4h, v0.h[7]
10944bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v22.8h, v0.h[7]
10954bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v29.4h, v0.h[7]
10964bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v29.8h, v0.h[7]
10974bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    106:    umlal2      v14.4s, v22.8h, v0.h[6]
10984bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v23.4h, v0.h[6]
10994bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v28.8h, v0.h[6]
11004bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v29.4h, v0.h[6]
11014bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    105:    umlal       v14.4s, v23.4h, v0.h[5]
11024bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v23.8h, v0.h[5]
11034bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v28.4h, v0.h[5]
11044bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v28.8h, v0.h[5]
11054bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    104:    umlal2      v14.4s, v23.8h, v0.h[4]
11064bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v24.4h, v0.h[4]
11074bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v27.8h, v0.h[4]
11084bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v28.4h, v0.h[4]
11094bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    103:    umlal       v14.4s, v24.4h, v0.h[3]
11104bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v24.8h, v0.h[3]
11114bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v27.4h, v0.h[3]
11124bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v27.8h, v0.h[3]
11134bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    102:    umlal2      v14.4s, v24.8h, v0.h[2]
11144bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v25.4h, v0.h[2]
11154bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v14.4s, v26.8h, v0.h[2]
11164bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v15.4s, v27.4h, v0.h[2]
11174bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie    101:    umlal       v14.4s, v25.4h, v0.h[1]
11184bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v25.8h, v0.h[1]
11194bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal       v14.4s, v26.4h, v0.h[1]
11204bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            umlal2      v15.4s, v26.8h, v0.h[1]
1121446788007efe0a673d0366284026adfa17b36fedSimon Hosie
1122446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v14.4h, v14.4s, #16
1123446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn2    v14.8h, v15.4s, #16
1124446788007efe0a673d0366284026adfa17b36fedSimon Hosie            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
1125446788007efe0a673d0366284026adfa17b36fedSimon Hosie
11264bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            st1         {v17.16b}, [x9], #16
11274bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            bic         x9, x9, #0x40
11284bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v17.16b, v18.16b
11294bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v18.16b, v19.16b
11304bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v19.16b, v20.16b
11314bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v20.16b, v21.16b
11324bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v21.16b, v22.16b
11334bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v22.16b, v23.16b
11344bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v23.16b, v24.16b
11354bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v24.16b, v25.16b
11364bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v25.16b, v26.16b
11374bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v26.16b, v27.16b
11384bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v27.16b, v28.16b
11394bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v28.16b, v29.16b
11404bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v29.16b, v30.16b
11414bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v30.16b, v31.16b
11424bea0d3b51fcdd9976af72c553a4a1d492016ca2Simon Hosie            mov         v31.16b, v4.16b
1143446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v4.16b, v5.16b
1144446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v5.16b, v6.16b
1145446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v6.16b, v7.16b
1146446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v7.16b, v8.16b
1147446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v8.16b, v9.16b
1148446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v9.16b, v10.16b
1149446788007efe0a673d0366284026adfa17b36fedSimon Hosie            mov         v10.16b, v11.16b
1150446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm/*}}}*/
1151446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* fetch_generic_asm
 *
 * Out-of-line wrapper around the `fetch` macro, for the cases where
 * performance is not that important; calling this instead of expanding the
 * macro inline keeps code size down.
 *
 * x10 and x11 are pushed before the macro body and popped after it, so both
 * hold their entry values again on return.  All other register and memory
 * effects are those of the `fetch` macro itself.
 */
PRIVATE(fetch_generic_asm)
            stp         x10, x11, [sp, #-16]!   // save x10/x11 in a 16-byte frame
            fetch
            ldp         x10, x11, [sp], #16     // restore x10/x11, release the frame
            ret
END(fetch_generic_asm)
1161446788007efe0a673d0366284026adfa17b36fedSimon Hosie
11625a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
/* fetch_clampleft1
 *
 * Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
 * beyond that limit, and fill the leading lanes of the vector with the first
 * legal pixel.  This variant replicates a single 16-bit lane as the pixel.
 *
 * With n = x10 & 15:
 *   In:   x10          low four bits give n, the number of lanes to pad
 *   Out:  v10, v11     fetched data moved up by n lanes; lanes 0..n-1 hold
 *                      the first fetched lane (the top n fetched lanes are
 *                      dropped)
 *         v8, v9       every lane holds the first fetched lane
 *         x10          reduced by n
 *         x1, x15, x19 reduced by n (presumably the source pointers that
 *                      `fetch` advances, rewound so the dropped lanes are
 *                      fetched again -- confirm against the fetch macro)
 *   Clobbers: x12, flags, and whatever fetch_generic_asm clobbers.
 *
 * When n is zero the fetched data is returned as is and only v8/v9 are set.
 *
 * Note: This function can read beyond the right edge of input if the image is
 * narrower than 16 bytes.
 */
PRIVATE(fetch_clampleft1)
            stp         x29, x30, [sp, #-16]!   // keep the return address across bl
            bl          fetch_generic_asm
            dup         v8.8h, v10.h[0]         // v8 = v9 = first fetched lane
            dup         v9.8h, v10.h[0]
            ands        x12, x10, #15           // x12 = n
            beq         1f                      // n == 0: no padding required
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            sub         x10, x10, x12
            sub         x12, sp, x12, LSL #1    // sp (not yet lowered) - 2n bytes
            sub         sp, sp, #64             // 64-byte scratch: v8, v9, v10, v11
            sub         x12, x12, #32           // x12 = scratch + 32 - 2n
            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]  // reload from n lanes before old v10
            add         sp, sp, #64
1:          ldp         x29, x30, [sp], #16
            ret
END(fetch_clampleft1)
11905a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
/* fetch_clampleft4
 *
 * Same operation as fetch_clampleft1, except that the value used for padding
 * is the first 64-bit group of v10 (four 16-bit lanes, presumably one
 * four-channel pixel) rather than a single 16-bit lane.
 *
 * With n = x10 & 15, counted in 16-bit lanes:
 *   Out:  v10, v11     fetched data moved up by n lanes; the vacated leading
 *                      lanes are taken from v8/v9
 *         v8, v9       both 64-bit halves hold the first fetched 64-bit group
 *         x10          reduced by n
 *         x1, x15, x19 reduced by n (presumably the source pointers used by
 *                      `fetch` -- confirm against the fetch macro)
 *   Clobbers: x12, flags, and whatever fetch_generic_asm clobbers.
 *
 * When n is zero the fetched data is returned as is and only v8/v9 are set.
 */
PRIVATE(fetch_clampleft4)
            stp         x29, x30, [sp, #-16]!   // keep the return address across bl
            bl          fetch_generic_asm
            dup         v8.2d, v10.d[0]         // v8 = v9 = first fetched 64-bit group
            dup         v9.2d, v10.d[0]
            ands        x12, x10, #15           // x12 = n
            beq         1f                      // n == 0: no padding required
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            sub         x10, x10, x12
            sub         x12, sp, x12, LSL #1    // sp (not yet lowered) - 2n bytes
            sub         sp, sp, #64             // 64-byte scratch: v8, v9, v10, v11
            sub         x12, x12, #32           // x12 = scratch + 32 - 2n
            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]  // reload from n lanes before old v10
            add         sp, sp, #64
1:          ldp         x29, x30, [sp], #16
            ret
END(fetch_clampleft4)
12115a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
/* fetch_clampright1
 *
 * Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding
 * reading memory beyond that limit, and fill the rest of the vector with the
 * last legal pixel.  This variant replicates a single 16-bit lane as the
 * pixel.
 *
 * With n = (0 - x11) & 15, the number of lanes that must not be read:
 *   - x1, x15 and x19 are moved back by n before fetching (presumably the
 *     source pointers used by `fetch` -- confirm against the fetch macro), so
 *     the fetch covers n lanes that precede the requested ones;
 *   - after the fetch the data is moved down by n lanes and the vacated top
 *     n lanes are filled from v12/v13.
 *
 *   Out:  v10, v11     the requested lanes followed by copies of the last
 *                      fetched lane
 *         v12, v13     every lane holds the last fetched lane (v11.h[7])
 *   Clobbers: x12, flags, and whatever fetch_generic_asm clobbers.
 *
 * x11 is read again after the call; fetch_generic_asm preserves it.
 *
 * Note: This function can read beyond the left edge of input if the image is
 * narrower than 16 bytes.
 */
PRIVATE(fetch_clampright1)
            stp         x29, x30, [sp, #-16]!   // keep the return address across bl
            sub         x12, xzr, x11
            ands        x12, x12, #15           // x12 = n
            beq         1f                      // n == 0: a full 16 columns is legal
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            dup         v12.8h, v11.h[7]        // v12 = v13 = last fetched lane
            dup         v13.8h, v11.h[7]
            sub         x12, xzr, x11           // recompute n after the call
            and         x12, x12, #15
            sub         sp, sp, #64             // 64-byte scratch: v10, v11, v12, v13
            add         x12, sp, x12, LSL #1    // x12 = scratch + 2n
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]  // reload from n lanes into old v10
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret
1:          bl          fetch_generic_asm
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright1)
12455a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
/* fetch_clampright4
 *
 * Same operation as fetch_clampright1, except that the value used for padding
 * is the last 64-bit group of v11 (four 16-bit lanes, presumably one
 * four-channel pixel) rather than a single 16-bit lane.
 *
 * With n = (0 - x11) & 15, counted in 16-bit lanes:
 *   - x1, x15 and x19 are moved back by n before fetching (presumably the
 *     source pointers used by `fetch` -- confirm against the fetch macro);
 *   - after the fetch the data is moved down by n lanes and the vacated top
 *     lanes are filled from v12/v13.
 *
 *   Out:  v10, v11     the requested lanes followed by copies of the last
 *                      fetched 64-bit group
 *         v12, v13     both 64-bit halves hold the last fetched 64-bit group
 *                      (v11.d[1])
 *   Clobbers: x12, flags, and whatever fetch_generic_asm clobbers.
 *
 * x11 is read again after the call; fetch_generic_asm preserves it.
 */
PRIVATE(fetch_clampright4)
            stp         x29, x30, [sp, #-16]!   // keep the return address across bl
            sub         x12, xzr, x11
            ands        x12, x12, #15           // x12 = n
            beq         1f                      // n == 0: a full 16 columns is legal
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]        // v12 = v13 = last fetched 64-bit group
            dup         v13.2d, v11.d[1]
            sub         x12, xzr, x11           // recompute n after the call
            and         x12, x12, #15
            sub         sp, sp, #64             // 64-byte scratch: v10, v11, v12, v13
            add         x12, sp, x12, LSL #1    // x12 = scratch + 2n
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]  // reload from n lanes into old v10
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret
1:          bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright4)
12725a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
/* prefill_sweepright1
 *
 * Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
 * value across to fill the rest of the register pair.  Used for filling the
 * right hand edge of the window when reading too close to the right hand edge
 * of the image.
 *
 * With k = x11 & 15 and k != 0: lane k-1 of the v10:v11 pair (16-bit lanes,
 * counted from zero) is copied into every lane above it.  The pair is written
 * to a stack scratch area, the chosen lane is replicated into v12 and v13,
 * those are stored back starting at that lane, and v10/v11 are reloaded.
 *
 * With k == 0 (fast-out path): v10 and v11 are left untouched.
 *
 * Also returns a dup-ed copy of the last element in v12 and v13 for the
 * tail-fill case (this happens incidentally in the common path, but must be
 * done deliberately in the fast-out path).
 *
 *   In:   v10, v11, x11
 *   Out:  v10, v11 swept; v12, v13 = replicated fill value
 *   Clobbers: x12, flags.
 */
PRIVATE(prefill_sweepright1)
            ands        x12, x11, #15           // x12 = k
            beq         1f                      // k == 0: nothing to sweep
            sub         x12, x12, #1            // zero-based index of the fill lane
            sub         sp, sp, #64             // scratch; 32 bytes used initially
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1    // address of the fill lane
            ld1r        {v12.8h}, [x12]         // replicate it into v12 and v13
            ld1r        {v13.8h}, [x12]
            st1         {v12.8h,v13.8h}, [x12]  // overwrite from the fill lane upward
            ld1         {v10.8h,v11.8h}, [sp]
            add         sp, sp, #64
            ret
1:          dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ret
END(prefill_sweepright1)
1298446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* prefill_sweepright4
 *
 * Same operation as prefill_sweepright1, except that the value swept across
 * is a 64-bit group of four 16-bit lanes (presumably one four-channel pixel)
 * rather than a single lane.
 *
 * With k = x11 & 15 and k != 0: the 64-bit group occupying lanes k-4..k-1 of
 * the v10:v11 pair is copied over everything above it.  The pair is written
 * to a stack scratch area, the chosen group is replicated into v12 and v13,
 * both are stored back starting at that group, and v10/v11 are reloaded.
 * k is expected to be at least 4; a smaller non-zero k would address memory
 * below the scratch area.
 *
 * With k == 0 (fast-out path): v10 and v11 are left untouched.
 *
 * Both paths return the replicated fill value in v12 and v13.
 *
 * The fill value must be replicated into v13 as well as v12, and both
 * registers must be stored: a single 16-byte store cannot reach the end of
 * v11 when the group sits in the lower half of v10, and storing v13 without
 * loading it first would sweep undefined data into the result.
 *
 *   In:   v10, v11, x11
 *   Out:  v10, v11 swept; v12, v13 = replicated fill value
 *   Clobbers: x12, flags.
 */
PRIVATE(prefill_sweepright4)
            ands        x12, x11, #15           // x12 = k
            beq         1f                      // k == 0: nothing to sweep
            sub         x12, x12, #4            // first lane of the fill group
            sub         sp, sp, #64             // scratch; 32 bytes used initially
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1    // address of the fill group
            ld1r        {v12.2d}, [x12]         // replicate it into v12 and v13
            ld1r        {v13.2d}, [x12]
            st1         {v12.8h,v13.8h}, [x12]  // overwrite from the fill group upward
            ld1         {v10.8h,v11.8h}, [sp]
            add         sp, sp, #64
            ret
1:          dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            ret
END(prefill_sweepright4)
13155a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
/* The main loop keeps a sliding window of data that has already been convolved
 * in the vertical axis for the current line.  This usually stays in the
 * register file, but spills to memory for large windows.  The first thing that
 * needs to be done at start-up is to fill this window with image data, taking
 * into account the padding needed if the left or right edges of the image fall
 * within this window.
 */
1323446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* prefill_out ra, rb, sra, srb
 *
 * Emit the code that writes one pair of source registers into the window.
 *
 * Because the window is in the register file, writes to it cannot be indexed
 * by another register.  Consequently the fill loops are unrolled to address
 * the registers directly.  This macro distinguishes between writes to the
 * register file and writes to the spill buffer (indicated by a destination
 * register named xx).
 *
 *   ra, rb     destination registers, or xx for the spill buffer at x9
 *   sra, srb   source registers
 *
 * Expansion by case:
 *   ra = xx,  rb = xx    store sra and srb at [x9]; x9 += 32
 *   ra = xx,  rb = reg   clear bit 6 of x9, store sra at [x9]; x9 += 16;
 *                        then copy srb into rb
 *   ra = reg             copy sra into ra and srb into rb, leaving out any
 *                        copy whose two operands are spelled identically
 *
 * The comparisons are textual (.ifc/.ifnc), so operands must be spelled the
 * same way to be recognised as equal.
 */
.macro prefill_out ra, rb, sra, srb
  .ifc \ra,xx
    .ifc \rb,xx
            st1         {\sra,\srb}, [x9], #32
    .else
            bic         x9, x9, #0x40
            st1         {\sra}, [x9], #16
            mov         \rb, \srb
    .endif
  .else
    .ifnc \ra,\sra
            mov         \ra, \sra
    .endif
    .ifnc \rb,\srb
            mov         \rb, \srb
    .endif
  .endif
.endm
1348446788007efe0a673d0366284026adfa17b36fedSimon Hosie
13495a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* This macro provides the list of registers representing the window, and the
13505a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * cases where the register file is too small and a spill buffer is used
13515a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * instead.
13525a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Since several specialisations of each function are generated, this also
13535a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * culls superfluous iterations, and sets the variable `i` for subsequent
13545a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * macros indicating the current index into the window.
13555a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */
13565a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro prefill_list, macro, nextmacro, max_r, step, label
  /* Expands the stage prefill_<macro> once for each 16-index line of the
   * window, from line 13 down to line 1, and then emits the line-0 label.
   * nextmacro names the stage that the generated branch targets belong to.
   * max_r is accepted but is not referenced anywhere in this body.
   */
  /* ifneeded: emit one expansion only if the window holds at least `line`
   * lines of 16.  `i` is set to the window index at which this line starts
   * (line 1 is the last line of the window).  The stage receives two branch
   * targets: nextmacro at this same line, and nextmacro at the following
   * line (nextline).
   */
13575a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
13585a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie    .if windowsize >= (\line * 16)
13595a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie      .set i, windowsize - (\line * 16)
13605a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie\label\macro\line:
13615a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
13625a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie    .endif
13635a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie  .endm
  /* Columns: line, nextline, ra, rb.  An `xx` operand stands for "no
   * register"; prefill_out stores that part of the line through x9 instead
   * of moving it into a vector register.
   */
13645a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label
13655a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label
13665a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label
13675a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
13685a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
13695a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
13705a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
13715a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
13725a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
13735a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
13745a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
13755a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
13765a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label
  /* Line 0 lies past the end of the window: leave the prefill code. */
13775a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie\label\macro\()0:
13785a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            b           \label\()_end
  /* Drop the helper so the next prefill_list invocation can define it again. */
13795a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie  .purgem ifneeded
13805a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.endm
13815a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
13825a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* These macros represent the possible stages of filling the window.
13835a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Each macro is unrolled enough times that it can fill the entire window
13845a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * itself, but normally it will have to hand control to subsequent macros
13855a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * part-way through and this is done using labels named \next and \after, where
13865a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * \next is the next macro starting at the same window position and \after is
13875a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the next macro starting after the current window position.
13885a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */
13895a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
13905a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* leftfill: v8 and v9 contain the left padding value.  While the window
13915a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * extends outside of the image on the left-hand side, and at least 16 more
13925a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * padding values are needed in the window, store v8 and v9 into the window.
13935a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Otherwise skip forward to storing image data.
13945a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */
13955a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro prefill_leftfill, next, after, ra, rb, step
            /* x10 is the window index where image data begins and i is the
             * index where this line starts.  If x10 < i+16 (unsigned) the
             * line is not entirely left padding, so branch to the next-stage
             * label; otherwise write the padding pair and fall through to
             * the expansion for the following line.  The after and step
             * arguments are not used by this stage.
             */
13965a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            cmp         x10, #i+16
13975a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            blo         \next
13985a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_out \ra, \rb, v8.16b, v9.16b
13995a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.endm
14005a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
14015a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* leftedge: The very first non-fill or partial-fill chunk from the image is
14025a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * already loaded (as it was used to calculate the left padding value), so
14035a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * store it here, and then drop into the regular load/store cycle in the next
14045a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * macro.
14055a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */
14065a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro prefill_leftedge, next, after, ra, rb, step
            /* Unconditional: place the chunk already held in v10/v11 into
             * this line, then continue with the next stage at the following
             * line.  The next and step arguments are not used by this stage.
             */
14075a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie1:          prefill_out \ra, \rb, v10.16b, v11.16b
14085a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            b           \after
14095a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.endm
14105a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
14115a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* dofetch: Copy chunks of the image into the window without any complications
14125a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * from edge conditions.
14135a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */
14145a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro prefill_dofetch, next, after, ra, rb, step
            /* x11 is the window index where image data ends.  If
             * x11 <= i+16 (unsigned) the data does not extend beyond this
             * line, so branch to the next-stage label.  Otherwise call
             * fetch_generic_asm, place v10/v11 into this line and fall
             * through to the expansion for the following line.  The after
             * and step arguments are not used by this stage.
             */
14155a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            cmp         x11, #i+16
14165a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bls         \next
1417446788007efe0a673d0366284026adfa17b36fedSimon Hosie            bl          fetch_generic_asm
14185a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_out \ra, \rb, v10.16b, v11.16b
14195a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.endm
14205a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
14215a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
14225a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the right-hand edge of the image.  In that case sweep the last valid pixel
14235a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * across the rest of the chunk, and in either case prepare padding data in v12
14245a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * and v13 for the next macro.  This is done in fetch_clampright.
14255a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * This only happens once before going on to the next macro.
14265a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Sometimes leftedge also covers the rightedge case, in which case this has
14275a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * to be skipped altogether.
14285a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */
14295a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro prefill_rightedge, next, after, ra, rb, step
            /* If x11 <= i (unsigned) no image data falls within this line,
             * so branch to the next-stage label.  Otherwise call the
             * fetch_clampright variant selected by step, place v10/v11 into
             * this line, and continue with the next stage at the following
             * line.
             */
14305a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            cmp         x11, #i
14315a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bls         \next
14325a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bl          fetch_clampright\step
14335a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_out \ra, \rb, v10.16b, v11.16b
14345a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            b           \after
14355a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.endm
14365a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
14375a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* rightfill: The rest of the window is simply filled with right padding from
14385a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * v12 and v13.
14395a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */
14405a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro prefill_rightfill, next, after, ra, rb, step
            /* No test and no branch: write the padding pair and fall through
             * to the expansion for the following line.  The next, after and
             * step arguments are not used by this stage.
             */
14415a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_out \ra, \rb, v12.16b, v13.16b
1442446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm
1443446788007efe0a673d0366284026adfa17b36fedSimon Hosie
14445a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* Here all of the macros above are unrolled and laid out in the proper order.
14455a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie */
14465a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro prefill_body, max_r, step, label
            /* Each row is (stage, stage its branch targets refer to).  The
             * stages are emitted back to back, so execution begins in the
             * first leftfill expansion.  `oops` names no stage here:
             * prefill_rightfill never uses its branch-target arguments, so
             * labels built from that name are never referenced.
             */
14475a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_list leftfill,  leftedge,   \max_r, \step, \label
14485a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_list leftedge,  dofetch,    \max_r, \step, \label
14495a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_list dofetch,   rightedge,  \max_r, \step, \label
14505a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_list rightedge, rightfill,  \max_r, \step, \label
14515a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            prefill_list rightfill, oops,       \max_r, \step, \label
            /* Common exit: every list ends with a branch to this label. */
14525a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie\label\()_end:
14535a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.endm
14545a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
14555a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
1456446788007efe0a673d0366284026adfa17b36fedSimon Hosie/* Fill the convolution window with context data.  The aim here is to load
14575a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * exactly 2*r columns, and in the main loop to read as many columns as will be
14585a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * written.  This is complicated by the window being divided into chunks at
14595a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * register boundaries, and the need to handle cases when the input starts very
14605a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * close to the left or right (or both) edges of the image and the need to fill
14615a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the spaces that leaves with left and right edge padding values.
1462446788007efe0a673d0366284026adfa17b36fedSimon Hosie *
1463446788007efe0a673d0366284026adfa17b36fedSimon Hosie * Input:
1464446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x1 -- src
1465446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x2 -- pitch
1466446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x3 -- count
14675a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie *      x4 -- available image data right of src pointer
1468446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x5 -- r
1469446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x6 -- rup
1470446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x7 -- rdn
14715a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie *      x8 -- available image data left of src pointer
1472446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x9 -- buffer (if needed)
1473446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x13 = -pitch
1474446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x15 = top-row in
1475ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie *      x19 = bottom-row in
1476446788007efe0a673d0366284026adfa17b36fedSimon Hosie * Output:
14775a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie *      x4 -= min(inlen, count + windowsize - centertap)
14785a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie *      x1 += min(inlen, count + windowsize - centertap)
14795a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie *      x15 += min(inlen, count + windowsize - centertap)
14805a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie *      x19 += min(inlen, count + windowsize - centertap)
1481446788007efe0a673d0366284026adfa17b36fedSimon Hosie * Modifies:
1482446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x10 -- fill start index in the window
1483446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x11 -- fill stop index in the window
1484446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x12 -- scratch
1485446788007efe0a673d0366284026adfa17b36fedSimon Hosie */
14865a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro prefill step=1, max_r=25, label=xx
/* windowsize: 2 * max_r * step rounded up to a multiple of 16.
 * centertap:  windowsize - max_r * step.
 * Both are assembler symbols that the prefill_* stage macros read.
 */
14875a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
14885a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.set centertap, (windowsize - \max_r * \step)
            /* x10 = centertap - x8, clamped to zero when x8 > centertap. */
14895a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            mov         x10, #centertap
14905a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            subs        x10, x10, x8
14915a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            csel        x10, xzr, x10, lo
14925a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
            /* x11 = min(x4 + centertap, windowsize): the subtraction result
             * is replaced by zero when x4 >= windowsize - centertap, and
             * windowsize is then added back.
             */
14935a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            subs        x11, x4, #windowsize - centertap
14945a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            csel        x11, xzr, x11, hs
14955a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            add         x11, x11, #windowsize
14965a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
14975a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* x10 indicates where in the window legal image data begins.
14985a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * x11 indicates where in the window legal image data ends.
14995a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * When starting near the centre of a large image these would be
15005a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * zero and windowsize respectively, but when starting near the
15015a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * edges this can change.
15025a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * When starting on the leftmost pixel, x10 will be centertap.
15035a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * When starting on the rightmost pixel, x11 will be centertap+1.
15045a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
1505446788007efe0a673d0366284026adfa17b36fedSimon Hosie
15065a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* x4 indicates how much data there is between the current pointers
15075a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * and the right edge of the image.  The pointers currently point
15085a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * to the data needed at centertap.  The subsequent code will
15095a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * consume (windowsize - x10) data, but only the data from
15105a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * centertap to windowsize comes out of x4's budget.
15115a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
            /* The result is clamped to zero rather than allowed to wrap.
             * NOTE(review): nothing visible in this macro branches to the
             * `1:` label on the next line (the `1f` further down binds to
             * the later `1:`); confirm whether it is still needed.
             */
15125a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie1:          subs        x4, x4, #windowsize - centertap
15135a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            csel        x4, xzr, x4, lo
15145a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
15155a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* And the pointers need to rewind to the start of the window.
15165a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
15175a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            sub         x1, x1, #centertap
15185a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            sub         x15, x15, #centertap
15195a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            sub         x19, x19, #centertap
15205a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
15215a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* Unless x8 indicated that there wasn't that much data available.
15225a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
1523446788007efe0a673d0366284026adfa17b36fedSimon Hosie            add         x1, x1, x10
1524446788007efe0a673d0366284026adfa17b36fedSimon Hosie            add         x15, x15, x10
1525ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie            add         x19, x19, x10
15265a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
15275a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* Get the first chunk, and add padding to align it to the window
15285a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * if necessary.
1529446788007efe0a673d0366284026adfa17b36fedSimon Hosie             */
15305a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bl          fetch_clampleft\step
1531446788007efe0a673d0366284026adfa17b36fedSimon Hosie
15325a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* Sometimes the start and the end of the window are in the same
15335a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * chunk.  In that case both ends need filler at the outset.
15345a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
            /* x12 = x10 XOR (x11 - 1).  A value below 16 means the two
             * indices agree in every bit above the low four, i.e. the first
             * and last valid positions share one 16-aligned chunk.
             */
15355a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            sub         x12, x11, #1
15365a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            eor         x12,  x10, x12
15375a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            cmp         x12, #16
15385a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bhs         1f
15395a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bl          prefill_sweepright\step
15405a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
15415a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* Iterate through all the points in the window and fill them in
15425a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * with padding or image data as needed.
15435a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
15445a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie1:          prefill_body \max_r, \step, \label
1545446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm
1546446788007efe0a673d0366284026adfa17b36fedSimon Hosie
15475a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie/* The main body of the convolve functions.  Having already pre-filled the
15485a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * convolution window with 2*r input values, the logic settles into a regular
15495a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * pattern of reading and writing at a 1:1 rate until either input or output
15505a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * expires.  The input leads the output by r values, so when processing all the
15515a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * way to the right-hand edge, or within r pixels of that edge, the input will
15525a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * run out first.  In the case of very narrow images, or sub-windows starting
15535a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * near the right edge, the input may already have run out while the
15545a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * convolution window was being filled and this loop will start with a
15555a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * zero-length input.
15565a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie *
15575a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * Once the input runs out, the rest of the output must be processed by padding
15585a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the remainder of the window with pad value from the last valid pixel from
15595a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie * the source.
1560446788007efe0a673d0366284026adfa17b36fedSimon Hosie *
1561446788007efe0a673d0366284026adfa17b36fedSimon Hosie * Input:
1562446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x0 = dst
1563446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x1 = src
1564446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x2 = pitch
1565446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x3 = count
1566446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x4 = inlen
1567446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x5 = r
1568446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x6 = rup
1569446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x7 = rdn
1570446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x9 = buffer
1571446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x13 = -pitch
1572446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x15 = top-row in
1573ea76eb386a2d851d50be69ebeb7ae593f84a5be9Simon Hosie *      x19 = bottom-row in
1574446788007efe0a673d0366284026adfa17b36fedSimon Hosie * Modifies
1575446788007efe0a673d0366284026adfa17b36fedSimon Hosie *      x8 = fetch code pointer
1576446788007efe0a673d0366284026adfa17b36fedSimon Hosie */
15775a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
            /* Macro arguments:
             *   core    -- macro invoked for each horizontal pass; each
             *              invocation is followed by a store from v15
             *   step    -- selects the fetch_clampright variant and the lane
             *              width used when replicating padding
             *   max_r   -- forwarded to the fetch macro
             *   labelc  -- label from which the fetch entry point is derived
             *              when r differs from rup or rdn
             *   labelnc -- label from which it is derived when r equals both
             */
15785a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
15795a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* If x4 >= x3 then there's no need for clipping.  The main loop
15805a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * needs to exit when either x3 or x4 runs out, so clamp x4 to be
15815a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * no greater than x3 and use x4 for the loop.
15825a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * However, if x4 comes out of the loop with less than 16 bytes
15835a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * left, a partial read would be necessary to avoid reading beyond
15845a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * the end of the image.  To avoid this, clamp x4 to the next
15855a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * multiple of 16, which is still sufficient to force it out of the
15865a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * loop but doesn't imply a rewind.
15875a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
15885a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            add         x12, x3, #15
15895a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bic         x12, x12, #15
15905a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            cmp         x4, x12
15915a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            csel        x4, x12, x4, hi
15925a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
15935a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* First calculate the entry-point into the internal fetch logic.
15945a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * This is done so the same function can service several kernel
15955a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * sizes.
15965a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
            /* x8 = labelnc - r * 40 (r * 32 plus r * 8).
             * NOTE(review): 40 is presumably the number of code bytes per
             * unit of r in that fetch table -- confirm against the fetch
             * macro.
             */
1597446788007efe0a673d0366284026adfa17b36fedSimon Hosie            adrp        x8, \labelnc
1598446788007efe0a673d0366284026adfa17b36fedSimon Hosie            add         x8, x8, #:lo12:\labelnc
1599446788007efe0a673d0366284026adfa17b36fedSimon Hosie            sub         x8, x8, x5, LSL #5
1600446788007efe0a673d0366284026adfa17b36fedSimon Hosie            sub         x8, x8, x5, LSL #3
            /* The second comparison only happens if the first was equal;
             * otherwise the flags are forced to "not equal".  So the branch
             * is taken only when r == rup and r == rdn.
             */
1601446788007efe0a673d0366284026adfa17b36fedSimon Hosie            cmp         x5, x6
1602446788007efe0a673d0366284026adfa17b36fedSimon Hosie            ccmp        x5, x7, #0, eq
1603446788007efe0a673d0366284026adfa17b36fedSimon Hosie            beq         5f
1604446788007efe0a673d0366284026adfa17b36fedSimon Hosie
1605446788007efe0a673d0366284026adfa17b36fedSimon Hosie            /* if (r != rup || r != rdn) then the address-clamping table should
1606446788007efe0a673d0366284026adfa17b36fedSimon Hosie             * be used rather than the short-cut version.
1607446788007efe0a673d0366284026adfa17b36fedSimon Hosie             */
            /* x8 = labelc - r * 56 (r * 64 minus r * 8).
             * NOTE(review): as above, the multiplier presumably matches the
             * code size per unit of r in the clamping table -- confirm.
             */
1608446788007efe0a673d0366284026adfa17b36fedSimon Hosie            adrp        x8, \labelc
1609446788007efe0a673d0366284026adfa17b36fedSimon Hosie            add         x8, x8, #:lo12:\labelc
1610446788007efe0a673d0366284026adfa17b36fedSimon Hosie            sub         x8, x8, x5, LSL #6
1611446788007efe0a673d0366284026adfa17b36fedSimon Hosie            add         x8, x8, x5, LSL #3
            /* Enter the loop at its test, so x4 is checked before the first
             * iteration runs.
             */
1612446788007efe0a673d0366284026adfa17b36fedSimon Hosie            b           5f
1613446788007efe0a673d0366284026adfa17b36fedSimon Hosie
16145a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* Main loop: ... */
16155a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            .align  4
16165a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie3:          /* first perform a vertical convolution from memory to get the next
16175a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * 16 taps of the horizontal window into the register file...
16185a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
16195a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
16205a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
16215a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* ...then perform a horizontal convolution on that window to
16225a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * produce eight output bytes, and slide the window along.
16235a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * This has to be done twice to match the 16-way vertical pass.
16245a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * It would be preferable to have twice the work done in \core, but
16255a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * that would demand yet another variant on those macros and would
16265a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * perturb the register allocation severely.
1627446788007efe0a673d0366284026adfa17b36fedSimon Hosie             */
1628446788007efe0a673d0366284026adfa17b36fedSimon Hosie            \core
1629446788007efe0a673d0366284026adfa17b36fedSimon Hosie            st1         {v15.8b}, [x0], #8
1630446788007efe0a673d0366284026adfa17b36fedSimon Hosie            \core
1631446788007efe0a673d0366284026adfa17b36fedSimon Hosie            st1         {v15.8b}, [x0], #8
1632446788007efe0a673d0366284026adfa17b36fedSimon Hosie
            /* Sixteen outputs were written; keep looping while more than 16
             * input values remained before this decrement of x4.
             */
1633446788007efe0a673d0366284026adfa17b36fedSimon Hosie            sub         x3, x3, #16
1634446788007efe0a673d0366284026adfa17b36fedSimon Hosie5:          subs        x4, x4, #16
16355a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bhi         3b
16365a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* Here there's 16 or fewer bytes available before the edge of the
16375a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * source image.  x4 holds that count minus 16 (because it was
16385a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * decremented before the first iteration ran).  The last read may
16395a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * not be a whole chunk, and beyond that a fill value must be used.
16405a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             *
16415a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * Of course, none of that matters if there's no more output to
16425a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * produce...
16435a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
16445a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            cbz         x3, 5f
16455a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
16465a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* Oh well. */
            /* Undo the last decrement so x4 is the remaining input count.
             * If that is zero there is nothing left to read: fill v10 and
             * v11 by replicating the top element of v9 (one 16-bit lane for
             * step 1, the upper 64 bits otherwise) and skip the final fetch.
             */
1647446788007efe0a673d0366284026adfa17b36fedSimon Hosie            adds        x4, x4, #16
1648446788007efe0a673d0366284026adfa17b36fedSimon Hosie            bne         1f
1649446788007efe0a673d0366284026adfa17b36fedSimon Hosie  .if \step==1
1650446788007efe0a673d0366284026adfa17b36fedSimon Hosie            dup         v10.8h, v9.h[7]
1651446788007efe0a673d0366284026adfa17b36fedSimon Hosie            dup         v11.8h, v9.h[7]
1652446788007efe0a673d0366284026adfa17b36fedSimon Hosie  .else
1653446788007efe0a673d0366284026adfa17b36fedSimon Hosie            dup         v10.2d, v9.d[1]
1654446788007efe0a673d0366284026adfa17b36fedSimon Hosie            dup         v11.2d, v9.d[1]
1655446788007efe0a673d0366284026adfa17b36fedSimon Hosie  .endif
16565a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            b           3f
1657446788007efe0a673d0366284026adfa17b36fedSimon Hosie
16585a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* To avoid reading past end of input, rewind pointers by (16-x4)
16595a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * to ensure that they're exactly 16 bytes from the edge.
16605a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
            /* The remaining count is handed to fetch_clampright in x11. */
16615a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie1:          mov         x11, x4
16625a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bl          fetch_clampright\step
16635a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* Now to put this padding to use, perform any remaining
16645a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * iterations.  This is done at half the rate of the main loop,
16655a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * because there's no longer pressure from a 16-lane window filler.
16665a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
            /* After each pass v11 is refilled with copies of its own top
             * element.  The store below does not alter the flags, so both
             * conditional branches test the result of the subtraction on x3:
             * below zero means fewer than eight outputs were left (partial
             * store), zero means the output is complete.
             */
1667446788007efe0a673d0366284026adfa17b36fedSimon Hosie3:          \core
1668446788007efe0a673d0366284026adfa17b36fedSimon Hosie  .if \step==1
1669446788007efe0a673d0366284026adfa17b36fedSimon Hosie            dup         v11.8h, v11.h[7]
1670446788007efe0a673d0366284026adfa17b36fedSimon Hosie  .else
1671446788007efe0a673d0366284026adfa17b36fedSimon Hosie            dup         v11.2d, v11.d[1]
1672446788007efe0a673d0366284026adfa17b36fedSimon Hosie  .endif
1673446788007efe0a673d0366284026adfa17b36fedSimon Hosie            subs        x3, x3, #8
1674446788007efe0a673d0366284026adfa17b36fedSimon Hosie            blo         4f
1675446788007efe0a673d0366284026adfa17b36fedSimon Hosie            st1         {v15.8b}, [x0], #8
16765a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            bne         3b
16775a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            b           5f
16785a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
16795a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie            /* If the final iteration contained 0 < l < 8 values, then perform
16805a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             * a piecewise store of the final vector.
16815a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie             */
            /* x3 is now (l - 8), whose low three bits equal those of l.
             * Bits 2, 1 and 0 select a 4-, 2- and 1-byte store in turn; after
             * each store v15 is rotated so the next unwritten byte moves to
             * lane 0.
             */
1682446788007efe0a673d0366284026adfa17b36fedSimon Hosie4:          tbz         x3, #2, 1f
1683446788007efe0a673d0366284026adfa17b36fedSimon Hosie            st1         {v15.s}[0], [x0], #4
168428c034238b8891398da625b070279c34185b3494Simon Hosie            ext         v15.8b, v15.8b, v15.8b, #4
1685446788007efe0a673d0366284026adfa17b36fedSimon Hosie1:          tbz         x3, #1, 1f
1686446788007efe0a673d0366284026adfa17b36fedSimon Hosie            st1         {v15.h}[0], [x0], #2
168728c034238b8891398da625b070279c34185b3494Simon Hosie            ext         v15.8b, v15.8b, v15.8b, #2
1688446788007efe0a673d0366284026adfa17b36fedSimon Hosie1:          tbz         x3, #0, 5f
1689446788007efe0a673d0366284026adfa17b36fedSimon Hosie            st1         {v15.b}[0], [x0], #1
169028c034238b8891398da625b070279c34185b3494Simon Hosie            ext         v15.8b, v15.8b, v15.8b, #1
            /* Common exit: x0 is set to zero.
             * NOTE(review): presumably the value returned to the caller --
             * confirm against the C-side declaration.
             */
16915a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie5:          mov         x0, #0
1692446788007efe0a673d0366284026adfa17b36fedSimon Hosie.endm
1693446788007efe0a673d0366284026adfa17b36fedSimon Hosie
16945a1f196d68d54513c081958adf4ce3dcafed9ea2Simon Hosie
/* Single-channel convolution workers.
 *
 * One file-private routine, convolve1_<r>, is generated for every radius in
 * TUNED_LIST1 and for 25.  rsdIntrinsicBlurU1_K reaches these with a plain
 * branch after loading x30 with its own return label, so the final `ret`
 * resumes in that dispatcher's epilogue.
 *
 * The register contract on entry is whatever the prefill and conv_body macros
 * (defined earlier in this file) consume; this wrapper only supplies the
 * frame around them.
 */
.irp r, TUNED_LIST1, 25
PRIVATE(convolve1_\r)
            /* Preserve x29/x30 across the macro bodies.
             * NOTE(review): presumably the expanded code clobbers them --
             * confirm against prefill/conv_body.
             */
            stp         x29,x30, [sp, #-16]!

            prefill     step=1, max_r=\r, label=.Lcnv1_\r

            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r

            ldp         x29,x30, [sp], #16
            ret
END(convolve1_\r)
.endr
1707446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* Four-channel convolution workers.
 *
 * One file-private routine, convolve4_<r>, is generated for every radius in
 * TUNED_LIST4 and for 25.  rsdIntrinsicBlurU4_K reaches these with a plain
 * branch after loading x30 with its own return label, so the final `ret`
 * resumes in that dispatcher's epilogue.
 *
 * Frame layout (0xd0 bytes = 16 + 0x40 + 0x80):
 *   [sp, #0]   saved x29, x30
 *   above that a 0x40-byte scratch buffer at x9, placed so that its address
 *   is a multiple of 0x80.  The extra 0x80 bytes of frame are the slack
 *   needed to round the buffer address down to that alignment.
 */
.irp r, TUNED_LIST4, 25
PRIVATE(convolve4_\r)
            sub         x9, sp, #0x40               // candidate buffer base, taken from the incoming sp
            stp         x29,x30, [sp, #-(16 + 0x40 + 0x80)]!
            bic         x9, x9, #0x7f               // round down to a 128-byte boundary

            /* x9 now points to a 0x40 byte buffer on the stack whose address
             * has the low 7 bits clear.  This allows easy address calculation
             * in the wrap-around cases.
             *
             * Rounding down moves x9 by at most 0x7f, so the buffer starts no
             * lower than (incoming sp - 0xbf), which is above the 16 bytes at
             * the new sp (incoming sp - 0xd0) that hold x29/x30.
             */

            prefill     step=4, max_r=\r, label=.Lcnv4_\r

            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r

            ldp         x29,x30, [sp], #(16 + 0x40 + 0x80)
            ret
END(convolve4_\r)
.endr
1727446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* void rsdIntrinsicBlurU1_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 *
 * Entry point for the single-channel blur.  It saves the callee-saved
 * registers it is about to hand over (x19 and the low halves of v8-v15),
 * rearranges the arguments into the layout used by the convolve1_<r>
 * workers, and branches to the first worker whose radius is >= r.
 *
 * Register state at the branch to the worker:
 *   x0  = out (untouched)
 *   x1  = in + x
 *   x2  = p (pitch)
 *   x3  = count
 *   x4  = w - x
 *   x5  = r
 *   x6  = min(r, y)
 *   x7  = min(r, h - y - 1)
 *   x8  = x
 *   x12 = tab + 64 bytes (the first 32 halfwords are already in v0-v3)
 *   x13 = -p
 *   x15 = (in + x) - p * min(r, y)
 *   x19 = (in + x) + p * min(r, h - y - 1)
 *   x30 = address of the epilogue below
 */
ENTRY(rsdIntrinsicBlurU1_K)
            /* Build an 80-byte frame: x19/x30 in the top 16 bytes, and the
             * low 64 bits of v8-v15 (the part AAPCS64 makes callee-saved) in
             * the 64 bytes beneath.  x8 is computed before sp moves so that
             * it ends up addressing the upper half of that 64-byte area.
             */
            stp         x19,x30, [sp, #-16]!
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]

            /* The caller's stack arguments now sit 80 bytes above sp. */
            mov         x8, x5          // x
            ldr         w5, [sp,#80]    // r (low 32 bits only; zero-extended into x5)
            sub         x9, x2, x8      // w - x
            sub         x10, x3, x6     // h - y
            mov         x2, x4          // pitch
            mov         x3, x7          // count
            sub         x7, x10, #1     // h - y - 1
            mov         x4, x9          // inlen = (w - x)

            ldr         x12, [sp, #88] // tab

            add         x1, x1, x8      // src += x

            /* Clamp the vertical reach to the rows that actually exist above
             * and below row y (unsigned comparisons).
             */
            cmp         x6, x5
            csel        x6, x5, x6, hs  // rup = min(r, y)
            cmp         x7, x5
            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)

            sub         x13, xzr, x2    // -pitch
            msub        x15, x2, x6, x1 // src - pitch * rup
            madd        x19, x2, x7, x1 // src + pitch * rdn

            /* Preload the first 32 table entries into v0-v3; x12 is left
             * pointing at the remainder of the table.
             */
            ld1         {v0.8h,v1.8h}, [x12], #32
            ld1         {v2.8h,v3.8h}, [x12], #32

            /* Dispatch by branch rather than call: x30 is set by hand so that
             * the worker's `ret` lands on the epilogue at 1: below.  The
             * tests pick the first listed radius that is >= r.
             * NOTE(review): this assumes TUNED_LIST1 is in ascending order --
             * confirm at its definition.
             */
            adr         x30, 1f
  .irp r, TUNED_LIST1
            cmp         x5, #\r
            bls         convolve1_\r
  .endr
            b           convolve1_25    // no tuned radius was large enough

            /* Epilogue: undo the frame built on entry. */
1:          ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU1_K)
1783446788007efe0a673d0366284026adfa17b36fedSimon Hosie
/* void rsdIntrinsicBlurU4_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 *
 * Entry point for the four-channel blur.  It saves the callee-saved
 * registers it is about to hand over (x19 and the low halves of v8-v15),
 * rearranges the arguments into the layout used by the convolve4_<r>
 * workers, and branches to the first worker whose radius is >= r.
 *
 * The horizontal quantities x, w and count are multiplied by 4 before use;
 * the pitch p and the vertical quantities are used as passed.
 * NOTE(review): presumably x/w/count arrive in pixels of 4 bytes each and p
 * is already in bytes -- confirm against the caller.
 *
 * Register state at the branch to the worker:
 *   x0  = out (untouched)
 *   x1  = in + 4*x
 *   x2  = p (pitch)
 *   x3  = 4*count
 *   x4  = 4*w - 4*x
 *   x5  = r
 *   x6  = min(r, y)
 *   x7  = min(r, h - y - 1)
 *   x8  = 4*x
 *   x12 = tab + 64 bytes (the first 32 halfwords are already in v0-v3)
 *   x13 = -p
 *   x15 = (in + 4*x) - p * min(r, y)
 *   x19 = (in + 4*x) + p * min(r, h - y - 1)
 *   x30 = address of the epilogue below
 */
ENTRY(rsdIntrinsicBlurU4_K)
            /* Build an 80-byte frame: x19/x30 in the top 16 bytes, and the
             * low 64 bits of v8-v15 (the part AAPCS64 makes callee-saved) in
             * the 64 bytes beneath.  x8 is computed before sp moves so that
             * it ends up addressing the upper half of that 64-byte area.
             */
            stp         x19,x30, [sp, #-16]!
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]

            /* The caller's stack arguments now sit 80 bytes above sp. */
            lsl         x8, x5, #2      // x * 4
            lsl         x2, x2, #2      // w * 4
            ldr         w5, [sp,#80]    // r (low 32 bits only; zero-extended into x5)
            sub         x9, x2, x8      // (w - x) * 4
            sub         x10, x3, x6     // h - y
            mov         x2, x4          // pitch
            lsl         x3, x7, #2      // count * 4
            sub         x7, x10, #1     // h - y - 1
            mov         x4, x9          // inlen = (w - x) * 4

            ldr         x12, [sp, #88]  // tab

            add         x1, x1, x8      // in += x * 4

            /* Clamp the vertical reach to the rows that actually exist above
             * and below row y (unsigned comparisons).
             */
            cmp         x6, x5
            csel        x6, x5, x6, hs  // rup = min(r, y)
            cmp         x7, x5
            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)


            sub         x13, xzr, x2    // -pitch
            msub        x15, x2, x6, x1 // in - pitch * rup
            madd        x19, x2, x7, x1 // in + pitch * rdn

            /* Preload the first 32 table entries into v0-v3; x12 is left
             * pointing at the remainder of the table.
             */
            ld1         {v0.8h,v1.8h}, [x12], #32
            ld1         {v2.8h,v3.8h}, [x12], #32

            /* Dispatch by branch rather than call: x30 is set by hand so that
             * the worker's `ret` lands on the epilogue at 1: below.  The
             * tests pick the first listed radius that is >= r.
             * NOTE(review): this assumes TUNED_LIST4 is in ascending order --
             * confirm at its definition.
             */
            adr         x30, 1f
  .irp r, TUNED_LIST4
            cmp         x5, #\r
            bls         convolve4_\r
  .endr
            b           convolve4_25    // no tuned radius was large enough

            /* Epilogue: undo the frame built on entry. */
1:          ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU4_K)
1841