rsCpuIntrinsics_advsimd_Blur.S revision 446788007efe0a673d0366284026adfa17b36fed
1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
// ENTRY(f): switch to .text, apply .align 4, export f and mark it as a
// function symbol, then place the label f.
17#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
// END(f): set the symbol size of f to the distance from its label to here.
18#define END(f) .size f, .-f;
19
// Fixed-point fraction width: the fetch macro narrows its sums with a shift of
// 16 - FRACTION_BITS and the hconv macros apply the remaining FRACTION_BITS.
20.set FRACTION_BITS, 7
// Default value of the max_r argument of the fetch macro.
21.set MAX_R, 25
22
23
24/* A quick way of making a line of code conditional on some other condition.
25 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
26 * `ifcc`:
27 */
28.macro ifcc zzz:vararg
            // Pass the wrapped statement through only while the symbol cc is non-zero.
29.ifne cc
30            \zzz
31.endif
32.endm
33
34/* Fetch 16 columns of bytes (regardless of image format), convolve these
35 * vertically, and leave them in the register file.  If working near the top or
36 * bottom of an image then clamp the addressing while loading the data in.
37 *
38 * The convolution is fully unrolled for windows up to max_r, with the
39 * outermost edges calculated first.  This way it's possible to branch directly
40 * into the relevant part of the code for an arbitrary convolution radius.  Two
41 * variants of the loop are produced; one eliminates the clamping code for a
42 * slight speed advantage.
43 *
44 * Where the macro is called with reg=x, the specified register is taken to
45 * contain a pre-calculated pointer into one of the two loops.
46 *
47 * Input:
48 *      x1 -- src
49 *      x2 -- pitch
50 *      x5 -- r
51 *      x6 -- rup
52 *      x7 -- rdn
53 *      x12 -- switch index
54 *      q0-q3 -- coefficient table
55 *      x13 = -pitch
56 *      x15 = top-row in
57 *      x16 = bottom-row in
58 * Output:
59 *      x1 += 16
60 *      q10,q11 -- 16 convolved columns
61 * Modifies:
62 *      x10 = upper row pointer
63 *      x11 = lower row pointer
64 *      q12-q15 = temporary sums
65 */
66.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
            // cc = 1 only when reg is x12 (the default).  In that case the ifcc
            // lines below compute the entry address into the clamped loop;
            // otherwise they are dropped and reg is used as supplied.
67  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
68
            // Load the 16 centre-row bytes, advance src, and start the upper
            // row pointer at x15.
69            ld1         {v15.16b}, [x1], #16
70            mov         x10, x15
71
            // Widen the centre row to 16 bits: v14 = low 8 columns, v15 = high 8.
72            uxtl        v14.8h, v15.8b
73//            prfm        PLDL1KEEP,[x1, #16] // TODO: confirm
74            uxtl2       v15.8h, v15.16b
            // Take the address of local label 1 (the end of the clamped loop).
75  .if \max_r < 16 // approximate
76    ifcc    adr         \reg, 1f
77  .else
78    ifcc    adrp        \reg, 1f
79    ifcc    add         \reg, \reg, #:lo12:1f
80  .endif
81
            // Centre tap: v12-v15 = centre row * v0.h[0] as 32-bit sums.
            // Interleaved with it: reg = label 1 - 64*x5 + 8*x5, i.e. 56 bytes
            // back per unit of radius, which is the size of one clamped tap
            // (14 instructions).  x11 starts at x16.
82            umull       v12.4s, v14.4h, v0.h[0]
83    ifcc    sub         \reg, \reg, x5, LSL #6
84            umull2      v13.4s, v14.8h, v0.h[0]
85            mov         x11, x16
86            umull       v14.4s, v15.4h, v0.h[0]
87    ifcc    add         \reg, \reg, x5, LSL #3
88            umull2      v15.4s, v15.8h, v0.h[0]
            // Enter the unrolled taps at the point matching the radius.
89            br          \reg
90
            // Two copies of the unrolled taps are emitted: first with cc = 1
            // (row clamping included), then with cc = 0 (clamping omitted).
91  .irp rowclamp, 1, 0
92    .set cc, \rowclamp
93    .align 4
            // Taps are generated from i = max_r down to i = 1; the coefficient
            // for tap i is lane (i mod 8) of register v(i / 8).
94    .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
95        .set i, \dreg * 8 + \lane
96        .if 0 < i && i <= \max_r
            // Load one row through each pointer, stepping x10 by x2 and x11 by
            // x13.  When clamping: if x6 < i (unsigned) x10 is reset to x15,
            // and if x7 < i x11 is reset to x16.
97            ld1         {v10.16b}, [x10], x2
98    ifcc    cmp         x6, #i
99            ld1         {v11.16b}, [x11], x13
100    ifcc    csel        x10, x15, x10, lo
            // Sum the two rows as 16-bit values (v16 = low 8 columns,
            // v11 = high 8) and accumulate sum * coefficient into v12-v15.
101            uaddl       v16.8h, v10.8b, v11.8b
102    ifcc    cmp         x7, #i
103            uaddl2      v11.8h, v10.16b, v11.16b
104    ifcc    csel        x11, x16, x11, lo
105            umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
106            umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
            // NOTE(review): the nop instructions count towards the fixed per-tap
            // size used by the branch computation above; presumably they stand
            // in for the disabled prefetches -- confirm before removing.
107//            prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
108nop
109            umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
110//            prfm        PLDL1KEEP,[x11, #32] // TODO: confirm
111nop
112            umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
113        .endif
114    .endr ; .endr ; .endr
            // End of the clamped copy: define label 1 and labelc, then branch
            // over the unclamped copy.  End of the unclamped copy: define
            // label 2 and labelnc and fall through.
115    .if \rowclamp == 1
116        1: \labelc :
117            b           2f
118    .else
119        2: \labelnc :
120    .endif
121  .endr
122
            // Narrow the 32-bit sums to 16 bits with rounding and saturation
            // (v10 = low 8 columns, v11 = high 8) and advance both row base
            // pointers by 16 bytes.
123            uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
124            add         x15, x15, #16
125            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
126            add         x16, x16, #16
127            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
128            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
129.endm /*}}}*/
130
131/* Some portion of the convolution window (as much as will fit, and all of it
132 * for the uchar1 cases) is kept in the register file to avoid unnecessary
133 * memory accesses.  This forces the horizontal loops to be unrolled because
134 * there's no indexed addressing into the register file.
135 *
136 * As in the fetch macro, the operations are ordered from outside to inside, so
137 * that jumping into the middle of the block bypasses the unwanted window taps.
138 *
139 * There are several variants of the macro because of the fixed offsets of the
140 * taps -- the wider the maximum radius the further the centre tap is from the
141 * most recently fetched data.  This means that pre-filling the window requires
142 * more data that won't be used and it means that rotating the window involves
143 * more mov operations.
144 *
145 * When the window gets too big for the register file the buffer at [x9] is used.
146 *
147 * Input:
148 *      v4-v11 -- convolution window (hconv1_25 also uses the upper half of v3)
149 *      x9 -- pointer to additional convolution window data
150 * Output:
151 *      x9 -- updated buffer pointer (if used)
152 *      v15.8b -- result to be stored
153 * Modifies:
154 *      x12 -- temp buffer pointer and jump-table target
155 *      v12-v13 -- temporaries for load and ext operations.
156 *      v14-v15 -- intermediate sums
157 */
// NOTE(review): presumably the radii that have a dedicated single-channel
// variant below (hconv1_8, hconv1_16); the consumer is not in this chunk.
158#define TUNED_LIST1 8, 16
// hconv1_8: single-channel horizontal pass for radii 1..8.  The centre samples
// are v9; v8 and v10 supply the samples on either side.
159.macro hconv1_8/*{{{*/
            // Centre tap: v14/v15 = v9 * v0.h[0] as 32-bit sums.
160            umull       v14.4s, v9.4h, v0.h[0]
161            umull2      v15.4s, v9.8h, v0.h[0]
162
            // Jump to label 10N for radius N = x5.  The table base is biased by
            // -8 so that x5 = 1 selects the first entry; x5 is not range-checked.
163            adr         x12, 199f-8
164            ldr         x12, [x12, x5, LSL #3]
165            br          x12
166   199:     .xword 101f
167            .xword 102f
168            .xword 103f
169            .xword 104f
170            .xword 105f
171            .xword 106f
172            .xword 107f
173            .xword 108f
174            .align      4
            // Taps run from the outermost (8) to the innermost (1); entering at
            // 10N skips every tap wider than N.  Each tap adds the samples at
            // -N and +N multiplied by the same coefficient.
175    108:    umlal       v14.4s, v8.4h, v1.h[0]
176            umlal2      v15.4s, v8.8h, v1.h[0]
177            umlal       v14.4s, v10.4h, v1.h[0]
178            umlal2      v15.4s, v10.8h, v1.h[0]
179    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
180            ext         v13.16b, v9.16b, v10.16b, #7*2
181            umlal       v14.4s, v12.4h, v0.h[7]
182            umlal2      v15.4s, v12.8h, v0.h[7]
183            umlal       v14.4s, v13.4h, v0.h[7]
184            umlal2      v15.4s, v13.8h, v0.h[7]
185    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
186            ext         v13.16b, v9.16b, v10.16b, #6*2
187            umlal       v14.4s, v12.4h, v0.h[6]
188            umlal2      v15.4s, v12.8h, v0.h[6]
189            umlal       v14.4s, v13.4h, v0.h[6]
190            umlal2      v15.4s, v13.8h, v0.h[6]
191    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
192            ext         v13.16b, v9.16b, v10.16b, #5*2
193            umlal       v14.4s, v12.4h, v0.h[5]
194            umlal2      v15.4s, v12.8h, v0.h[5]
195            umlal       v14.4s, v13.4h, v0.h[5]
196            umlal2      v15.4s, v13.8h, v0.h[5]
            // An offset of 4 is exactly half a register, so the halves are used
            // directly instead of going through ext.
197    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
198            //ext         v13.16b, v9.16b, v10.16b, #4*2
199            umlal2      v14.4s, v8.8h, v0.h[4]
200            umlal       v15.4s, v9.4h, v0.h[4]
201            umlal2      v14.4s, v9.8h, v0.h[4]
202            umlal       v15.4s, v10.4h, v0.h[4]
203    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
204            ext         v13.16b, v9.16b, v10.16b, #3*2
205            umlal       v14.4s, v12.4h, v0.h[3]
206            umlal2      v15.4s, v12.8h, v0.h[3]
207            umlal       v14.4s, v13.4h, v0.h[3]
208            umlal2      v15.4s, v13.8h, v0.h[3]
209    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
210            ext         v13.16b, v9.16b, v10.16b, #2*2
211            umlal       v14.4s, v12.4h, v0.h[2]
212            umlal2      v15.4s, v12.8h, v0.h[2]
213            umlal       v14.4s, v13.4h, v0.h[2]
214            umlal2      v15.4s, v13.8h, v0.h[2]
215    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
216            ext         v13.16b, v9.16b, v10.16b, #1*2
217            umlal       v14.4s, v12.4h, v0.h[1]
218            umlal2      v15.4s, v12.8h, v0.h[1]
219            umlal       v14.4s, v13.4h, v0.h[1]
220            umlal2      v15.4s, v13.8h, v0.h[1]
221
            // Narrow 32 -> 16 bits (shift 16), then 16 -> 8 bits (shift
            // FRACTION_BITS), both rounding and saturating; result in v15.8b.
222            uqrshrn     v14.4h, v14.4s, #16
223            uqrshrn2    v14.8h, v15.4s, #16
224            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
225
            // Slide the window along by one register.
226            mov         v8.16b, v9.16b
227            mov         v9.16b, v10.16b
228            mov         v10.16b, v11.16b
229.endm/*}}}*/
230
// hconv1_16: single-channel horizontal pass for radii 1..16.  The centre
// samples are v8; v6-v7 and v9-v10 supply the samples on either side.
231.macro hconv1_16/*{{{*/
            // Centre tap: v14/v15 = v8 * v0.h[0] as 32-bit sums.
232            umull       v14.4s, v8.4h, v0.h[0]
233            umull2      v15.4s, v8.8h, v0.h[0]
234
            // Jump to label 1NN for radius NN = x5.  The table base is biased by
            // -8 so that x5 = 1 selects the first entry; x5 is not range-checked.
235            adr         x12, 199f-8
236            ldr         x12, [x12, x5, LSL #3]
237            br          x12
238   199:     .xword 101f
239            .xword 102f
240            .xword 103f
241            .xword 104f
242            .xword 105f
243            .xword 106f
244            .xword 107f
245            .xword 108f
246            .xword 109f
247            .xword 110f
248            .xword 111f
249            .xword 112f
250            .xword 113f
251            .xword 114f
252            .xword 115f
253            .xword 116f
254            .align 4
            // Taps run from the outermost (16) to the innermost (1).  Offsets
            // that are a multiple of 8 use whole registers, offsets of 4 and 12
            // use register halves, and the rest are assembled with ext.
255    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
256            //ext         v13.16b, v10.16b, v11.16b, #0*2
257            umlal       v14.4s, v6.4h, v2.h[0]
258            umlal2      v15.4s, v6.8h, v2.h[0]
259            umlal       v14.4s, v10.4h, v2.h[0]
260            umlal2      v15.4s, v10.8h, v2.h[0]
261    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
262            ext         v13.16b, v9.16b, v10.16b, #7*2
263            umlal       v14.4s, v12.4h, v1.h[7]
264            umlal2      v15.4s, v12.8h, v1.h[7]
265            umlal       v14.4s, v13.4h, v1.h[7]
266            umlal2      v15.4s, v13.8h, v1.h[7]
267    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
268            ext         v13.16b, v9.16b, v10.16b, #6*2
269            umlal       v14.4s, v12.4h, v1.h[6]
270            umlal2      v15.4s, v12.8h, v1.h[6]
271            umlal       v14.4s, v13.4h, v1.h[6]
272            umlal2      v15.4s, v13.8h, v1.h[6]
273    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
274            ext         v13.16b, v9.16b, v10.16b, #5*2
275            umlal       v14.4s, v12.4h, v1.h[5]
276            umlal2      v15.4s, v12.8h, v1.h[5]
277            umlal       v14.4s, v13.4h, v1.h[5]
278            umlal2      v15.4s, v13.8h, v1.h[5]
279    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
280            //ext         v13.16b, v9.16b, v10.16b, #4*2
281            umlal2      v14.4s, v6.8h, v1.h[4]
282            umlal       v15.4s, v7.4h, v1.h[4]
283            umlal2      v14.4s, v9.8h, v1.h[4]
284            umlal       v15.4s, v10.4h, v1.h[4]
285    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
286            ext         v13.16b, v9.16b, v10.16b, #3*2
287            umlal       v14.4s, v12.4h, v1.h[3]
288            umlal2      v15.4s, v12.8h, v1.h[3]
289            umlal       v14.4s, v13.4h, v1.h[3]
290            umlal2      v15.4s, v13.8h, v1.h[3]
291    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
292            ext         v13.16b, v9.16b, v10.16b, #2*2
293            umlal       v14.4s, v12.4h, v1.h[2]
294            umlal2      v15.4s, v12.8h, v1.h[2]
295            umlal       v14.4s, v13.4h, v1.h[2]
296            umlal2      v15.4s, v13.8h, v1.h[2]
297    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
298            ext         v13.16b, v9.16b, v10.16b, #1*2
299            umlal       v14.4s, v12.4h, v1.h[1]
300            umlal2      v15.4s, v12.8h, v1.h[1]
301            umlal       v14.4s, v13.4h, v1.h[1]
302            umlal2      v15.4s, v13.8h, v1.h[1]
303    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
304            //ext         v13.16b, v9.16b, v10.16b, #0*2
305            umlal       v14.4s, v7.4h, v1.h[0]
306            umlal2      v15.4s, v7.8h, v1.h[0]
307            umlal       v14.4s, v9.4h, v1.h[0]
308            umlal2      v15.4s, v9.8h, v1.h[0]
309    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
310            ext         v13.16b, v8.16b, v9.16b, #7*2
311            umlal       v14.4s, v12.4h, v0.h[7]
312            umlal2      v15.4s, v12.8h, v0.h[7]
313            umlal       v14.4s, v13.4h, v0.h[7]
314            umlal2      v15.4s, v13.8h, v0.h[7]
315    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
316            ext         v13.16b, v8.16b, v9.16b, #6*2
317            umlal       v14.4s, v12.4h, v0.h[6]
318            umlal2      v15.4s, v12.8h, v0.h[6]
319            umlal       v14.4s, v13.4h, v0.h[6]
320            umlal2      v15.4s, v13.8h, v0.h[6]
321    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
322            ext         v13.16b, v8.16b, v9.16b, #5*2
323            umlal       v14.4s, v12.4h, v0.h[5]
324            umlal2      v15.4s, v12.8h, v0.h[5]
325            umlal       v14.4s, v13.4h, v0.h[5]
326            umlal2      v15.4s, v13.8h, v0.h[5]
327    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
328            //ext         v13.16b, v8.16b, v9.16b, #4*2
329            umlal2      v14.4s, v7.8h, v0.h[4]
330            umlal       v15.4s, v8.4h, v0.h[4]
331            umlal2      v14.4s, v8.8h, v0.h[4]
332            umlal       v15.4s, v9.4h, v0.h[4]
333    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
334            ext         v13.16b, v8.16b, v9.16b, #3*2
335            umlal       v14.4s, v12.4h, v0.h[3]
336            umlal2      v15.4s, v12.8h, v0.h[3]
337            umlal       v14.4s, v13.4h, v0.h[3]
338            umlal2      v15.4s, v13.8h, v0.h[3]
339    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
340            ext         v13.16b, v8.16b, v9.16b, #2*2
341            umlal       v14.4s, v12.4h, v0.h[2]
342            umlal2      v15.4s, v12.8h, v0.h[2]
343            umlal       v14.4s, v13.4h, v0.h[2]
344            umlal2      v15.4s, v13.8h, v0.h[2]
345    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
346            ext         v13.16b, v8.16b, v9.16b, #1*2
347            umlal       v14.4s, v12.4h, v0.h[1]
348            umlal2      v15.4s, v12.8h, v0.h[1]
349            umlal       v14.4s, v13.4h, v0.h[1]
350            umlal2      v15.4s, v13.8h, v0.h[1]
351
            // Narrow 32 -> 16 bits (shift 16), then 16 -> 8 bits (shift
            // FRACTION_BITS), both rounding and saturating; result in v15.8b.
352            uqrshrn     v14.4h, v14.4s, #16
353            uqrshrn2    v14.8h, v15.4s, #16
354            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
355
            // Slide the window along by one register.
356            mov         v6.16b, v7.16b
357            mov         v7.16b, v8.16b
358            mov         v8.16b, v9.16b
359            mov         v9.16b, v10.16b
360            mov         v10.16b, v11.16b
361.endm/*}}}*/
362
// hconv1_25: single-channel horizontal pass for radii 1..25.  The centre
// samples start at element 7 of v6.  The window occupies v4-v10 (v11 is the
// incoming register) plus elements 6-7 of v3, whose low half still holds the
// coefficients v3.h[0] and v3.h[1].
363.macro hconv1_25/*{{{*/
            // Centre tap: v14/v15 = (v6:v7 starting at element 7) * v0.h[0].
364            ext         v12.16b, v6.16b, v7.16b, #7*2
365            umull       v14.4s, v12.4h, v0.h[0]
366            umull2      v15.4s, v12.8h, v0.h[0]
367
            // Jump to label 1NN for radius NN = x5.  The table base is biased by
            // -8 so that x5 = 1 selects the first entry; x5 is not range-checked.
368            adr         x12, 199f-8
369            ldr         x12, [x12, x5, LSL #3]
370            br          x12
371   199:     .xword 101f
372            .xword 102f
373            .xword 103f
374            .xword 104f
375            .xword 105f
376            .xword 106f
377            .xword 107f
378            .xword 108f
379            .xword 109f
380            .xword 110f
381            .xword 111f
382            .xword 112f
383            .xword 113f
384            .xword 114f
385            .xword 115f
386            .xword 116f
387            .xword 117f
388            .xword 118f
389            .xword 119f
390            .xword 120f
391            .xword 121f
392            .xword 122f
393            .xword 123f
394            .xword 124f
395            .xword 125f
396            .align 4
            // Taps run from the outermost (25) to the innermost (1).  Taps 125
            // and 124 are the only ones that read window data from v3
            // (elements 6 and 7), which must directly precede v4 element 0.
397    125:    ext         v12.16b, v3.16b, v4.16b, #6*2
398            ext         v13.16b, v10.16b, v11.16b, #0*2
399            umlal       v14.4s, v12.4h, v3.h[1]
400            umlal2      v15.4s, v12.8h, v3.h[1]
401            umlal       v14.4s, v13.4h, v3.h[1]
402            umlal2      v15.4s, v13.8h, v3.h[1]
403    124:    ext         v12.16b, v3.16b, v4.16b, #7*2
404            ext         v13.16b, v9.16b, v10.16b, #7*2
405            umlal       v14.4s, v12.4h, v3.h[0]
406            umlal2      v15.4s, v12.8h, v3.h[0]
407            umlal       v14.4s, v13.4h, v3.h[0]
408            umlal2      v15.4s, v13.8h, v3.h[0]
409    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
410            ext         v13.16b, v9.16b, v10.16b, #6*2
411            umlal       v14.4s, v12.4h, v2.h[7]
412            umlal2      v15.4s, v12.8h, v2.h[7]
413            umlal       v14.4s, v13.4h, v2.h[7]
414            umlal2      v15.4s, v13.8h, v2.h[7]
415    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
416            ext         v13.16b, v9.16b, v10.16b, #5*2
417            umlal       v14.4s, v12.4h, v2.h[6]
418            umlal2      v15.4s, v12.8h, v2.h[6]
419            umlal       v14.4s, v13.4h, v2.h[6]
420            umlal2      v15.4s, v13.8h, v2.h[6]
421    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
422            ext         v13.16b, v9.16b, v10.16b, #4*2
423            umlal       v14.4s, v12.4h, v2.h[5]
424            umlal2      v15.4s, v12.8h, v2.h[5]
425            umlal       v14.4s, v13.4h, v2.h[5]
426            umlal2      v15.4s, v13.8h, v2.h[5]
427    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
428            ext         v13.16b, v9.16b, v10.16b, #3*2
429            umlal       v14.4s, v12.4h, v2.h[4]
430            umlal2      v15.4s, v12.8h, v2.h[4]
431            umlal       v14.4s, v13.4h, v2.h[4]
432            umlal2      v15.4s, v13.8h, v2.h[4]
433    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
434            ext         v13.16b, v9.16b, v10.16b, #2*2
435            umlal       v14.4s, v12.4h, v2.h[3]
436            umlal2      v15.4s, v12.8h, v2.h[3]
437            umlal       v14.4s, v13.4h, v2.h[3]
438            umlal2      v15.4s, v13.8h, v2.h[3]
439    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
440            ext         v13.16b, v9.16b, v10.16b, #1*2
441            umlal       v14.4s, v12.4h, v2.h[2]
442            umlal2      v15.4s, v12.8h, v2.h[2]
443            umlal       v14.4s, v13.4h, v2.h[2]
444            umlal2      v15.4s, v13.8h, v2.h[2]
445    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
446            ext         v13.16b, v9.16b, v10.16b, #0*2
447            umlal       v14.4s, v12.4h, v2.h[1]
448            umlal2      v15.4s, v12.8h, v2.h[1]
449            umlal       v14.4s, v13.4h, v2.h[1]
450            umlal2      v15.4s, v13.8h, v2.h[1]
451    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
452            ext         v13.16b, v8.16b, v9.16b, #7*2
453            umlal       v14.4s, v12.4h, v2.h[0]
454            umlal2      v15.4s, v12.8h, v2.h[0]
455            umlal       v14.4s, v13.4h, v2.h[0]
456            umlal2      v15.4s, v13.8h, v2.h[0]
457    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
458            ext         v13.16b, v8.16b, v9.16b, #6*2
459            umlal       v14.4s, v12.4h, v1.h[7]
460            umlal2      v15.4s, v12.8h, v1.h[7]
461            umlal       v14.4s, v13.4h, v1.h[7]
462            umlal2      v15.4s, v13.8h, v1.h[7]
463    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
464            ext         v13.16b, v8.16b, v9.16b, #5*2
465            umlal       v14.4s, v12.4h, v1.h[6]
466            umlal2      v15.4s, v12.8h, v1.h[6]
467            umlal       v14.4s, v13.4h, v1.h[6]
468            umlal2      v15.4s, v13.8h, v1.h[6]
469    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
470            ext         v13.16b, v8.16b, v9.16b, #4*2
471            umlal       v14.4s, v12.4h, v1.h[5]
472            umlal2      v15.4s, v12.8h, v1.h[5]
473            umlal       v14.4s, v13.4h, v1.h[5]
474            umlal2      v15.4s, v13.8h, v1.h[5]
475    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
476            ext         v13.16b, v8.16b, v9.16b, #3*2
477            umlal       v14.4s, v12.4h, v1.h[4]
478            umlal2      v15.4s, v12.8h, v1.h[4]
479            umlal       v14.4s, v13.4h, v1.h[4]
480            umlal2      v15.4s, v13.8h, v1.h[4]
481    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
482            ext         v13.16b, v8.16b, v9.16b, #2*2
483            umlal       v14.4s, v12.4h, v1.h[3]
484            umlal2      v15.4s, v12.8h, v1.h[3]
485            umlal       v14.4s, v13.4h, v1.h[3]
486            umlal2      v15.4s, v13.8h, v1.h[3]
487    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
488            ext         v13.16b, v8.16b, v9.16b, #1*2
489            umlal       v14.4s, v12.4h, v1.h[2]
490            umlal2      v15.4s, v12.8h, v1.h[2]
491            umlal       v14.4s, v13.4h, v1.h[2]
492            umlal2      v15.4s, v13.8h, v1.h[2]
493    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
494            ext         v13.16b, v8.16b, v9.16b, #0*2
495            umlal       v14.4s, v12.4h, v1.h[1]
496            umlal2      v15.4s, v12.8h, v1.h[1]
497            umlal       v14.4s, v13.4h, v1.h[1]
498            umlal2      v15.4s, v13.8h, v1.h[1]
499    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
500            ext         v13.16b, v7.16b, v8.16b, #7*2
501            umlal       v14.4s, v12.4h, v1.h[0]
502            umlal2      v15.4s, v12.8h, v1.h[0]
503            umlal       v14.4s, v13.4h, v1.h[0]
504            umlal2      v15.4s, v13.8h, v1.h[0]
505    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
506            ext         v13.16b, v7.16b, v8.16b, #6*2
507            umlal       v14.4s, v12.4h, v0.h[7]
508            umlal2      v15.4s, v12.8h, v0.h[7]
509            umlal       v14.4s, v13.4h, v0.h[7]
510            umlal2      v15.4s, v13.8h, v0.h[7]
511    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
512            ext         v13.16b, v7.16b, v8.16b, #5*2
513            umlal       v14.4s, v12.4h, v0.h[6]
514            umlal2      v15.4s, v12.8h, v0.h[6]
515            umlal       v14.4s, v13.4h, v0.h[6]
516            umlal2      v15.4s, v13.8h, v0.h[6]
517    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
518            ext         v13.16b, v7.16b, v8.16b, #4*2
519            umlal       v14.4s, v12.4h, v0.h[5]
520            umlal2      v15.4s, v12.8h, v0.h[5]
521            umlal       v14.4s, v13.4h, v0.h[5]
522            umlal2      v15.4s, v13.8h, v0.h[5]
523    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
524            ext         v13.16b, v7.16b, v8.16b, #3*2
525            umlal       v14.4s, v12.4h, v0.h[4]
526            umlal2      v15.4s, v12.8h, v0.h[4]
527            umlal       v14.4s, v13.4h, v0.h[4]
528            umlal2      v15.4s, v13.8h, v0.h[4]
529    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
530            ext         v13.16b, v7.16b, v8.16b, #2*2
531            umlal       v14.4s, v12.4h, v0.h[3]
532            umlal2      v15.4s, v12.8h, v0.h[3]
533            umlal       v14.4s, v13.4h, v0.h[3]
534            umlal2      v15.4s, v13.8h, v0.h[3]
535    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
536            ext         v13.16b, v7.16b, v8.16b, #1*2
537            umlal       v14.4s, v12.4h, v0.h[2]
538            umlal2      v15.4s, v12.8h, v0.h[2]
539            umlal       v14.4s, v13.4h, v0.h[2]
540            umlal2      v15.4s, v13.8h, v0.h[2]
541    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
542            ext         v13.16b, v7.16b, v8.16b, #0*2
543            umlal       v14.4s, v12.4h, v0.h[1]
544            umlal2      v15.4s, v12.8h, v0.h[1]
545            umlal       v14.4s, v13.4h, v0.h[1]
546            umlal2      v15.4s, v13.8h, v0.h[1]
547
            // Narrow 32 -> 16 bits (shift 16), then 16 -> 8 bits (shift
            // FRACTION_BITS), both rounding and saturating; result in v15.8b.
548            uqrshrn     v14.4h, v14.4s, #16
549            uqrshrn2    v14.8h, v15.4s, #16
550            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
551
            // Slide the window along by one register.  The upper half of v3
            // must receive the UPPER half of the outgoing v4 (its elements
            // 4-7): taps 124/125 read v3 elements 6-7 as the samples directly
            // before the new v4.  Copying v4.d[0] would feed them old v4
            // elements 2-3 instead.  The low half of v3 (coefficients) is left
            // untouched.
552            ins         v3.d[1], v4.d[1]
553            mov         v4.16b, v5.16b
554            mov         v5.16b, v6.16b
555            mov         v6.16b, v7.16b
556            mov         v7.16b, v8.16b
557            mov         v8.16b, v9.16b
558            mov         v9.16b, v10.16b
559            mov         v10.16b, v11.16b
560.endm/*}}}*/
561
// NOTE(review): presumably the radii that have a dedicated four-channel
// variant below (hconv4_6, hconv4_12); the consumer is not in this chunk.
562#define TUNED_LIST4 6, 12
// hconv4_6: horizontal pass for radii 1..6 where each tap step is four 16-bit
// elements (half a register), so no ext is needed.  The centre samples are v7;
// v4-v6 and v8-v10 supply the samples on either side.
563.macro hconv4_6/*{{{*/
            // Centre tap: v14/v15 = v7 * v0.h[0] as 32-bit sums.
564            umull       v14.4s, v7.4h, v0.h[0]
565            umull2      v15.4s, v7.8h, v0.h[0]
566
            // Jump to label 10N for radius N = x5.  The table base is biased by
            // -8 so that x5 = 1 selects the first entry; x5 is not range-checked.
567            adr         x12, 199f-8
568            ldr         x12, [x12, x5, LSL #3]
569            br          x12
570   199:     .xword 101f
571            .xword 102f
572            .xword 103f
573            .xword 104f
574            .xword 105f
575            .xword 106f
576            .align      4
            // Taps run from the outermost (6) to the innermost (1).  Even taps
            // use whole registers; odd taps straddle two registers, taking the
            // high half of one (umlal2) and the low half of the next (umlal).
577    106:    umlal       v14.4s, v4.4h,  v0.h[6]
578            umlal2      v15.4s, v4.8h,  v0.h[6]
579            umlal       v14.4s, v10.4h, v0.h[6]
580            umlal2      v15.4s, v10.8h, v0.h[6]
581    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
582            umlal       v15.4s, v5.4h, v0.h[5]
583            umlal2      v14.4s, v9.8h, v0.h[5]
584            umlal       v15.4s, v10.4h, v0.h[5]
585    104:    umlal       v14.4s, v5.4h, v0.h[4]
586            umlal2      v15.4s, v5.8h, v0.h[4]
587            umlal       v14.4s, v9.4h, v0.h[4]
588            umlal2      v15.4s, v9.8h, v0.h[4]
589    103:    umlal2      v14.4s, v5.8h, v0.h[3]
590            umlal       v15.4s, v6.4h, v0.h[3]
591            umlal2      v14.4s, v8.8h, v0.h[3]
592            umlal       v15.4s, v9.4h, v0.h[3]
593    102:    umlal       v14.4s, v6.4h, v0.h[2]
594            umlal2      v15.4s, v6.8h, v0.h[2]
595            umlal       v14.4s, v8.4h, v0.h[2]
596            umlal2      v15.4s, v8.8h, v0.h[2]
597    101:    umlal2      v14.4s, v6.8h, v0.h[1]
598            umlal       v15.4s, v7.4h, v0.h[1]
599            umlal2      v14.4s, v7.8h, v0.h[1]
600            umlal       v15.4s, v8.4h, v0.h[1]
601
            // Narrow 32 -> 16 bits (shift 16), then 16 -> 8 bits (shift
            // FRACTION_BITS), both rounding and saturating; result in v15.8b.
602            uqrshrn     v14.4h, v14.4s, #16
603            uqrshrn2    v14.8h, v15.4s, #16
604            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
605
            // Slide the window along by one register.
606            mov         v4.16b, v5.16b
607            mov         v5.16b, v6.16b
608            mov         v6.16b, v7.16b
609            mov         v7.16b, v8.16b
610            mov         v8.16b, v9.16b
611            mov         v9.16b, v10.16b
612            mov         v10.16b, v11.16b
613.endm/*}}}*/
614
// hconv4_12: horizontal pass for radii 1..12 where each tap step is four
// 16-bit elements (half a register).  The centre samples are v4 and the
// samples after it come from v4-v10.  The samples before it are read back from
// the buffer at x9, where previous values of v4 are stored by the st1 at the
// end of this macro.
// Every buffer address has bit 9 cleared after it is formed, so addresses
// wrap.  NOTE(review): this presumably implements a 512-byte circular buffer
// whose base has bit 9 clear -- confirm against the code that sets up x9.
615.macro hconv4_12/*{{{*/
            // Centre tap: v14/v15 = v4 * v0.h[0] as 32-bit sums.
616            umull       v14.4s, v4.4h, v0.h[0]
617            umull2      v15.4s, v4.8h, v0.h[0]
618
            // Jump to label 1NN for radius NN = x5.  The table base is biased by
            // -8 so that x5 = 1 selects the first entry; x5 is not range-checked.
619            adr         x12, 199f-8
620            ldr         x12, [x12, x5, LSL #3]
621            br          x12
622   199:     .xword 101f
623            .xword 102f
624            .xword 103f
625            .xword 104f
626            .xword 105f
627            .xword 106f
628            .xword 107f
629            .xword 108f
630            .xword 109f
631            .xword 110f
632            .xword 111f
633            .xword 112f
634            .align 4
            // Taps run from the outermost (12) to the innermost (1).  For tap N
            // the earlier samples are loaded from x9 + 0x200 - 8*N (wrapped).
            // Even taps load one full register.  Odd taps load two halves and
            // wrap the pointer between them, since the pair may straddle the
            // end of the buffer.
635    112:    add         x12, x9, #0x1a0
636            bic         x12, x12, #0x200
637            ld1         {v12.8h}, [x12]
638            umlal       v14.4s, v12.4h, v1.h[4]
639            umlal2      v15.4s, v12.8h, v1.h[4]
640            umlal       v14.4s, v10.4h, v1.h[4]
641            umlal2      v15.4s, v10.8h, v1.h[4]
642    111:    add         x12, x9, #0x1a8
643            bic         x12, x12, #0x200
644            ld1         {v12.4h}, [x12], #8
645            bic         x12, x12, #0x200
646            ld1         {v13.4h}, [x12]
647            umlal       v14.4s, v12.4h, v1.h[3]
648            umlal       v15.4s, v13.4h, v1.h[3]
649            umlal2      v14.4s, v9.8h, v1.h[3]
650            umlal       v15.4s, v10.4h, v1.h[3]
651    110:    add         x12, x9, #0x1b0
652            bic         x12, x12, #0x200
653            ld1         {v12.8h}, [x12]
654            umlal       v14.4s, v12.4h, v1.h[2]
655            umlal2      v15.4s, v12.8h, v1.h[2]
656            umlal       v14.4s, v9.4h, v1.h[2]
657            umlal2      v15.4s, v9.8h, v1.h[2]
658    109:    add         x12, x9, #0x1b8
659            bic         x12, x12, #0x200
660            ld1         {v12.4h}, [x12], #8
661            bic         x12, x12, #0x200
662            ld1         {v13.4h}, [x12]
663            umlal       v14.4s, v12.4h, v1.h[1]
664            umlal       v15.4s, v13.4h, v1.h[1]
665            umlal2      v14.4s, v8.8h, v1.h[1]
666            umlal       v15.4s, v9.4h, v1.h[1]
667    108:    add         x12, x9, #0x1c0
668            bic         x12, x12, #0x200
669            ld1         {v12.8h}, [x12]
670            umlal       v14.4s, v12.4h, v1.h[0]
671            umlal2      v15.4s, v12.8h, v1.h[0]
672            umlal       v14.4s, v8.4h, v1.h[0]
673            umlal2      v15.4s, v8.8h, v1.h[0]
674    107:    add         x12, x9, #0x1c8
675            bic         x12, x12, #0x200
676            ld1         {v12.4h}, [x12], #8
677            bic         x12, x12, #0x200
678            ld1         {v13.4h}, [x12]
679            umlal       v14.4s, v12.4h, v0.h[7]
680            umlal       v15.4s, v13.4h, v0.h[7]
681            umlal2      v14.4s, v7.8h, v0.h[7]
682            umlal       v15.4s, v8.4h, v0.h[7]
683    106:    add         x12, x9, #0x1d0
684            bic         x12, x12, #0x200
685            ld1         {v12.8h}, [x12]
686            umlal       v14.4s, v12.4h, v0.h[6]
687            umlal2      v15.4s, v12.8h, v0.h[6]
688            umlal       v14.4s, v7.4h, v0.h[6]
689            umlal2      v15.4s, v7.8h, v0.h[6]
690    105:    add         x12, x9, #0x1d8
691            bic         x12, x12, #0x200
692            ld1         {v12.4h}, [x12], #8
693            bic         x12, x12, #0x200
694            ld1         {v13.4h}, [x12]
695            umlal       v14.4s, v12.4h, v0.h[5]
696            umlal       v15.4s, v13.4h, v0.h[5]
697            umlal2      v14.4s, v6.8h, v0.h[5]
698            umlal       v15.4s, v7.4h, v0.h[5]
699    104:    add         x12, x9, #0x1e0
700            bic         x12, x12, #0x200
701            ld1         {v12.8h}, [x12]
702            umlal       v14.4s, v12.4h, v0.h[4]
703            umlal2      v15.4s, v12.8h, v0.h[4]
704            umlal       v14.4s, v6.4h, v0.h[4]
705            umlal2      v15.4s, v6.8h, v0.h[4]
706    103:    add         x12, x9, #0x1e8
707            bic         x12, x12, #0x200
708            ld1         {v12.4h}, [x12], #8
709            bic         x12, x12, #0x200
710            ld1         {v13.4h}, [x12]
711            umlal       v14.4s, v12.4h, v0.h[3]
712            umlal       v15.4s, v13.4h, v0.h[3]
713            umlal2      v14.4s, v5.8h, v0.h[3]
714            umlal       v15.4s, v6.4h, v0.h[3]
715    102:    add         x12, x9, #0x1f0
716            bic         x12, x12, #0x200
717            ld1         {v12.8h}, [x12]
718            umlal       v14.4s, v12.4h, v0.h[2]
719            umlal2      v15.4s, v12.8h, v0.h[2]
720            umlal       v14.4s, v5.4h, v0.h[2]
721            umlal2      v15.4s, v5.8h, v0.h[2]
            // Tap 1 needs only the last half-register from the buffer; the rest
            // of the earlier samples are the low half of v4 itself.
722    101:    add         x12, x9, #0x1f8
723            bic         x12, x12, #0x200
724            ld1         {v12.4h}, [x12]
725            umlal       v14.4s, v12.4h, v0.h[1]
726            umlal       v15.4s, v4.4h,  v0.h[1]
727            umlal2      v14.4s, v4.8h,  v0.h[1]
728            umlal       v15.4s, v5.4h, v0.h[1]
729
            // Narrow 32 -> 16 bits (shift 16), then 16 -> 8 bits (shift
            // FRACTION_BITS), both rounding and saturating; result in v15.8b.
730            uqrshrn     v14.4h, v14.4s, #16
731            uqrshrn2    v14.8h, v15.4s, #16
732            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
733
            // Append the outgoing v4 to the buffer, advance x9 by 16 with the
            // same bit-9 wrap, then slide the register window along by one.
734            st1         {v4.16b}, [x9], #16
735            bic         x9, x9, #0x200
736            mov         v4.16b, v5.16b
737            mov         v5.16b, v6.16b
738            mov         v6.16b, v7.16b
739            mov         v7.16b, v8.16b
740            mov         v8.16b, v9.16b
741            mov         v9.16b, v10.16b
742            mov         v10.16b, v11.16b
743.endm/*}}}*/
744
/* hconv4_25: horizontal convolution core for the four-channel path, unrolled
 * for radii up to 25.  One expansion accumulates two groups of four 32-bit
 * sums (v14 for the first group, v15 for the second) and leaves the narrowed
 * eight-byte result in v15.8b.
 *
 * The taps are laid out from the outermost (label 125) to the innermost
 * (label 101).  The table at label 199 is indexed by x5, so execution enters
 * at the tap matching x5 and falls through every smaller tap.
 *
 * Window samples that are no longer held in v4-v10 are read back from the
 * buffer addressed by x9.  Every computed address has bit 9 cleared
 * (`bic ..., #0x200`) before it is used, which wraps the access.
 *
 * Reads:    x5, x9, v0-v3 (per-tap multipliers), v4-v11
 * Modifies: x9, x12, v4-v10, v12-v17
 */
745.macro hconv4_25/*{{{*/
            /* Centre tap: multiply (rather than accumulate) to initialise the
             * two accumulators.
             */
746            add         x12, x9, #0x198
747            bic         x12, x12, #0x200
748            ld1         {v12.4h}, [x12], #8
749            bic         x12, x12, #0x200
750            ld1         {v13.4h}, [x12]
751            umull       v14.4s, v12.4h, v0.h[0]
752            umull       v15.4s, v13.4h, v0.h[0]
753
            /* Load entry (x5 - 1) of the table at 199 and branch to it. */
754            adr         x12, 199f-8
755            ldr         x12, [x12, x5, LSL #3]
756            br          x12
757   199:     .xword 101f
758            .xword 102f
759            .xword 103f
760            .xword 104f
761            .xword 105f
762            .xword 106f
763            .xword 107f
764            .xword 108f
765            .xword 109f
766            .xword 110f
767            .xword 111f
768            .xword 112f
769            .xword 113f
770            .xword 114f
771            .xword 115f
772            .xword 116f
773            .xword 117f
774            .xword 118f
775            .xword 119f
776            .xword 120f
777            .xword 121f
778            .xword 122f
779            .xword 123f
780            .xword 124f
781            .xword 125f
782            .align 4
            /* Taps 25..13: one side of each tap is loaded from the buffer, the
             * other side is taken directly from v4-v10.
             */
783    125:    add         x12, x9, #0x0d0
784            bic         x12, x12, #0x200
785            ld1         {v12.8h}, [x12]
786            umlal       v14.4s, v12.4h, v3.h[1]
787            umlal2      v15.4s, v12.8h, v3.h[1]
788            umlal       v14.4s, v10.4h, v3.h[1]
789            umlal2      v15.4s, v10.8h, v3.h[1]
790    124:    add         x12, x9, #0x0d8
791            bic         x12, x12, #0x200
792            ld1         {v12.4h}, [x12], #8
793            bic         x12, x12, #0x200
794            ld1         {v13.4h}, [x12]
795            umlal       v14.4s, v12.4h, v3.h[0]
796            umlal       v15.4s, v13.4h, v3.h[0]
797            umlal2      v14.4s, v9.8h, v3.h[0]
798            umlal       v15.4s, v10.4h, v3.h[0]
799    123:    add         x12, x9, #0x0e0
800            bic         x12, x12, #0x200
801            ld1         {v12.8h}, [x12]
802            umlal       v14.4s, v12.4h, v2.h[7]
803            umlal2      v15.4s, v12.8h, v2.h[7]
804            umlal       v14.4s, v9.4h, v2.h[7]
805            umlal2      v15.4s, v9.8h, v2.h[7]
806    122:    add         x12, x9, #0x0e8
807            bic         x12, x12, #0x200
808            ld1         {v12.4h}, [x12], #8
809            bic         x12, x12, #0x200
810            ld1         {v13.4h}, [x12]
811            umlal       v14.4s, v12.4h, v2.h[6]
812            umlal       v15.4s, v13.4h, v2.h[6]
813            umlal2      v14.4s, v8.8h, v2.h[6]
814            umlal       v15.4s, v9.4h, v2.h[6]
815    121:    add         x12, x9, #0x0f0
816            bic         x12, x12, #0x200
817            ld1         {v12.8h}, [x12]
818            umlal       v14.4s, v12.4h, v2.h[5]
819            umlal2      v15.4s, v12.8h, v2.h[5]
820            umlal       v14.4s, v8.4h, v2.h[5]
821            umlal2      v15.4s, v8.8h, v2.h[5]
822    120:    add         x12, x9, #0x0f8
823            bic         x12, x12, #0x200
824            ld1         {v12.4h}, [x12], #8
825            bic         x12, x12, #0x200
826            ld1         {v13.4h}, [x12]
827            umlal       v14.4s, v12.4h, v2.h[4]
828            umlal       v15.4s, v13.4h, v2.h[4]
829            umlal2      v14.4s, v7.8h, v2.h[4]
830            umlal       v15.4s, v8.4h, v2.h[4]
831    119:    add         x12, x9, #0x100
832            bic         x12, x12, #0x200
833            ld1         {v12.8h}, [x12]
834            umlal       v14.4s, v12.4h, v2.h[3]
835            umlal2      v15.4s, v12.8h, v2.h[3]
836            umlal       v14.4s, v7.4h, v2.h[3]
837            umlal2      v15.4s, v7.8h, v2.h[3]
838    118:    add         x12, x9, #0x108
839            bic         x12, x12, #0x200
840            ld1         {v12.4h}, [x12], #8
841            bic         x12, x12, #0x200
842            ld1         {v13.4h}, [x12]
843            umlal       v14.4s, v12.4h, v2.h[2]
844            umlal       v15.4s, v13.4h, v2.h[2]
845            umlal2      v14.4s, v6.8h, v2.h[2]
846            umlal       v15.4s, v7.4h, v2.h[2]
847    117:    add         x12, x9, #0x110
848            bic         x12, x12, #0x200
849            ld1         {v12.8h}, [x12]
850            umlal       v14.4s, v12.4h, v2.h[1]
851            umlal2      v15.4s, v12.8h, v2.h[1]
852            umlal       v14.4s, v6.4h, v2.h[1]
853            umlal2      v15.4s, v6.8h, v2.h[1]
854    116:    add         x12, x9, #0x118
855            bic         x12, x12, #0x200
856            ld1         {v12.4h}, [x12], #8
857            bic         x12, x12, #0x200
858            ld1         {v13.4h}, [x12]
859            umlal       v14.4s, v12.4h, v2.h[0]
860            umlal       v15.4s, v13.4h, v2.h[0]
861            umlal2      v14.4s, v5.8h, v2.h[0]
862            umlal       v15.4s, v6.4h, v2.h[0]
863    115:    add         x12, x9, #0x120
864            bic         x12, x12, #0x200
865            ld1         {v12.8h}, [x12]
866            umlal       v14.4s, v12.4h, v1.h[7]
867            umlal2      v15.4s, v12.8h, v1.h[7]
868            umlal       v14.4s, v5.4h, v1.h[7]
869            umlal2      v15.4s, v5.8h, v1.h[7]
870    114:    add         x12, x9, #0x128
871            bic         x12, x12, #0x200
872            ld1         {v12.4h}, [x12], #8
873            bic         x12, x12, #0x200
874            ld1         {v13.4h}, [x12]
875            umlal       v14.4s, v12.4h, v1.h[6]
876            umlal       v15.4s, v13.4h, v1.h[6]
877            umlal2      v14.4s, v4.8h,  v1.h[6]
878            umlal       v15.4s, v5.4h, v1.h[6]
879    113:    add         x12, x9, #0x130
880            bic         x12, x12, #0x200
881            ld1         {v12.8h}, [x12]
882            umlal       v14.4s, v12.4h, v1.h[5]
883            umlal2      v15.4s, v12.8h, v1.h[5]
884            umlal       v14.4s, v4.4h,  v1.h[5]
885            umlal2      v15.4s, v4.8h,  v1.h[5]
            /* Taps 12..1: both sides of each tap are loaded from the buffer (the
             * far-side loads are the indented ones).  Tap 12 is the transition:
             * its last term still comes from v4.
             */
886    112:    add         x12, x9, #0x138
887            bic         x12, x12, #0x200
888            ld1         {v12.4h}, [x12], #8
889            bic         x12, x12, #0x200
890            ld1         {v16.4h}, [x12]
891                                            add         x12, x9, #0x1f8
892                                            bic         x12, x12, #0x200
893                                            ld1         {v13.4h}, [x12]
894            umlal       v14.4s, v12.4h, v1.h[4]
895            umlal       v15.4s, v16.4h, v1.h[4]
896            umlal       v14.4s, v13.4h, v1.h[4]   // Could be d7, without the load, right?
897            umlal       v15.4s, v4.4h,  v1.h[4]
898    111:    add         x12, x9, #0x140
899            bic         x12, x12, #0x200
900            ld1         {v12.8h}, [x12]
901                                            add         x12, x9, #0x1f0
902                                            bic         x12, x12, #0x200
903                                            ld1         {v13.8h}, [x12]
904            umlal       v14.4s, v12.4h, v1.h[3]
905            umlal2      v15.4s, v12.8h, v1.h[3]
906            umlal       v14.4s, v13.4h, v1.h[3]
907            umlal2      v15.4s, v13.8h, v1.h[3]
908    110:    add         x12, x9, #0x148
909            bic         x12, x12, #0x200
910            ld1         {v12.4h}, [x12], #8
911            bic         x12, x12, #0x200
912            ld1         {v16.4h}, [x12]
913                                            add         x12, x9, #0x1e8
914                                            bic         x12, x12, #0x200
915                                            ld1         {v13.4h}, [x12], #8
916                                            bic         x12, x12, #0x200
917                                            ld1         {v17.4h}, [x12]
918            umlal       v14.4s, v12.4h, v1.h[2]
919            umlal       v15.4s, v16.4h, v1.h[2]
920            umlal       v14.4s, v13.4h, v1.h[2]
921            umlal       v15.4s, v17.4h, v1.h[2]
922    109:    add         x12, x9, #0x150
923            bic         x12, x12, #0x200
924            ld1         {v12.8h}, [x12]
925                                            add         x12, x9, #0x1e0
926                                            bic         x12, x12, #0x200
927                                            ld1         {v13.8h}, [x12]
928            umlal       v14.4s, v12.4h, v1.h[1]
929            umlal2      v15.4s, v12.8h, v1.h[1]
930            umlal       v14.4s, v13.4h, v1.h[1]
931            umlal2      v15.4s, v13.8h, v1.h[1]
932    108:    add         x12, x9, #0x158
933            bic         x12, x12, #0x200
934            ld1         {v12.4h}, [x12], #8
935            bic         x12, x12, #0x200
936            ld1         {v16.4h}, [x12]
937                                            add         x12, x9, #0x1d8
938                                            bic         x12, x12, #0x200
939                                            ld1         {v13.4h}, [x12], #8
940                                            bic         x12, x12, #0x200
941                                            ld1         {v17.4h}, [x12]
942            umlal       v14.4s, v12.4h, v1.h[0]
943            umlal       v15.4s, v16.4h, v1.h[0]
944            umlal       v14.4s, v13.4h, v1.h[0]
945            umlal       v15.4s, v17.4h, v1.h[0]
946    107:    add         x12, x9, #0x160
947            bic         x12, x12, #0x200
948            ld1         {v12.8h}, [x12]
949                                            add         x12, x9, #0x1d0
950                                            bic         x12, x12, #0x200
951                                            ld1         {v13.8h}, [x12]
952            umlal       v14.4s, v12.4h, v0.h[7]
953            umlal2      v15.4s, v12.8h, v0.h[7]
954            umlal       v14.4s, v13.4h, v0.h[7]
955            umlal2      v15.4s, v13.8h, v0.h[7]
956    106:    add         x12, x9, #0x168
957            bic         x12, x12, #0x200
958            ld1         {v12.4h}, [x12], #8
959            bic         x12, x12, #0x200
960            ld1         {v16.4h}, [x12]
961                                            add         x12, x9, #0x1c8
962                                            bic         x12, x12, #0x200
963                                            ld1         {v13.4h}, [x12], #8
964                                            bic         x12, x12, #0x200
965                                            ld1         {v17.4h}, [x12]
966            umlal       v14.4s, v12.4h, v0.h[6]
967            umlal       v15.4s, v16.4h, v0.h[6]
968            umlal       v14.4s, v13.4h, v0.h[6]
969            umlal       v15.4s, v17.4h, v0.h[6]
970    105:    add         x12, x9, #0x170
971            bic         x12, x12, #0x200
972            ld1         {v12.8h}, [x12]
973                                            add         x12, x9, #0x1c0
974                                            bic         x12, x12, #0x200
975                                            ld1         {v13.8h}, [x12]
976            umlal       v14.4s, v12.4h, v0.h[5]
977            umlal2      v15.4s, v12.8h, v0.h[5]
978            umlal       v14.4s, v13.4h, v0.h[5]
979            umlal2      v15.4s, v13.8h, v0.h[5]
980    104:    add         x12, x9, #0x178
981            bic         x12, x12, #0x200
982            ld1         {v12.4h}, [x12], #8
983            bic         x12, x12, #0x200
984            ld1         {v16.4h}, [x12]
985                                            add         x12, x9, #0x1b8
986                                            bic         x12, x12, #0x200
987                                            ld1         {v13.4h}, [x12], #8
988                                            bic         x12, x12, #0x200
989                                            ld1         {v17.4h}, [x12]
990            umlal       v14.4s, v12.4h, v0.h[4]
991            umlal       v15.4s, v16.4h, v0.h[4]
992            umlal       v14.4s, v13.4h, v0.h[4]
993            umlal       v15.4s, v17.4h, v0.h[4]
994    103:    add         x12, x9, #0x180
995            bic         x12, x12, #0x200
996            ld1         {v12.8h}, [x12]
997                                            add         x12, x9, #0x1b0
998                                            bic         x12, x12, #0x200
999                                            ld1         {v13.8h}, [x12]
1000            umlal       v14.4s, v12.4h, v0.h[3]
1001            umlal2      v15.4s, v12.8h, v0.h[3]
1002            umlal       v14.4s, v13.4h, v0.h[3]
1003            umlal2      v15.4s, v13.8h, v0.h[3]
1004    102:    add         x12, x9, #0x188
1005            bic         x12, x12, #0x200
1006            ld1         {v12.4h}, [x12], #8
1007            bic         x12, x12, #0x200
1008            ld1         {v16.4h}, [x12]
1009                                            add         x12, x9, #0x1a8
1010                                            bic         x12, x12, #0x200
1011                                            ld1         {v13.4h}, [x12], #8
1012                                            bic         x12, x12, #0x200
1013                                            ld1         {v17.4h}, [x12]
1014            umlal       v14.4s, v12.4h, v0.h[2]
1015            umlal       v15.4s, v16.4h, v0.h[2]
1016            umlal       v14.4s, v13.4h, v0.h[2]
1017            umlal       v15.4s, v17.4h, v0.h[2]
1018    101:    add         x12, x9, #0x190
1019            bic         x12, x12, #0x200
1020            ld1         {v12.8h}, [x12], #16
1021            bic         x12, x12, #0x200
1022            ld1         {v13.8h}, [x12]
1023            umlal       v14.4s, v12.4h, v0.h[1]
1024            umlal2      v15.4s, v12.8h, v0.h[1]
1025            umlal       v14.4s, v13.4h, v0.h[1]
1026            umlal2      v15.4s, v13.8h, v0.h[1]
1027
            /* Narrow the 32-bit sums to 16 bits (rounding, saturating shift by
             * 16), pack both halves into v14, then narrow to bytes with a
             * further shift of FRACTION_BITS.
             */
1028            uqrshrn     v14.4h, v14.4s, #16
1029            uqrshrn2    v14.8h, v15.4s, #16
1030            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
1031
            /* Write v4 out to the buffer, advance x9 by 16 (clearing bit 9 to
             * wrap), and move each of v5-v11 down one register.
             */
1032            st1         {v4.16b}, [x9], #16
1033            bic         x9, x9, #0x200
1034            mov         v4.16b, v5.16b
1035            mov         v5.16b, v6.16b
1036            mov         v6.16b, v7.16b
1037            mov         v7.16b, v8.16b
1038            mov         v8.16b, v9.16b
1039            mov         v9.16b, v10.16b
1040            mov         v10.16b, v11.16b
1041.endm/*}}}*/
1042
1043/* Dedicated function wrapper for the fetch macro, for the cases where
1044 * performance isn't that important, to keep code size down.
1045 */
1046ENTRY(fetch_generic_asm)
1047            stp         x10, x11, [sp, #-16]!   // keep the caller's x10/x11 across the expansion below
1048            fetch                               // out-of-line copy of the fetch macro, expanded with its default arguments
1049            ldp         x10, x11, [sp], #16     // restore x10/x11 and release the 16-byte slot
1050            ret
1051END(fetch_generic_asm)
1052
1053/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value
1054 * across to fill the rest of the register pair.  Used for filling the right
1055 * hand edge of the window when starting too close to the right hand edge of
1056 * the image.
1057 */
1058ENTRY(prefetch_clamp1)
            /* In/out: v10, v11 (window data), x1, x15, x16 (source pointers).
             * Out:    v12 = the pad element broadcast to all eight 16-bit lanes.
             * x11 is negated on entry and negated back before returning.
             * Scratch: v13 (lane mask).
             */
1059            sub         x11, xzr, x11           // work with -x11; the tbz tests below look at its low four bits
1060            sub         x15, x15, x1            // hold x15/x16 as offsets from x1 while x1 is adjusted
1061            sub         x16, x16, x1
1062            tbz         x11, #3, 1f
1063            mov         v11.16b, v10.16b        // bit 3 set: operate on the v10 data instead
1064            sub         x1, x1, #16
10651:          mov         v12.16b, v11.16b
1066            movi        v13.8b, #0xff           // mask seed: 0xff in each of the low eight bytes
1067            tbz         x11, #2, 1f
1068            ext         v12.16b, v12.16b, v12.16b, #4*2   // rotate v12 by four 16-bit lanes
1069            sub         x1, x1, #8
1070            shl         v13.2d, v13.2d, #32     // clear four bytes at the low end of the mask
10711:          tbz         x11, #1, 1f
1072            ext         v12.16b, v12.16b, v12.16b, #6*2   // rotate v12 by six 16-bit lanes
1073            sub         x1, x1, #4
1074            shl         v13.2d, v13.2d, #16     // clear two more mask bytes
10751:          tbz         x11, #0, 1f
1076            ext         v12.16b, v12.16b, v12.16b, #7*2   // rotate v12 by seven 16-bit lanes
1077            sub         x1, x1, #2
1078            shl         v13.2d, v13.2d, #8      // clear one more mask byte
10791:          dup         v12.8h, v12.h[6]        // broadcast lane 6 of the rotated copy
1080            sxtl        v13.8h, v13.8b          // widen each mask byte to a full 16-bit lane (0x0000 or 0xffff)
1081            bif         v11.16b, v12.16b, v13.16b   // lanes whose mask is zero take the broadcast value
10821:          tbz         x11, #3, 1f
1083            mov         v10.16b, v11.16b        // bit 3 set: patched data goes back to v10, v11 becomes all pad
1084            mov         v11.16b, v12.16b
10851:          sub         x11, xzr, x11           // undo the negation from entry
1086            add         x15, x15, x1            // rebase x15/x16 on the adjusted x1
1087            add         x16, x16, x1
1088            ret
1089END(prefetch_clamp1)
1090
/* Counterpart of prefetch_clamp1 that works in 64-bit units (one .d lane at a
 * time) rather than 16-bit lanes: only bits 3 and 2 of the negated x11 are
 * examined.  v12 is left holding the replicated pad value.
 */
1091ENTRY(prefetch_clamp4)
1092            sub         x11, xzr, x11           // work with -x11; negated back before returning
1093            sub         x15, x15, x1            // hold x15/x16 as offsets from x1 while x1 is adjusted
1094            sub         x16, x16, x1
1095            tbz         x11, #3, 1f
1096            sub         x1, x1, #16     // what's this?
1097            mov         v11.16b, v10.16b        // bit 3 set: operate on the v10 data instead
10981:          dup         v12.2d, v11.d[1]        // default pad: upper 64 bits of v11
1099            tbz         x11, #2, 1f
1100            dup         v12.2d, v11.d[0]        // bit 2 set: pad with the lower 64 bits instead,
1101            sub         x1, x1, #8
1102            dup         v11.2d, v11.d[0]        // and overwrite the upper half of v11 with it
11031:          tbz         x11, #3, 1f
1104            mov         v10.16b, v11.16b        // bit 3 set: patched data goes back to v10, v11 becomes all pad
1105            mov         v11.16b, v12.16b
11061:          sub         x11, xzr, x11           // undo the negation from entry
1107            add         x15, x15, x1            // rebase x15/x16 on the adjusted x1
1108            add         x16, x16, x1
1109            ret
1110END(prefetch_clamp4)
1111
1112
1113/* Helpers for prefetch, below.
1114 */
/* prefetch_out: place one 32-byte chunk of window data, in one of three ways
 * selected by \store:
 *   \store > 0   write \qsa and \qsb to the buffer at x9 (x9 += 32)
 *   \store == 0  copy \qsa and \qsb into the registers \qa and \qb
 *   \store < 0   insert the single element \qsb_hi into \qb
 */
1115.macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi
1116  .if \store > 0
1117    .ifc \qsa,\qsb
            /* Same register named twice: emit two single-register stores
             * rather than a two-register list.
             */
1118            st1         {\qsa}, [x9], #16
1119            st1         {\qsb}, [x9], #16
1120    .else
1121            st1         {\qsa,\qsb}, [x9], #32
1122    .endif
1123  .elseif \store == 0
1124            mov         \qa, \qsa
1125            mov         \qb, \qsb
1126  .else
1127            ins         \qb, \qsb_hi
1128  .endif
1129.endm
1130
/* prefetch_one: emit the code that fills one chunk of the window.  Nothing is
 * emitted when the chunk lies outside the window (i < 0, where
 * i = need - 16 - \rem and `need` is set by the enclosing prefetch macro).
 *
 * Successive expansions are chained through the numeric labels: each `Nf`
 * branch lands on label N of the next expansion, so execution stays in the
 * same case from chunk to chunk until its compare fails:
 *   1: x10 >= i+16          -> emit the left-hand pad value (v9)
 *   2: x11 >  i+16          -> emit fetched data (v10/v11), then fetch more
 *   3: otherwise            -> call prefetch_clamp\step, emit v10/v11
 *   4: (after clamping)     -> emit the right-hand pad value (v12)
 * The final expansion (\rem == 0) defines all four labels as the common exit.
 */
1131.macro prefetch_one  qa, qb, rem, c, store=0, step=1
1132.set i, (need - 16) - \rem
1133.if i >= 0
11341:          cmp         x10, #i+16
1135            blo         2f
1136            prefetch_out \qa, \qb, \store, v9.16b, v9.16b, v9.d[1]
1137            b           1f
11382:          cmp         x11, #i+16
1139            bls         3f
1140            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
1141            bl          fetch_generic_asm
1142            b           2f
11433:          bl          prefetch_clamp\step
1144            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
            /* `4f+4` targets the instruction after the next label 4, i.e. it
             * skips that expansion's own `b 4f+4` (or the final nop) and lands
             * on its pad-value output below.
             */
11454:          b           4f+4
1146           //v12 contains pad word from prefetch_clamp call
1147            prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
1148  .if \rem > 0
1149            b           4f+4
1150  .else
11511:
11522:
11533:
11544:          nop
1155  .endif
1156.endif
1157.endm
1158
1159/* Fill the convolution window with context data.  The aim here is to load
1160 * exactly rlf + rrt columns, and in the main loop to read as many columns as
1161 * will be written.  This is complicated by the need to handle cases when the
1162 * input starts very close to the left or right (or both) edges of the image,
1163 * and where these do not fall on 16-byte boundaries.
1164 *
1165 * Input:
1166 *      x1 -- src
1167 *      x2 -- pitch
1168 *      x3 -- count
1169 *      x4 -- inlen
1170 *      x5 -- r
1171 *      x6 -- rup
1172 *      x7 -- rdn
1173 *      x8 -- rlf
1174 *      x9 -- buffer (if needed)
1175 *      x13 = -pitch
1176 *      x15 = top-row in
1177 *      x16 = bottom-row in
1178 * Output:
1179 *      x1 += rlf + min(count, rrt)
1180 * Modifies:
1181 *      x10 -- fill start index in the window
1182 *      x11 -- fill stop index in the window
1183 *      x12 -- scratch
1184 */
1185.macro prefetch step=1, max_r=25
            /* need = full window width (2 * max_r * step), rounded up to a
             * multiple of 16.
             */
1186.set need, ((\max_r + \max_r) * \step + 15) & ~15
            /* x10 (fill start) = need - max_r*step - x8*step */
1187  .if \step == 1
1188            mov         x10, #need - (\max_r * \step)
1189            sub         x10, x10, x8
1190  .else
1191            mov         x10, #need - (\max_r * \step)
1192            sub         x10, x10, x8, LSL #2
1193  .endif
            /* x11 (fill stop) = min(x10 + x4, need), unsigned */
1194            add         x11, x10, x4
1195            subs        x11, x11, #need
1196            csel        x11, xzr, x11, hi
1197            add         x11, x11, #need
1198
1199            bl          fetch_generic_asm
            /* v9 = first fetched element replicated; used as left-hand padding. */
1200  .if \step == 1
1201            dup         v9.8h, v10.h[0]
1202  .else
1203            dup         v9.2d, v10.d[0]
1204  .endif
            /* If the fill start is not a multiple of 16, shift the fetched
             * v10:v11 pair towards the high end, pulling copies of the pad
             * value (v9) in at the low end.  The amount comes from the low
             * bits of x10 (bit 3) and of -x10 (bits 2..0, held in x12).
             */
1205            tst         x10, #15
1206            beq         2f
1207            sub         x12, xzr, x10
1208            tbz         x10, #3, 1f
1209            mov         v11.16b, v10.16b
1210            mov         v10.16b, v9.16b
12111:          tbz         x12, #2, 1f
1212            ext         v11.16b, v10.16b, v11.16b, #4*2
1213            ext         v10.16b, v9.16b, v10.16b, #4*2
1214  .if \step == 1
1215  1:        tbz         x12, #1, 1f
1216            ext         v11.16b, v10.16b, v11.16b, #2*2
1217            ext         v10.16b, v9.16b, v10.16b, #2*2
1218  1:        tbz         x12, #0, 1f
1219            ext         v11.16b, v10.16b, v11.16b, #1*2
1220            ext         v10.16b, v9.16b, v10.16b, #1*2
1221  .endif
            /* Net effect of the next seven instructions: move x1, x15 and x16
             * back by (x10 & 15) and round x10 down to a multiple of 16.
             */
12221:          sub         x1, x1, x10
1223            sub         x15, x15, x10
1224            sub         x16, x16, x10
1225            bic         x10, x10, #15
1226            add         x1, x1, x10
1227            add         x15, x15, x10
1228            add         x16, x16, x10
12292:
1230  .if \step > 1
1231            /* it's only in the uchar2 and uchar4 cases where the register file
1232             * is insufficient (given MAX_R <= 25).
1233             */
1234            prefetch_one xx, xx, 192, c=\max_r, step=\step, store=1
1235            prefetch_one xx, xx, 176, c=\max_r, step=\step, store=1
1236            prefetch_one xx, xx, 160, c=\max_r, step=\step, store=1
1237            prefetch_one xx, xx, 144, c=\max_r, step=\step, store=1
1238            prefetch_one xx, xx, 128, c=\max_r, step=\step, store=1
1239            prefetch_one xx, xx, 112, c=\max_r, step=\step, store=1
1240            prefetch_one xx, xx,  96, c=\max_r, step=\step, store=1
1241            prefetch_one xx, xx,  80, c=\max_r, step=\step, store=1
1242            prefetch_one xx, xx,  64, c=\max_r, step=\step, store=1
1243            prefetch_one xx, xx,  48, c=\max_r, step=\step, store=1
1244  .else
1245            /* q3 normally contains the coefficient table, but it's not fully
1246             * used.  In the uchar1, r=25 case the other half of q3 is used for
1247             * the last two window taps to avoid falling out to memory.
1248             */
1249            prefetch_one xx,  v3.d[1], 48, c=\max_r, step=\step, store=-1
1250  .endif
            /* The last three chunks of the window stay in registers v4-v9. */
1251            prefetch_one v4.16b, v5.16b, 32, c=\max_r, step=\step, store=0
1252            prefetch_one v6.16b, v7.16b, 16, c=\max_r, step=\step, store=0
1253            prefetch_one v8.16b, v9.16b,  0, c=\max_r, step=\step, store=0
1254
            /* x4 = max(x4 - (x8*step + max_r*step), 0), unsigned */
1255  .if \step == 1
1256            add         x10, x8, #\max_r * \step
1257  .else
1258            lsl         x10, x8, #2
1259            add         x10, x10, #\max_r * \step
1260  .endif
1261            subs        x4, x4, x10
1262            csel        x4, xzr, x4, lo
1263.endm
1264
1265/* The main loop.
1266 *
1267 * Input:
1268 *      x0 = dst
1269 *      x1 = src
1270 *      x2 = pitch
1271 *      x3 = count
1272 *      x4 = inlen
1273 *      x5 = r
1274 *      x6 = rup
1275 *      x7 = rdn
1276 *      x9 = buffer
1277 *      x13 = -pitch
1278 *      x15 = top-row in
1279 *      x16 = bottom-row in
1280 * Modifies
1281 *      x8 = fetch code pointer
1282 */
/* mainloop: steady-state loop, right-hand edge handling and final partial
 * store.
 *
 * Macro arguments:
 *   core     -- macro expanded once per eight output bytes; it leaves its
 *               result in v15.8b
 *   step     -- 1 selects 16-bit-lane edge replication, any other value
 *               selects 64-bit-lane replication
 *   max_r    -- forwarded to the fetch macro
 *   labelc, labelnc -- labels forwarded to fetch; also the bases from which
 *               the fetch code pointer in x8 is computed
 *
 * Phases:
 *   3/5 (first pair)  while at least 16 bytes of input remain: fetch, then
 *                     two \core expansions storing 8 bytes each
 *   1                 fewer than 16 bytes remain: re-fetch the final 16
 *                     bytes and shift them into place, padding with the
 *                     last element
 *   3 (second)        drain: one \core per 8 output bytes while x3 allows
 *   4                 store the last 1..7 bytes
 */
1283.macro mainloop core, step=1, max_r=25, labelc="", labelnc=""
            /* x8 = labelnc - 40 * r */
1284            adrp        x8, \labelnc
1285            add         x8, x8, #:lo12:\labelnc
1286            sub         x8, x8, x5, LSL #5
1287            sub         x8, x8, x5, LSL #3
            /* Take the branch only when r == rup and r == rdn. */
1288            cmp         x5, x6
1289            ccmp        x5, x7, #0, eq
1290            beq         5f
1291
1292            /* if (r != rup || r != rdn) then the address-clamping table should
1293             * be used rather than the short-cut version.
1294             */
            /* x8 = labelc - 56 * r */
1295            adrp        x8, \labelc
1296            add         x8, x8, #:lo12:\labelc
1297            sub         x8, x8, x5, LSL #6
1298            add         x8, x8, x5, LSL #3
1299            b           5f
1300            .align  4
13013:          fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
1302
1303            /* For each call to fetch two are made to \core.  It would be
1304             * preferable to have twice the work done in \core.
1305             */
1306            \core
1307            st1         {v15.8b}, [x0], #8
1308            \core
1309            st1         {v15.8b}, [x0], #8
1310
1311            sub         x3, x3, #16
13125:          subs        x4, x4, #16
1313            bhs         3b
            /* Fewer than 16 input bytes left; x4 is restored to that remainder. */
1314            adds        x4, x4, #16
1315            bne         1f
            /* Nothing left to read: fill v10/v11 with the last element of v9. */
1316  .if \step==1
1317            dup         v10.8h, v9.h[7]
1318            dup         v11.8h, v9.h[7]
1319  .else
1320            dup         v10.2d, v9.d[1]
1321            dup         v11.2d, v9.d[1]
1322  .endif
1323            b           4f
1324
            /* Step the three source pointers back by (16 - x4) before the
             * final fetch.
             */
13251:          sub         x1, x1, #16
1326            sub         x15, x15, #16
1327            sub         x16, x16, #16
1328            add         x1, x1, x4
1329            add         x15, x15, x4
1330            add         x16, x16, x4
1331            bl          fetch_generic_asm
1332
            /* v12 = last fetched element replicated; it is shifted in at the
             * high end of v10:v11 below, by an amount taken from the low four
             * bits of -x4.
             */
1333  .if \step==1
1334            dup         v12.8h, v11.h[7]
1335  .else
1336            dup         v12.2d, v11.d[1]
1337  .endif
1338            sub         x4, xzr, x4
1339            tbz         x4, #3, 1f
1340            mov         v10.16b, v11.16b
1341            mov         v11.16b, v12.16b
13421:          tbz         x4, #2, 1f
1343            ext         v10.16b, v10.16b, v11.16b, #4*2
1344            ext         v11.16b, v11.16b, v12.16b, #4*2
13451:          tbz         x4, #1, 1f
1346            ext         v10.16b, v10.16b, v11.16b, #2*2
1347            ext         v11.16b, v11.16b, v12.16b, #2*2
13481:          tbz         x4, #0, 4f
1349            ext         v10.16b, v10.16b, v11.16b, #1*2
1350            ext         v11.16b, v11.16b, v12.16b, #1*2
            /* Drain loop: x3 counts the output bytes still to be written. */
13514:          cbz         x3, 5f
13523:          \core
1353  .if \step==1
1354            dup         v11.8h, v11.h[7]
1355  .else
1356            dup         v11.2d, v11.d[1]
1357  .endif
1358            subs        x3, x3, #8
1359            blo         4f
1360            st1         {v15.8b}, [x0], #8
1361            beq         5f
1362            b           3b
            /* Partial store of the last 1..7 bytes.  x3 is (remaining - 8)
             * here, so its low three bits equal the remaining byte count.
             * After each partial store the 8-byte result in v15 is rotated
             * down by exactly the number of bytes just written, so that the
             * next store picks up where the previous one stopped.  (The
             * result occupies only the low 64 bits of v15 and is one byte per
             * element, so the rotation is done on the .8b view and is not
             * scaled by two.)
             */
13634:          tbz         x3, #2, 1f
1364            st1         {v15.s}[0], [x0], #4
1365            ext         v15.8b, v15.8b, v15.8b, #4
13661:          tbz         x3, #1, 1f
1367            st1         {v15.h}[0], [x0], #2
1368            ext         v15.8b, v15.8b, v15.8b, #2
13691:          tbz         x3, #0, 5f
1370            st1         {v15.b}[0], [x0], #1
1371            ext         v15.8b, v15.8b, v15.8b, #1
13725:          nop
1373.endm
1374
/* Generate one single-channel convolution routine per radius in TUNED_LIST1,
 * plus the general radius-25 version.  Each expects the registers documented
 * for the prefetch and mainloop macros to be set up by the caller, and
 * returns through x30.
 */
1375.irep r, TUNED_LIST1, 25
1376ENTRY(convolve1_\r)
1377            stp         x29,x30, [sp, #-16]!    // the macros below use bl, so x30 must be preserved
1378
1379            prefetch    step=1, max_r=\r
1380
1381            mainloop    core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
1382
1383            ldp         x29,x30, [sp], #16
1384            ret
1385END(convolve1_\r)
1386.endr
1387
/* Generate one four-channel convolution routine per radius in TUNED_LIST4,
 * plus the general radius-25 version.  Unlike the single-channel routines
 * these carve an aligned scratch buffer out of the stack and pass it in x9.
 */
1388.irep r, TUNED_LIST4, 25
1389ENTRY(convolve4_\r)
1390            sub         x12, sp, #0x200         // x12 = sp - 0x200; also used to recover sp on exit
1391            bic         x9, x12, #0x3fc         // clear bits 2..9 to align the buffer base downwards
1392            mov         sp, x9
1393            stp         x12,x30, [sp, #-16]!    // save the unaligned value and the return address below the buffer
1394
1395            /* x9 now points to a buffer on the stack whose address has the low
1396             * 10 bits clear.  This allows easy address calculation in the
1397             * wrap-around cases.
1398             */
1399
1400
1401            prefetch    step=4, max_r=\r
1402
1403            mainloop    core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
1404
1405            ldp         x12,x30, [sp]
1406            add         sp, x12, #0x200         // undo the `sub` at entry: sp returns to its value on entry
1407            ret
1408END(convolve4_\r)
1409.endr
1410
1411/* void rsdIntrinsicBlurU1_K(
1412 *                  void *out,      // x0
1413 *                  void *in,       // x1
1414 *                  size_t w,       // x2
1415 *                  size_t h,       // x3
1416 *                  size_t p,       // x4
1417 *                  size_t x,       // x5
1418 *                  size_t y,       // x6
1419 *                  size_t count,   // x7
1420 *                  size_t r,       // [sp]
1421 *                  uint16_t *tab); // [sp,#8]
1422 */
1423ENTRY(rsdIntrinsicBlurU1_K)
            /* Frame: 80 bytes of general registers at the top, with 64 bytes
             * for the low halves of v8-v15 below them (144 bytes in total), so
             * the caller's stack arguments are at [sp,#144] and [sp,#152].
             */
1424            stp         x16,x30, [sp, #-80]!
1425            stp         x14,x15, [sp, #16]
1426            stp         x12,x13, [sp, #32]
1427            stp         x10,x11, [sp, #48]
1428            stp         x8,x9, [sp, #64]
1429            sub         x8, sp, #32
1430            sub         sp, sp, #64
1431            st1         {v8.1d - v11.1d}, [sp]
1432            st1         {v12.1d - v15.1d}, [x8]
            /* Rearrange the arguments into the registers the convolve
             * routines expect:
             *   x2 = pitch, x3 = count, x5 = r
             *   x6 = y, x7 = h - y - 1, x8 = x, x9 = w - x - count
             */
1433            mov         x8, x5        // x
1434            ldr         w5, [sp,#144] // r
1435            sub         x9, x2, x8
1436            sub         x10, x3, x6
1437            mov         x2, x4        // pitch
1438            mov         x3, x7        // count
1439            sub         x7, x10, #1
1440            sub         x9, x9, x3
1441
1442            ldr         x12, [sp, #152] // tab
1443
1444            add         x0, x0, x8
1445            add         x1, x1, x8
1446
            /* Clamp each of the four margins to at most r (unsigned min):
             * x6 = rup, x7 = rdn, x8 = rlf, x9 = right margin.
             */
1447            cmp         x6, x5
1448            csel        x6, x5, x6, hs
1449            cmp         x7, x5
1450            csel        x7, x5, x7, hs
1451            cmp         x8, x5
1452            csel        x8, x5, x8, hs
1453            cmp         x9, x5
1454            csel        x9, x5, x9, hs          // keep x9 itself when it is below r (not the left margin in x8)
1455
            /* x4 = inlen = left margin + right margin + count */
1456            add         x4, x8, x9
1457            add         x4, x4, x3
1458
1459            sub         x1, x1, x8              // start reading rlf columns to the left of x
1460
1461            sub         x13, xzr, x2            // x13 = -pitch
1462            msub        x15, x2, x6, x1         // x15 = x1 - pitch * rup
1463            madd        x16, x2, x7, x1         // x16 = x1 + pitch * rdn
1464
            /* v0-v3 = the first 32 halfwords of tab */
1465            ld1         {v0.8h,v1.8h}, [x12], #32
1466            ld1         {v2.8h,v3.8h}, [x12], #32
1467
            /* Branch (not call) to the smallest tuned routine whose radius is
             * at least r, falling back to the radius-25 routine; x30 is
             * pre-loaded so that the routine returns to label 1 below.
             */
1468            adr         x30, 1f
1469  .irep r, TUNED_LIST1
1470            cmp         x5, #\r
1471            bls         convolve1_\r
1472  .endr
1473            b           convolve1_25
1474
            /* Epilogue: pop the vector save area, then reload the general
             * registers from the same slots they were stored to on entry.
             */
14751:          ld1         {v8.1d - v11.1d}, [sp], #32
1476            ld1         {v12.1d - v15.1d}, [sp], #32
1477            ldp         x8,x9, [sp, #64]
1478            ldp         x10,x11, [sp, #48]
1479            ldp         x12,x13, [sp, #32]
1480            ldp         x14,x15, [sp, #16]
1481            ldp         x16,x30, [sp], #80      // matches the stp of x16,x30 at entry
1482            ret
1483END(rsdIntrinsicBlurU1_K)
1484
/* void rsdIntrinsicBlurU4_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 *
 * Entry point for the blur variant whose x and count arguments are scaled by
 * four (LSL #2) to form byte offsets.  It saves registers, rearranges the
 * arguments into the registers listed below, and branches to one of the
 * convolve4_<r> routines (defined elsewhere in this file) with x30 pointing at
 * the epilogue.
 *
 * Stack frame after the prologue (offsets from sp):
 *      [sp, #0]    d8-d11   (low 64 bits of v8-v11)
 *      [sp, #32]   d12-d15  (low 64 bits of v12-v15)
 *      [sp, #64]   x16, x30
 *      [sp, #80]   x14, x15
 *      [sp, #96]   x12, x13
 *      [sp, #112]  x10, x11
 *      [sp, #128]  x8, x9
 *      [sp, #144]  r    (caller's [sp])
 *      [sp, #152]  tab  (caller's [sp,#8])
 *
 * Register state at the branch into convolve4_<r>:
 *      x0  -- out + x * 4
 *      x1  -- in + (x - x8) * 4
 *      x2  -- pitch (p)
 *      x3  -- count * 4
 *      x4  -- (x8 + x9) * 4 + count * 4
 *      x5  -- r (only the low 32 bits of the stack slot are read)
 *      x6  -- min(y, r)
 *      x7  -- min(h - y - 1, r)
 *      x8  -- min(x, r)
 *      x9  -- min(w - x - count, r)
 *      x12 -- tab + 64 bytes (past the coefficients already loaded)
 *      x13 -- -pitch
 *      x15 -- x1 - pitch * x6
 *      x16 -- x1 + pitch * x7
 *      v0-v3 -- first 32 halfwords of tab
 *      x30 -- address of the epilogue (local label 1)
 */
ENTRY(rsdIntrinsicBlurU4_K)
            /* Push an 80-byte frame for x8-x16 and x30. */
            stp         x16,x30, [sp, #-80]!
            stp         x14,x15, [sp, #16]
            stp         x12,x13, [sp, #32]
            stp         x10,x11, [sp, #48]
            stp         x8,x9, [sp, #64]
            /* Reserve 64 more bytes and store the low halves of v8-v15;
             * x8 addresses the upper 32 bytes of that area (new sp + 32).
             */
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]

            mov         x8, x5        // x
            ldr         w5, [sp,#144] // r (80 + 64 bytes above sp; zero-extended)
            sub         x9, x2, x8    // w - x
            sub         x10, x3, x6   // h - y
            mov         x2, x4        // pitch
            mov         x3, x7        // count
            sub         x7, x10, #1   // h - y - 1
            sub         x9, x9, x3    // w - x - count

            ldr         x12, [sp, #152] // tab

            /* Advance both pointers by x pixels' worth of bytes. */
            add         x0, x0, x8, LSL #2
            add         x1, x1, x8, LSL #2

            /* Clamp each of the four distances to at most r (unsigned
             * compare: `hs` picks r when the distance is >= r).
             */
            cmp         x6, x5
            csel        x6, x5, x6, hs
            cmp         x7, x5
            csel        x7, x5, x7, hs
            cmp         x8, x5
            csel        x8, x5, x8, hs
            cmp         x9, x5
            csel        x9, x5, x9, hs

            /* x3 = count in bytes; x4 = clamped margins plus count, in bytes. */
            lsl         x3, x3, #2
            add         x4, x8, x9
            add         x4, x3, x4, LSL #2

            /* Step the source pointer back by the clamped x margin. */
            sub         x1, x1, x8, LSL #2

            sub         x13, xzr, x2        // -pitch
            msub        x15, x2, x6, x1     // x1 - pitch * x6
            madd        x16, x2, x7, x1     // x1 + pitch * x7

            /* Load 2 x 32 bytes of coefficients; x12 is post-incremented. */
            ld1         {v0.8h,v1.8h}, [x12], #32
            ld1         {v2.8h,v3.8h}, [x12], #32

            /* Point x30 at the epilogue, then branch to the first
             * convolve4_<r> in TUNED_LIST4 whose radius is >= x5 (`bls` is
             * unsigned lower-or-same); anything larger falls through to
             * convolve4_25.  NOTE(review): assumes TUNED_LIST4 is in ascending
             * order and that the convolve4_* routines return through x30 --
             * both are defined outside this block.
             */
            adr         x30, 1f
  .irep r, TUNED_LIST4
            cmp         x5, #\r
            bls         convolve4_\r
  .endr
            b           convolve4_25

            /* Epilogue: pop the vector save area, then the integer frame. */
1:          ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x8,x9, [sp, #64]
            ldp         x10,x11, [sp, #48]
            ldp         x12,x13, [sp, #32]
            ldp         x14,x15, [sp, #16]
            /* Reload x16 (not x12) from the slot written by the prologue's
             * `stp x16,x30`; x12 was already restored from [sp, #32] above.
             */
            ldp         x16,x30, [sp], #80
            ret
END(rsdIntrinsicBlurU4_K)
1559