1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
/* Symbol-definition helpers, expanded by the C preprocessor.
 *
 *   ENTRY(f)   -- switch to .text, align to 2^4 = 16 bytes, export f as a
 *                 global symbol of ELF type "function" and define its label.
 *   PRIVATE(f) -- as ENTRY, but without .globl, so f stays file-local.
 *   END(f)     -- record the size of f as the distance from its label to the
 *                 current location; place it directly after the last
 *                 instruction of f.
 *
 * The type is spelled %function rather than #function: '#' inside a
 * function-like macro body is the preprocessor's stringify operator and is
 * only tolerated here by the assembler-with-cpp mode.  .p2align is used in
 * place of .align because the meaning of the .align argument (bytes versus
 * power of two) varies between targets, whereas .p2align is always a power of
 * two.
 */
#define ENTRY(f) .text; .p2align 4; .globl f; .type f,%function; f:
#define PRIVATE(f) .text; .p2align 4; .type f,%function; f:
#define END(f) .size f, .-f;
20
/* FRACTION_BITS sets the shift amounts of the narrowing steps: the fetch macro
 * shifts its 32-bit sums right by 16 - FRACTION_BITS, and the hconv macros
 * shift by 16 and then by FRACTION_BITS.
 */
.equ FRACTION_BITS, 7

/* MAX_R is the default for the max_r argument of the fetch macro, i.e. the
 * largest tap index that macro unrolls when no explicit limit is given.
 */
.equ MAX_R, 25
23
24
/* Conditional-assembly prefix.  Everything following `ifcc` on the line is
 * assembled only while the assembler symbol `cc` is non-zero; otherwise the
 * line produces nothing.  Toggle it with `.set cc, 1` / `.set cc, 0` before
 * the lines it should affect.
 */
.macro ifcc insn:vararg
  .ifne cc
            \insn
  .endif
.endm
34
35/* Fetch 16 columns of bytes (regardless of image format), convolve these
36 * vertically, and leave them in the register file.  If working near the top or
37 * bottom of an image then clamp the addressing while loading the data in.
38 *
39 * The convolution is fully unrolled for windows up to max_r, with the
40 * outermost edges calculated first.  This way it's possible to branch directly
41 * into the relevant part of the code for an arbitrary convolution radius.  Two
42 * variants of the loop are produced; one eliminates the clamping code for a
43 * slight speed advantage.
44 *
45 * Where the macro is called with reg=x, the specified register is taken to
46 * contain a pre-calculated pointer into one of the two loops.
47 *
48 * Input:
49 *      x1 -- src
50 *      x2 -- pitch
51 *      x5 -- r
52 *      x6 -- rup
53 *      x7 -- rdn
54 *      x12 -- switch index
55 *      v0-v3 -- coefficient table
56 *      x13 = -pitch
57 *      x15 = top-row in
58 *      x19 = bottom-row in
59 * Output:
60 *      x1 += 16
61 *      v10,v11 -- 16 convolved columns
62 * Modifies:
63 *      x10 = upper row pointer
64 *      x11 = lower row pointer
65 *      v12-v15 = temporary sums
 *      v16 = temporary sum of an upper/lower row pair
 *      x15, x19 = each advanced by 16, in step with x1
 *      x12 = computed branch target (only when reg is left at its default)
66 */
67.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
            /* With the default reg (x12) the ifcc-prefixed lines up to the br
             * are assembled and compute the branch target here.  For any
             * other reg they are dropped and reg is branched to as supplied.
             */
68  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
69
            // Load the 16 bytes that get coefficient 0 and advance src.
70            ld1         {v15.16b}, [x1], #16
            // The upper-row walk starts at the top-row pointer.
71            mov         x10, x15
72
            // Widen the bytes to halfwords: columns 0-7 in v14, 8-15 in v15.
73            uxtl        v14.8h, v15.8b
74//            prfm        PLDL1KEEP,[x1, #16] // TODO: confirm
75            uxtl2       v15.8h, v15.16b
            // Form the address of local label 1 (end of the clamped loop).
76  .if \max_r < 16 // approximate
77    ifcc    adr         \reg, 1f
78  .else
79    ifcc    adrp        \reg, 1f
80    ifcc    add         \reg, \reg, #:lo12:1f
81  .endif
82
            // Tap 0: widening multiply by v0.h[0] into the four 32-bit
            // accumulators v12-v15.  Interleaved with that, the target becomes
            // label 1 - 64*r + 8*r = label 1 - 56*r, i.e. r tap bodies before
            // the end of the clamped loop (each clamped tap body below is 14
            // four-byte instructions).
83            umull       v12.4s, v14.4h, v0.h[0]
84    ifcc    sub         \reg, \reg, x5, LSL #6
85            umull2      v13.4s, v14.8h, v0.h[0]
            // The lower-row walk starts at the bottom-row pointer.
86            mov         x11, x19
87            umull       v14.4s, v15.4h, v0.h[0]
88    ifcc    add         \reg, \reg, x5, LSL #3
89            umull2      v15.4s, v15.8h, v0.h[0]
90            br          \reg
91
            // Emit the unrolled loop twice: first with the clamping lines
            // enabled (cc = 1), then without them (cc = 0).
92  .irp rowclamp, 1, 0
93    .set cc, \rowclamp
94    .align 4
            // Walk the coefficient lanes from v4.h[7] down to v0.h[0]; doth is
            // a one-item list used only to paste .h into the register name.
            // i is the tap number; only taps 1..max_r produce code.
95    .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
96        .set i, \dreg * 8 + \lane
97        .if 0 < i && i <= \max_r
            // Load one row through each pointer, post-incrementing x10 by x2
            // and x11 by x13 (pitch and -pitch per the header).  In the
            // clamped copy x10 is put back to x15 when x6 (rup) is below i,
            // and x11 back to x19 when x7 (rdn) is below i (unsigned compare).
98            ld1         {v10.16b}, [x10], x2
99    ifcc    cmp         x6, #i
100            ld1         {v11.16b}, [x11], x13
101    ifcc    csel        x10, x15, x10, lo
            // Both rows share one coefficient, so add them first as halfwords:
            // v16 = columns 0-7, v11 = columns 8-15.
102            uaddl       v16.8h, v10.8b, v11.8b
103    ifcc    cmp         x7, #i
104            uaddl2      v11.8h, v10.16b, v11.16b
105    ifcc    csel        x11, x19, x11, lo
            // Accumulate the pair sum times coefficient lane i.
106            umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
107            umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
108//            prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
            // Each nop sits where a prfm is commented out.  The branch target
            // computed above assumes 56 bytes per clamped tap body, and that
            // count includes both nops -- do not remove them.
109nop
110            umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
111//            prfm        PLDL1KEEP,[x11, #32] // TODO: confirm
112nop
113            umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
114        .endif
115    .endr ; .endr ; .endr
116    .if \rowclamp == 1
            // End of the clamped loop: local label 1 plus the caller-chosen
            // labelc, then skip over the unclamped copy.
117        1: \labelc :
118            b           2f
119    .else
            // End of the unclamped loop: local label 2 plus labelnc.
120        2: \labelnc :
121    .endif
122  .endr
123
            // Narrow each 32-bit sum to 16 bits with rounding and unsigned
            // saturation, shifting right by 16 - FRACTION_BITS: v10 receives
            // columns 0-7 and v11 columns 8-15.  The two row pointers move on
            // by the same 16 bytes as x1 did.
124            uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
125            add         x15, x15, #16
126            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
127            add         x19, x19, #16
128            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
129            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
130.endm /*}}}*/
131
132/* Some portion of the convolution window (as much as will fit, and all of it
133 * for the uchar1 cases) is kept in the register file to avoid unnecessary
134 * memory accesses.  This forces the horizontal loops to be unrolled because
135 * there's no indexed addressing into the register file.
136 *
137 * As in the fetch macro, the operations are ordered from outside to inside, so
138 * that jumping into the middle of the block bypasses the unwanted window taps.
139 *
140 * There are several variants of the macro because of the fixed offsets of the
141 * taps -- the wider the maximum radius the further the centre tap is from the
142 * most recently fetched data.  This means that pre-filling the window requires
143 * more data that won't be used and it means that rotating the window involves
144 * more mov operations.
145 *
146 * When the buffer gets too big the buffer at [x9] is used.
147 *
148 * Input:
149 *      v16-v31,v4-v11 -- convolution window
150 *      x9 -- pointer to additional convolution window data
151 * Output:
152 *      x9 -- updated buffer pointer (if used)
153 *      v15.8b -- result to be stored (low 64 bits of v15)
154 * Modifies:
155 *      x12 -- temp buffer pointer
 *      x16 -- base address of the tap jump table
156 *      v12-v13 -- temporaries for load and vext operations.
157 *      v14-v15 -- intermediate sums
158 */
/* List matching the numeric suffixes of the hconv1_8 and hconv1_16 macros
 * below.  NOTE(review): the consumer of this list is outside the visible part
 * of the file -- confirm how it is used.
 */
159#define TUNED_LIST1 8, 16
/* hconv1_8 -- horizontal convolution of eight 16-bit lanes, taps one lane
 * apart, for up to 8 taps either side of the centre.
 *
 * The centre lanes are v9; the lanes before them come from v8:v9 and the lanes
 * after them from v9:v10.  x5 indexes the halfword offset table at label 100;
 * branching to label 10<n> executes taps n, n-1, ..., 1 by fall-through.  Tap
 * k uses coefficient lane k (v0.h[1]..v0.h[7], then v1.h[0] for tap 8).
 *
 * On exit v15.8b holds the eight narrowed results and the window has moved on
 * one register (v8 <- v9 <- v10 <- v11).  Clobbers x12, x16 and v12-v15.
 */
160.macro hconv1_8/*{{{*/
            // Centre tap: widening multiply of v9 by v0.h[0]; v14 accumulates
            // lanes 0-3 and v15 lanes 4-7 as 32-bit sums.
161            umull       v14.4s, v9.4h, v0.h[0]
162            umull2      v15.4s, v9.8h, v0.h[0]
163
            // Dispatch: x12 = table base + signed halfword entry number x5.
164            adr         x16, 100f
165            ldrsh       x12, [x16, x5, LSL #1]
166            add         x12, x12, x16
167            br          x12
            // Entry 0 is -4, which resolves to the br directly above.
            // NOTE(review): x5 == 0 is presumably never used here -- confirm.
168   100:     .hword -4
169            .hword 101f-100b
170            .hword 102f-100b
171            .hword 103f-100b
172            .hword 104f-100b
173            .hword 105f-100b
174            .hword 106f-100b
175            .hword 107f-100b
176            .hword 108f-100b
177            .align      4
            // Tap 8 is exactly one register away: use v8 and v10 whole.
178    108:    umlal       v14.4s, v8.4h, v1.h[0]
179            umlal2      v15.4s, v8.8h, v1.h[0]
180            umlal       v14.4s, v10.4h, v1.h[0]
181            umlal2      v15.4s, v10.8h, v1.h[0]
            // Taps 7-5 and 3-1: ext extracts the eight lanes that start k
            // lanes before the centre (v12) and k lanes after it (v13).
182    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
183            ext         v13.16b, v9.16b, v10.16b, #7*2
184            umlal       v14.4s, v12.4h, v0.h[7]
185            umlal2      v15.4s, v12.8h, v0.h[7]
186            umlal       v14.4s, v13.4h, v0.h[7]
187            umlal2      v15.4s, v13.8h, v0.h[7]
188    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
189            ext         v13.16b, v9.16b, v10.16b, #6*2
190            umlal       v14.4s, v12.4h, v0.h[6]
191            umlal2      v15.4s, v12.8h, v0.h[6]
192            umlal       v14.4s, v13.4h, v0.h[6]
193            umlal2      v15.4s, v13.8h, v0.h[6]
194    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
195            ext         v13.16b, v9.16b, v10.16b, #5*2
196            umlal       v14.4s, v12.4h, v0.h[5]
197            umlal2      v15.4s, v12.8h, v0.h[5]
198            umlal       v14.4s, v13.4h, v0.h[5]
199            umlal2      v15.4s, v13.8h, v0.h[5]
            // Tap 4 is half a register away, so the register halves are used
            // directly in place of the two ext results: v14 takes the high
            // halves of v8 and v9, v15 the low halves of v9 and v10.
200    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
201            //ext         v13.16b, v9.16b, v10.16b, #4*2
202            umlal2      v14.4s, v8.8h, v0.h[4]
203            umlal       v15.4s, v9.4h, v0.h[4]
204            umlal2      v14.4s, v9.8h, v0.h[4]
205            umlal       v15.4s, v10.4h, v0.h[4]
206    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
207            ext         v13.16b, v9.16b, v10.16b, #3*2
208            umlal       v14.4s, v12.4h, v0.h[3]
209            umlal2      v15.4s, v12.8h, v0.h[3]
210            umlal       v14.4s, v13.4h, v0.h[3]
211            umlal2      v15.4s, v13.8h, v0.h[3]
212    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
213            ext         v13.16b, v9.16b, v10.16b, #2*2
214            umlal       v14.4s, v12.4h, v0.h[2]
215            umlal2      v15.4s, v12.8h, v0.h[2]
216            umlal       v14.4s, v13.4h, v0.h[2]
217            umlal2      v15.4s, v13.8h, v0.h[2]
218    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
219            ext         v13.16b, v9.16b, v10.16b, #1*2
220            umlal       v14.4s, v12.4h, v0.h[1]
221            umlal2      v15.4s, v12.8h, v0.h[1]
222            umlal       v14.4s, v13.4h, v0.h[1]
223            umlal2      v15.4s, v13.8h, v0.h[1]
224
            // Narrow with rounding and unsigned saturation: 32 -> 16 bits by
            // a shift of 16, then 16 -> 8 bits by FRACTION_BITS into v15.8b.
225            uqrshrn     v14.4h, v14.4s, #16
226            uqrshrn2    v14.8h, v15.4s, #16
227            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
228
            // Slide the window along by one register.
229            mov         v8.16b, v9.16b
230            mov         v9.16b, v10.16b
231            mov         v10.16b, v11.16b
232.endm/*}}}*/
233
/* hconv1_16 -- horizontal convolution of eight 16-bit lanes, taps one lane
 * apart, for up to 16 taps either side of the centre.
 *
 * The centre lanes are v8; the window spans v6..v10.  x5 indexes the halfword
 * offset table at label 100; branching to label 1<nn> executes taps nn down to
 * 1 by fall-through.  Tap k uses coefficient lane k counted through v0, v1 and
 * v2 (v0.h[1] .. v2.h[0]).
 *
 * On exit v15.8b holds the eight narrowed results and the window has moved on
 * one register (v6 <- v7 <- ... <- v10 <- v11).  Clobbers x12, x16, v12-v15.
 */
234.macro hconv1_16/*{{{*/
            // Centre tap: v8 times v0.h[0]; v14 = lanes 0-3, v15 = lanes 4-7.
235            umull       v14.4s, v8.4h, v0.h[0]
236            umull2      v15.4s, v8.8h, v0.h[0]
237
            // Dispatch: x12 = table base + signed halfword entry number x5.
238            adr         x16, 100f
239            ldrsh       x12, [x16, x5, LSL #1]
240            add         x12, x12, x16
241            br          x12
            // Entry 0 is -4, which resolves to the br directly above.
242   100:     .hword -4
243            .hword 101f-100b
244            .hword 102f-100b
245            .hword 103f-100b
246            .hword 104f-100b
247            .hword 105f-100b
248            .hword 106f-100b
249            .hword 107f-100b
250            .hword 108f-100b
251            .hword 109f-100b
252            .hword 110f-100b
253            .hword 111f-100b
254            .hword 112f-100b
255            .hword 113f-100b
256            .hword 114f-100b
257            .hword 115f-100b
258            .hword 116f-100b
259            .align 4
            // Tap 16 is exactly two registers away: v6 and v10 are used whole
            // (the commented-out ext lines show the equivalent extraction).
260    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
261            //ext         v13.16b, v10.16b, v11.16b, #0*2
262            umlal       v14.4s, v6.4h, v2.h[0]
263            umlal2      v15.4s, v6.8h, v2.h[0]
264            umlal       v14.4s, v10.4h, v2.h[0]
265            umlal2      v15.4s, v10.8h, v2.h[0]
            // Taps 15-9 take the lanes before the centre from v6:v7 and the
            // lanes after it from v9:v10.
266    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
267            ext         v13.16b, v9.16b, v10.16b, #7*2
268            umlal       v14.4s, v12.4h, v1.h[7]
269            umlal2      v15.4s, v12.8h, v1.h[7]
270            umlal       v14.4s, v13.4h, v1.h[7]
271            umlal2      v15.4s, v13.8h, v1.h[7]
272    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
273            ext         v13.16b, v9.16b, v10.16b, #6*2
274            umlal       v14.4s, v12.4h, v1.h[6]
275            umlal2      v15.4s, v12.8h, v1.h[6]
276            umlal       v14.4s, v13.4h, v1.h[6]
277            umlal2      v15.4s, v13.8h, v1.h[6]
278    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
279            ext         v13.16b, v9.16b, v10.16b, #5*2
280            umlal       v14.4s, v12.4h, v1.h[5]
281            umlal2      v15.4s, v12.8h, v1.h[5]
282            umlal       v14.4s, v13.4h, v1.h[5]
283            umlal2      v15.4s, v13.8h, v1.h[5]
            // Tap 12 sits on a half-register boundary: register halves are
            // used directly (high halves into v14, low halves into v15).
284    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
285            //ext         v13.16b, v9.16b, v10.16b, #4*2
286            umlal2      v14.4s, v6.8h, v1.h[4]
287            umlal       v15.4s, v7.4h, v1.h[4]
288            umlal2      v14.4s, v9.8h, v1.h[4]
289            umlal       v15.4s, v10.4h, v1.h[4]
290    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
291            ext         v13.16b, v9.16b, v10.16b, #3*2
292            umlal       v14.4s, v12.4h, v1.h[3]
293            umlal2      v15.4s, v12.8h, v1.h[3]
294            umlal       v14.4s, v13.4h, v1.h[3]
295            umlal2      v15.4s, v13.8h, v1.h[3]
296    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
297            ext         v13.16b, v9.16b, v10.16b, #2*2
298            umlal       v14.4s, v12.4h, v1.h[2]
299            umlal2      v15.4s, v12.8h, v1.h[2]
300            umlal       v14.4s, v13.4h, v1.h[2]
301            umlal2      v15.4s, v13.8h, v1.h[2]
302    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
303            ext         v13.16b, v9.16b, v10.16b, #1*2
304            umlal       v14.4s, v12.4h, v1.h[1]
305            umlal2      v15.4s, v12.8h, v1.h[1]
306            umlal       v14.4s, v13.4h, v1.h[1]
307            umlal2      v15.4s, v13.8h, v1.h[1]
            // Tap 8 is exactly one register away: v7 and v9 are used whole.
308    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
309            //ext         v13.16b, v9.16b, v10.16b, #0*2
310            umlal       v14.4s, v7.4h, v1.h[0]
311            umlal2      v15.4s, v7.8h, v1.h[0]
312            umlal       v14.4s, v9.4h, v1.h[0]
313            umlal2      v15.4s, v9.8h, v1.h[0]
            // Taps 7-1 take the lanes before the centre from v7:v8 and the
            // lanes after it from v8:v9.
314    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
315            ext         v13.16b, v8.16b, v9.16b, #7*2
316            umlal       v14.4s, v12.4h, v0.h[7]
317            umlal2      v15.4s, v12.8h, v0.h[7]
318            umlal       v14.4s, v13.4h, v0.h[7]
319            umlal2      v15.4s, v13.8h, v0.h[7]
320    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
321            ext         v13.16b, v8.16b, v9.16b, #6*2
322            umlal       v14.4s, v12.4h, v0.h[6]
323            umlal2      v15.4s, v12.8h, v0.h[6]
324            umlal       v14.4s, v13.4h, v0.h[6]
325            umlal2      v15.4s, v13.8h, v0.h[6]
326    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
327            ext         v13.16b, v8.16b, v9.16b, #5*2
328            umlal       v14.4s, v12.4h, v0.h[5]
329            umlal2      v15.4s, v12.8h, v0.h[5]
330            umlal       v14.4s, v13.4h, v0.h[5]
331            umlal2      v15.4s, v13.8h, v0.h[5]
            // Tap 4: half-register boundary again, halves used directly.
332    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
333            //ext         v13.16b, v8.16b, v9.16b, #4*2
334            umlal2      v14.4s, v7.8h, v0.h[4]
335            umlal       v15.4s, v8.4h, v0.h[4]
336            umlal2      v14.4s, v8.8h, v0.h[4]
337            umlal       v15.4s, v9.4h, v0.h[4]
338    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
339            ext         v13.16b, v8.16b, v9.16b, #3*2
340            umlal       v14.4s, v12.4h, v0.h[3]
341            umlal2      v15.4s, v12.8h, v0.h[3]
342            umlal       v14.4s, v13.4h, v0.h[3]
343            umlal2      v15.4s, v13.8h, v0.h[3]
344    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
345            ext         v13.16b, v8.16b, v9.16b, #2*2
346            umlal       v14.4s, v12.4h, v0.h[2]
347            umlal2      v15.4s, v12.8h, v0.h[2]
348            umlal       v14.4s, v13.4h, v0.h[2]
349            umlal2      v15.4s, v13.8h, v0.h[2]
350    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
351            ext         v13.16b, v8.16b, v9.16b, #1*2
352            umlal       v14.4s, v12.4h, v0.h[1]
353            umlal2      v15.4s, v12.8h, v0.h[1]
354            umlal       v14.4s, v13.4h, v0.h[1]
355            umlal2      v15.4s, v13.8h, v0.h[1]
356
            // Narrow with rounding and unsigned saturation: 32 -> 16 bits by
            // a shift of 16, then 16 -> 8 bits by FRACTION_BITS into v15.8b.
357            uqrshrn     v14.4h, v14.4s, #16
358            uqrshrn2    v14.8h, v15.4s, #16
359            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
360
            // Slide the window along by one register.
361            mov         v6.16b, v7.16b
362            mov         v7.16b, v8.16b
363            mov         v8.16b, v9.16b
364            mov         v9.16b, v10.16b
365            mov         v10.16b, v11.16b
366.endm/*}}}*/
367
/* hconv1_25 -- horizontal convolution of eight 16-bit lanes, taps one lane
 * apart, for up to 25 taps either side of the centre.
 *
 * The centre is not register-aligned: it is the eight lanes starting at lane 7
 * of v6 (extracted from v6:v7).  The window spans v31, v4..v10, with v11
 * shifted in at the end.  x5 indexes the halfword offset table at label 100;
 * branching to label 1<nn> executes taps nn down to 1 by fall-through.  Tap k
 * uses coefficient lane k counted through v0..v3 (v0.h[1] .. v3.h[1]).
 *
 * Every tap goes through ext, including those with a zero offset (#0*2), which
 * reproduce the first source register unchanged.
 *
 * On exit v15.8b holds the eight narrowed results and the window has moved on
 * one register.  Clobbers x12, x16 and v12-v15.
 */
368.macro hconv1_25/*{{{*/
            // Centre tap: lanes 7.. of v6:v7 times v0.h[0].
369            ext         v12.16b, v6.16b, v7.16b, #7*2
370            umull       v14.4s, v12.4h, v0.h[0]
371            umull2      v15.4s, v12.8h, v0.h[0]
372
            // Dispatch: x12 = table base + signed halfword entry number x5.
373            adr         x16, 100f
374            ldrsh       x12, [x16, x5, LSL #1]
375            add         x12, x12, x16
376            br          x12
            // Entry 0 is -4, which resolves to the br directly above.
377   100:     .hword -4
378            .hword 101f-100b
379            .hword 102f-100b
380            .hword 103f-100b
381            .hword 104f-100b
382            .hword 105f-100b
383            .hword 106f-100b
384            .hword 107f-100b
385            .hword 108f-100b
386            .hword 109f-100b
387            .hword 110f-100b
388            .hword 111f-100b
389            .hword 112f-100b
390            .hword 113f-100b
391            .hword 114f-100b
392            .hword 115f-100b
393            .hword 116f-100b
394            .hword 117f-100b
395            .hword 118f-100b
396            .hword 119f-100b
397            .hword 120f-100b
398            .hword 121f-100b
399            .hword 122f-100b
400            .hword 123f-100b
401            .hword 124f-100b
402            .hword 125f-100b
403            .align 4
            // Taps 25-24: lanes before the centre come from v31:v4.
404    125:    ext         v12.16b, v31.16b, v4.16b, #6*2
405            ext         v13.16b, v10.16b, v11.16b, #0*2
406            umlal       v14.4s, v12.4h, v3.h[1]
407            umlal2      v15.4s, v12.8h, v3.h[1]
408            umlal       v14.4s, v13.4h, v3.h[1]
409            umlal2      v15.4s, v13.8h, v3.h[1]
410    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
411            ext         v13.16b, v9.16b, v10.16b, #7*2
412            umlal       v14.4s, v12.4h, v3.h[0]
413            umlal2      v15.4s, v12.8h, v3.h[0]
414            umlal       v14.4s, v13.4h, v3.h[0]
415            umlal2      v15.4s, v13.8h, v3.h[0]
            // Taps 23-16: lanes before the centre come from v4:v5.
416    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
417            ext         v13.16b, v9.16b, v10.16b, #6*2
418            umlal       v14.4s, v12.4h, v2.h[7]
419            umlal2      v15.4s, v12.8h, v2.h[7]
420            umlal       v14.4s, v13.4h, v2.h[7]
421            umlal2      v15.4s, v13.8h, v2.h[7]
422    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
423            ext         v13.16b, v9.16b, v10.16b, #5*2
424            umlal       v14.4s, v12.4h, v2.h[6]
425            umlal2      v15.4s, v12.8h, v2.h[6]
426            umlal       v14.4s, v13.4h, v2.h[6]
427            umlal2      v15.4s, v13.8h, v2.h[6]
428    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
429            ext         v13.16b, v9.16b, v10.16b, #4*2
430            umlal       v14.4s, v12.4h, v2.h[5]
431            umlal2      v15.4s, v12.8h, v2.h[5]
432            umlal       v14.4s, v13.4h, v2.h[5]
433            umlal2      v15.4s, v13.8h, v2.h[5]
434    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
435            ext         v13.16b, v9.16b, v10.16b, #3*2
436            umlal       v14.4s, v12.4h, v2.h[4]
437            umlal2      v15.4s, v12.8h, v2.h[4]
438            umlal       v14.4s, v13.4h, v2.h[4]
439            umlal2      v15.4s, v13.8h, v2.h[4]
440    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
441            ext         v13.16b, v9.16b, v10.16b, #2*2
442            umlal       v14.4s, v12.4h, v2.h[3]
443            umlal2      v15.4s, v12.8h, v2.h[3]
444            umlal       v14.4s, v13.4h, v2.h[3]
445            umlal2      v15.4s, v13.8h, v2.h[3]
446    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
447            ext         v13.16b, v9.16b, v10.16b, #1*2
448            umlal       v14.4s, v12.4h, v2.h[2]
449            umlal2      v15.4s, v12.8h, v2.h[2]
450            umlal       v14.4s, v13.4h, v2.h[2]
451            umlal2      v15.4s, v13.8h, v2.h[2]
452    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
453            ext         v13.16b, v9.16b, v10.16b, #0*2
454            umlal       v14.4s, v12.4h, v2.h[1]
455            umlal2      v15.4s, v12.8h, v2.h[1]
456            umlal       v14.4s, v13.4h, v2.h[1]
457            umlal2      v15.4s, v13.8h, v2.h[1]
458    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
459            ext         v13.16b, v8.16b, v9.16b, #7*2
460            umlal       v14.4s, v12.4h, v2.h[0]
461            umlal2      v15.4s, v12.8h, v2.h[0]
462            umlal       v14.4s, v13.4h, v2.h[0]
463            umlal2      v15.4s, v13.8h, v2.h[0]
            // Taps 15-8: lanes before the centre come from v5:v6.
464    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
465            ext         v13.16b, v8.16b, v9.16b, #6*2
466            umlal       v14.4s, v12.4h, v1.h[7]
467            umlal2      v15.4s, v12.8h, v1.h[7]
468            umlal       v14.4s, v13.4h, v1.h[7]
469            umlal2      v15.4s, v13.8h, v1.h[7]
470    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
471            ext         v13.16b, v8.16b, v9.16b, #5*2
472            umlal       v14.4s, v12.4h, v1.h[6]
473            umlal2      v15.4s, v12.8h, v1.h[6]
474            umlal       v14.4s, v13.4h, v1.h[6]
475            umlal2      v15.4s, v13.8h, v1.h[6]
476    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
477            ext         v13.16b, v8.16b, v9.16b, #4*2
478            umlal       v14.4s, v12.4h, v1.h[5]
479            umlal2      v15.4s, v12.8h, v1.h[5]
480            umlal       v14.4s, v13.4h, v1.h[5]
481            umlal2      v15.4s, v13.8h, v1.h[5]
482    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
483            ext         v13.16b, v8.16b, v9.16b, #3*2
484            umlal       v14.4s, v12.4h, v1.h[4]
485            umlal2      v15.4s, v12.8h, v1.h[4]
486            umlal       v14.4s, v13.4h, v1.h[4]
487            umlal2      v15.4s, v13.8h, v1.h[4]
488    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
489            ext         v13.16b, v8.16b, v9.16b, #2*2
490            umlal       v14.4s, v12.4h, v1.h[3]
491            umlal2      v15.4s, v12.8h, v1.h[3]
492            umlal       v14.4s, v13.4h, v1.h[3]
493            umlal2      v15.4s, v13.8h, v1.h[3]
494    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
495            ext         v13.16b, v8.16b, v9.16b, #1*2
496            umlal       v14.4s, v12.4h, v1.h[2]
497            umlal2      v15.4s, v12.8h, v1.h[2]
498            umlal       v14.4s, v13.4h, v1.h[2]
499            umlal2      v15.4s, v13.8h, v1.h[2]
500    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
501            ext         v13.16b, v8.16b, v9.16b, #0*2
502            umlal       v14.4s, v12.4h, v1.h[1]
503            umlal2      v15.4s, v12.8h, v1.h[1]
504            umlal       v14.4s, v13.4h, v1.h[1]
505            umlal2      v15.4s, v13.8h, v1.h[1]
506    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
507            ext         v13.16b, v7.16b, v8.16b, #7*2
508            umlal       v14.4s, v12.4h, v1.h[0]
509            umlal2      v15.4s, v12.8h, v1.h[0]
510            umlal       v14.4s, v13.4h, v1.h[0]
511            umlal2      v15.4s, v13.8h, v1.h[0]
            // Taps 7-1: lanes before the centre come from v6:v7 and the lanes
            // after it from v7:v8.
512    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
513            ext         v13.16b, v7.16b, v8.16b, #6*2
514            umlal       v14.4s, v12.4h, v0.h[7]
515            umlal2      v15.4s, v12.8h, v0.h[7]
516            umlal       v14.4s, v13.4h, v0.h[7]
517            umlal2      v15.4s, v13.8h, v0.h[7]
518    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
519            ext         v13.16b, v7.16b, v8.16b, #5*2
520            umlal       v14.4s, v12.4h, v0.h[6]
521            umlal2      v15.4s, v12.8h, v0.h[6]
522            umlal       v14.4s, v13.4h, v0.h[6]
523            umlal2      v15.4s, v13.8h, v0.h[6]
524    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
525            ext         v13.16b, v7.16b, v8.16b, #4*2
526            umlal       v14.4s, v12.4h, v0.h[5]
527            umlal2      v15.4s, v12.8h, v0.h[5]
528            umlal       v14.4s, v13.4h, v0.h[5]
529            umlal2      v15.4s, v13.8h, v0.h[5]
530    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
531            ext         v13.16b, v7.16b, v8.16b, #3*2
532            umlal       v14.4s, v12.4h, v0.h[4]
533            umlal2      v15.4s, v12.8h, v0.h[4]
534            umlal       v14.4s, v13.4h, v0.h[4]
535            umlal2      v15.4s, v13.8h, v0.h[4]
536    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
537            ext         v13.16b, v7.16b, v8.16b, #2*2
538            umlal       v14.4s, v12.4h, v0.h[3]
539            umlal2      v15.4s, v12.8h, v0.h[3]
540            umlal       v14.4s, v13.4h, v0.h[3]
541            umlal2      v15.4s, v13.8h, v0.h[3]
542    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
543            ext         v13.16b, v7.16b, v8.16b, #1*2
544            umlal       v14.4s, v12.4h, v0.h[2]
545            umlal2      v15.4s, v12.8h, v0.h[2]
546            umlal       v14.4s, v13.4h, v0.h[2]
547            umlal2      v15.4s, v13.8h, v0.h[2]
548    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
549            ext         v13.16b, v7.16b, v8.16b, #0*2
550            umlal       v14.4s, v12.4h, v0.h[1]
551            umlal2      v15.4s, v12.8h, v0.h[1]
552            umlal       v14.4s, v13.4h, v0.h[1]
553            umlal2      v15.4s, v13.8h, v0.h[1]
554
            // Narrow with rounding and unsigned saturation: 32 -> 16 bits by
            // a shift of 16, then 16 -> 8 bits by FRACTION_BITS into v15.8b.
555            uqrshrn     v14.4h, v14.4s, #16
556            uqrshrn2    v14.8h, v15.4s, #16
557            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
558
            // Slide the window along by one register.
559            mov         v31.16b, v4.16b
560            mov         v4.16b, v5.16b
561            mov         v5.16b, v6.16b
562            mov         v6.16b, v7.16b
563            mov         v7.16b, v8.16b
564            mov         v8.16b, v9.16b
565            mov         v9.16b, v10.16b
566            mov         v10.16b, v11.16b
567.endm/*}}}*/
568
/* List matching the numeric suffixes of the hconv4_6, hconv4_12 and hconv4_20
 * macros.  NOTE(review): the consumer of this list is outside the visible part
 * of the file -- confirm how it is used.
 */
569#define TUNED_LIST4 6, 12, 20
/* hconv4_6 -- horizontal convolution of eight 16-bit lanes where consecutive
 * taps are four lanes (half a register) apart, for up to 6 taps either side of
 * the centre.  NOTE(review): the four-lane stride suggests four interleaved
 * channels per pixel -- confirm against the callers.
 *
 * The centre lanes are v7; the window spans v4..v10.  Because every tap falls
 * on a register or half-register boundary no ext is needed: even taps use
 * whole registers, odd taps pair the high half of one register (into v14) with
 * the low half of the next (into v15).  x5 indexes the halfword offset table
 * at label 100; branching to label 10<n> executes taps n down to 1 by
 * fall-through, tap k using coefficient lane v0.h[k].
 *
 * On exit v15.8b holds the eight narrowed results and the window has moved on
 * one register (v4 <- v5 <- ... <- v10 <- v11).  Clobbers x12, x16, v14-v15.
 */
570.macro hconv4_6/*{{{*/
            // Centre tap: v7 times v0.h[0]; v14 = lanes 0-3, v15 = lanes 4-7.
571            umull       v14.4s, v7.4h, v0.h[0]
572            umull2      v15.4s, v7.8h, v0.h[0]
573
            // Dispatch: x12 = table base + signed halfword entry number x5.
574            adr         x16, 100f
575            ldrsh       x12, [x16, x5, LSL #1]
576            add         x12, x12, x16
577            br          x12
            // Entry 0 is -4, which resolves to the br directly above.
578   100:     .hword -4
579            .hword 101f-100b
580            .hword 102f-100b
581            .hword 103f-100b
582            .hword 104f-100b
583            .hword 105f-100b
584            .hword 106f-100b
585            .align      4
            // Even tap: whole registers three either side of the centre.
586    106:    umlal       v14.4s, v4.4h,  v0.h[6]
587            umlal2      v15.4s, v4.8h,  v0.h[6]
588            umlal       v14.4s, v10.4h, v0.h[6]
589            umlal2      v15.4s, v10.8h, v0.h[6]
            // Odd tap: high half of one register with low half of the next.
590    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
591            umlal       v15.4s, v5.4h, v0.h[5]
592            umlal2      v14.4s, v9.8h, v0.h[5]
593            umlal       v15.4s, v10.4h, v0.h[5]
594    104:    umlal       v14.4s, v5.4h, v0.h[4]
595            umlal2      v15.4s, v5.8h, v0.h[4]
596            umlal       v14.4s, v9.4h, v0.h[4]
597            umlal2      v15.4s, v9.8h, v0.h[4]
598    103:    umlal2      v14.4s, v5.8h, v0.h[3]
599            umlal       v15.4s, v6.4h, v0.h[3]
600            umlal2      v14.4s, v8.8h, v0.h[3]
601            umlal       v15.4s, v9.4h, v0.h[3]
602    102:    umlal       v14.4s, v6.4h, v0.h[2]
603            umlal2      v15.4s, v6.8h, v0.h[2]
604            umlal       v14.4s, v8.4h, v0.h[2]
605            umlal2      v15.4s, v8.8h, v0.h[2]
606    101:    umlal2      v14.4s, v6.8h, v0.h[1]
607            umlal       v15.4s, v7.4h, v0.h[1]
608            umlal2      v14.4s, v7.8h, v0.h[1]
609            umlal       v15.4s, v8.4h, v0.h[1]
610
            // Narrow with rounding and unsigned saturation: 32 -> 16 bits by
            // a shift of 16, then 16 -> 8 bits by FRACTION_BITS into v15.8b.
611            uqrshrn     v14.4h, v14.4s, #16
612            uqrshrn2    v14.8h, v15.4s, #16
613            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
614
            // Slide the window along by one register.
615            mov         v4.16b, v5.16b
616            mov         v5.16b, v6.16b
617            mov         v6.16b, v7.16b
618            mov         v7.16b, v8.16b
619            mov         v8.16b, v9.16b
620            mov         v9.16b, v10.16b
621            mov         v10.16b, v11.16b
622.endm/*}}}*/
623
/* hconv4_12 -- horizontal convolution of eight 16-bit lanes where consecutive
 * taps are four lanes (half a register) apart, for up to 12 taps either side
 * of the centre.
 *
 * The centre lanes are v4; the window runs v26..v31 and then v4..v10, with v11
 * shifted in at the end.  Even taps use whole registers, odd taps pair the
 * high half of one register (into v14) with the low half of the next (into
 * v15), so no ext is needed.  x5 indexes the halfword offset table at label
 * 100; branching to label 1<nn> executes taps nn down to 1 by fall-through.
 * Tap k uses coefficient lane k counted through v0 and v1 (v0.h[1]..v1.h[4]).
 *
 * On exit v15.8b holds the eight narrowed results and the window has moved on
 * one register.  Clobbers x12, x16 and v14-v15.
 */
624.macro hconv4_12/*{{{*/
            // Centre tap: v4 times v0.h[0]; v14 = lanes 0-3, v15 = lanes 4-7.
625            umull       v14.4s, v4.4h, v0.h[0]
626            umull2      v15.4s, v4.8h, v0.h[0]
627
            // Dispatch: x12 = table base + signed halfword entry number x5.
628            adr         x16, 100f
629            ldrsh       x12, [x16, x5, LSL #1]
630            add         x12, x12, x16
631            br          x12
            // Entry 0 is -4, which resolves to the br directly above.
632   100:     .hword -4
633            .hword 101f-100b
634            .hword 102f-100b
635            .hword 103f-100b
636            .hword 104f-100b
637            .hword 105f-100b
638            .hword 106f-100b
639            .hword 107f-100b
640            .hword 108f-100b
641            .hword 109f-100b
642            .hword 110f-100b
643            .hword 111f-100b
644            .hword 112f-100b
645            .align 4
            // Even tap: whole registers six either side of the centre.
646    112:    umlal       v14.4s, v26.4h, v1.h[4]
647            umlal2      v15.4s, v26.8h, v1.h[4]
648            umlal       v14.4s, v10.4h, v1.h[4]
649            umlal2      v15.4s, v10.8h, v1.h[4]
            // Odd tap: high half of one register with low half of the next.
650    111:    umlal2      v14.4s, v26.8h, v1.h[3]
651            umlal       v15.4s, v27.4h, v1.h[3]
652            umlal2      v14.4s, v9.8h, v1.h[3]
653            umlal       v15.4s, v10.4h, v1.h[3]
654    110:    umlal       v14.4s, v27.4h, v1.h[2]
655            umlal2      v15.4s, v27.8h, v1.h[2]
656            umlal       v14.4s, v9.4h, v1.h[2]
657            umlal2      v15.4s, v9.8h, v1.h[2]
658    109:    umlal2      v14.4s, v27.8h, v1.h[1]
659            umlal       v15.4s, v28.4h, v1.h[1]
660            umlal2      v14.4s, v8.8h, v1.h[1]
661            umlal       v15.4s, v9.4h, v1.h[1]
662    108:    umlal       v14.4s, v28.4h, v1.h[0]
663            umlal2      v15.4s, v28.8h, v1.h[0]
664            umlal       v14.4s, v8.4h, v1.h[0]
665            umlal2      v15.4s, v8.8h, v1.h[0]
666    107:    umlal2      v14.4s, v28.8h, v0.h[7]
667            umlal       v15.4s, v29.4h, v0.h[7]
668            umlal2      v14.4s, v7.8h, v0.h[7]
669            umlal       v15.4s, v8.4h, v0.h[7]
670    106:    umlal       v14.4s, v29.4h, v0.h[6]
671            umlal2      v15.4s, v29.8h, v0.h[6]
672            umlal       v14.4s, v7.4h, v0.h[6]
673            umlal2      v15.4s, v7.8h, v0.h[6]
674    105:    umlal2      v14.4s, v29.8h, v0.h[5]
675            umlal       v15.4s, v30.4h, v0.h[5]
676            umlal2      v14.4s, v6.8h, v0.h[5]
677            umlal       v15.4s, v7.4h, v0.h[5]
678    104:    umlal       v14.4s, v30.4h, v0.h[4]
679            umlal2      v15.4s, v30.8h, v0.h[4]
680            umlal       v14.4s, v6.4h, v0.h[4]
681            umlal2      v15.4s, v6.8h, v0.h[4]
682    103:    umlal2      v14.4s, v30.8h, v0.h[3]
683            umlal       v15.4s, v31.4h, v0.h[3]
684            umlal2      v14.4s, v5.8h, v0.h[3]
685            umlal       v15.4s, v6.4h, v0.h[3]
686    102:    umlal       v14.4s, v31.4h, v0.h[2]
687            umlal2      v15.4s, v31.8h, v0.h[2]
688            umlal       v14.4s, v5.4h, v0.h[2]
689            umlal2      v15.4s, v5.8h, v0.h[2]
690    101:    umlal2      v14.4s, v31.8h, v0.h[1]
691            umlal       v15.4s, v4.4h,  v0.h[1]
692            umlal2      v14.4s, v4.8h,  v0.h[1]
693            umlal       v15.4s, v5.4h, v0.h[1]
694
            // Narrow with rounding and unsigned saturation: 32 -> 16 bits by
            // a shift of 16, then 16 -> 8 bits by FRACTION_BITS into v15.8b.
695            uqrshrn     v14.4h, v14.4s, #16
696            uqrshrn2    v14.8h, v15.4s, #16
697            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
698
            // Slide the window along by one register.
699            mov         v26.16b, v27.16b
700            mov         v27.16b, v28.16b
701            mov         v28.16b, v29.16b
702            mov         v29.16b, v30.16b
703            mov         v30.16b, v31.16b
704            mov         v31.16b, v4.16b
705            mov         v4.16b, v5.16b
706            mov         v5.16b, v6.16b
707            mov         v6.16b, v7.16b
708            mov         v7.16b, v8.16b
709            mov         v8.16b, v9.16b
710            mov         v9.16b, v10.16b
711            mov         v10.16b, v11.16b
712.endm/*}}}*/
713
714.macro hconv4_20/*{{{*/
            /* hconv4_20 -- one step of the horizontal convolution, unrolled
             * for radii up to 20.  The convolve4_<r> wrappers hand the
             * matching hconv4_<r> macro to mainloop as its `core`.
             *
             * v14 and v15 are four-lane 32-bit accumulators, seeded from the
             * low and high halves of v28 multiplied by v0.h[0].  The tap chain
             * below then adds, for each distance k = r .. 1, the samples k
             * positions either side of each seed, multiplied by coefficient
             * lane k (v0.h[1] .. v2.h[4]).  A position here is four 16-bit
             * lanes, i.e. half a register.
             *
             * Input:
             *      x5 -- r, used as the index into the jump table
             *      v0-v2 -- coefficients
             *      v18-v31, v4-v11 -- window of 16-bit samples, in the order
             *                         of the shift chain at the end
             * Output:
             *      v15.8b -- eight narrowed result bytes
             * Modifies:
             *      x12, x16, v14, v15, and window registers v18-v31, v4-v10
             *      (each takes the value of its successor; v11 is only read)
             */
715            umull       v14.4s, v28.4h, v0.h[0]
716            umull2      v15.4s, v28.8h, v0.h[0]
717
            /* Computed branch: the table at 100 holds signed 16-bit offsets
             * relative to 100 itself, indexed by r.  Entry 0 (-4) resolves to
             * the `br` immediately before the table.
             * NOTE(review): that makes r == 0 branch to itself; presumably
             * this macro is never reached with r == 0 -- confirm.
             */
718            adr         x16, 100f
719            ldrsh       x12, [x16, x5, LSL #1]
720            add         x12, x12, x16
721            br          x12
722   100:     .hword -4
723            .hword 101f-100b
724            .hword 102f-100b
725            .hword 103f-100b
726            .hword 104f-100b
727            .hword 105f-100b
728            .hword 106f-100b
729            .hword 107f-100b
730            .hword 108f-100b
731            .hword 109f-100b
732            .hword 110f-100b
733            .hword 111f-100b
734            .hword 112f-100b
735            .hword 113f-100b
736            .hword 114f-100b
737            .hword 115f-100b
738            .hword 116f-100b
739            .hword 117f-100b
740            .hword 118f-100b
741            .hword 119f-100b
742            .hword 120f-100b
            /* Realign before the instruction stream resumes after the
             * halfword table.
             */
743            .align 4
744
            /* Unrolled taps, widest first.  Each label falls through to the
             * next, so entering at 1kk accumulates taps kk down to 1.  Every
             * group of four instructions adds the sample kk positions before
             * and kk positions after each of the two seed positions.
             */
745    120:    umlal       v14.4s, v18.4h, v2.h[4]
746            umlal2      v15.4s, v18.8h, v2.h[4]
747            umlal       v14.4s, v10.4h, v2.h[4]
748            umlal2      v15.4s, v10.8h, v2.h[4]
749    119:    umlal2      v14.4s, v18.8h, v2.h[3]
750            umlal       v15.4s, v19.4h, v2.h[3]
751            umlal2      v14.4s, v9.8h,  v2.h[3]
752            umlal       v15.4s, v10.4h, v2.h[3]
753    118:    umlal       v14.4s, v19.4h, v2.h[2]
754            umlal2      v15.4s, v19.8h, v2.h[2]
755            umlal       v14.4s, v9.4h,  v2.h[2]
756            umlal2      v15.4s, v9.8h,  v2.h[2]
757    117:    umlal2      v14.4s, v19.8h, v2.h[1]
758            umlal       v15.4s, v20.4h, v2.h[1]
759            umlal2      v14.4s, v8.8h,  v2.h[1]
760            umlal       v15.4s, v9.4h,  v2.h[1]
761    116:    umlal       v14.4s, v20.4h, v2.h[0]
762            umlal2      v15.4s, v20.8h, v2.h[0]
763            umlal       v14.4s, v8.4h,  v2.h[0]
764            umlal2      v15.4s, v8.8h,  v2.h[0]
765    115:    umlal2      v14.4s, v20.8h, v1.h[7]
766            umlal       v15.4s, v21.4h, v1.h[7]
767            umlal2      v14.4s, v7.8h,  v1.h[7]
768            umlal       v15.4s, v8.4h,  v1.h[7]
769    114:    umlal       v14.4s, v21.4h, v1.h[6]
770            umlal2      v15.4s, v21.8h, v1.h[6]
771            umlal       v14.4s, v7.4h,  v1.h[6]
772            umlal2      v15.4s, v7.8h,  v1.h[6]
773    113:    umlal2      v14.4s, v21.8h, v1.h[5]
774            umlal       v15.4s, v22.4h, v1.h[5]
775            umlal2      v14.4s, v6.8h,  v1.h[5]
776            umlal       v15.4s, v7.4h,  v1.h[5]
777    112:    umlal       v14.4s, v22.4h, v1.h[4]
778            umlal2      v15.4s, v22.8h, v1.h[4]
779            umlal       v14.4s, v6.4h,  v1.h[4]
780            umlal2      v15.4s, v6.8h,  v1.h[4]
781    111:    umlal2      v14.4s, v22.8h, v1.h[3]
782            umlal       v15.4s, v23.4h, v1.h[3]
783            umlal2      v14.4s, v5.8h,  v1.h[3]
784            umlal       v15.4s, v6.4h,  v1.h[3]
785    110:    umlal       v14.4s, v23.4h, v1.h[2]
786            umlal2      v15.4s, v23.8h, v1.h[2]
787            umlal       v14.4s, v5.4h,  v1.h[2]
788            umlal2      v15.4s, v5.8h,  v1.h[2]
789    109:    umlal2      v14.4s, v23.8h, v1.h[1]
790            umlal       v15.4s, v24.4h, v1.h[1]
791            umlal2      v14.4s, v4.8h,  v1.h[1]
792            umlal       v15.4s, v5.4h,  v1.h[1]
793    108:    umlal       v14.4s, v24.4h, v1.h[0]
794            umlal2      v15.4s, v24.8h, v1.h[0]
795            umlal       v14.4s, v4.4h,  v1.h[0]
796            umlal2      v15.4s, v4.8h,  v1.h[0]
797    107:    umlal2      v14.4s, v24.8h, v0.h[7]
798            umlal       v15.4s, v25.4h, v0.h[7]
799            umlal2      v14.4s, v31.8h, v0.h[7]
800            umlal       v15.4s, v4.4h,  v0.h[7]
801    106:    umlal       v14.4s, v25.4h, v0.h[6]
802            umlal2      v15.4s, v25.8h, v0.h[6]
803            umlal       v14.4s, v31.4h, v0.h[6]
804            umlal2      v15.4s, v31.8h, v0.h[6]
805    105:    umlal2      v14.4s, v25.8h, v0.h[5]
806            umlal       v15.4s, v26.4h, v0.h[5]
807            umlal2      v14.4s, v30.8h, v0.h[5]
808            umlal       v15.4s, v31.4h, v0.h[5]
809    104:    umlal       v14.4s, v26.4h, v0.h[4]
810            umlal2      v15.4s, v26.8h, v0.h[4]
811            umlal       v14.4s, v30.4h, v0.h[4]
812            umlal2      v15.4s, v30.8h, v0.h[4]
813    103:    umlal2      v14.4s, v26.8h, v0.h[3]
814            umlal       v15.4s, v27.4h, v0.h[3]
815            umlal2      v14.4s, v29.8h, v0.h[3]
816            umlal       v15.4s, v30.4h, v0.h[3]
817    102:    umlal       v14.4s, v27.4h, v0.h[2]
818            umlal2      v15.4s, v27.8h, v0.h[2]
819            umlal       v14.4s, v29.4h, v0.h[2]
820            umlal2      v15.4s, v29.8h, v0.h[2]
821    101:    umlal2      v14.4s, v27.8h, v0.h[1]
822            umlal       v15.4s, v28.4h, v0.h[1]
823            umlal2      v14.4s, v28.8h, v0.h[1]
824            umlal       v15.4s, v29.4h, v0.h[1]
825
            /* Narrow with rounding and unsigned saturation: the 32-bit sums
             * are shifted right by 16 into v14.8h, then by FRACTION_BITS
             * into the eight bytes of v15.8b.
             */
826            uqrshrn     v14.4h, v14.4s, #16
827            uqrshrn2    v14.8h, v15.4s, #16
828            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
829
            /* Slide the window along by one register (eight 16-bit lanes);
             * v10 takes the value currently in v11.
             */
830            mov         v18.16b, v19.16b
831            mov         v19.16b, v20.16b
832            mov         v20.16b, v21.16b
833            mov         v21.16b, v22.16b
834            mov         v22.16b, v23.16b
835            mov         v23.16b, v24.16b
836            mov         v24.16b, v25.16b
837            mov         v25.16b, v26.16b
838            mov         v26.16b, v27.16b
839            mov         v27.16b, v28.16b
840            mov         v28.16b, v29.16b
841            mov         v29.16b, v30.16b
842            mov         v30.16b, v31.16b
843            mov         v31.16b, v4.16b
844            mov         v4.16b, v5.16b
845            mov         v5.16b, v6.16b
846            mov         v6.16b, v7.16b
847            mov         v7.16b, v8.16b
848            mov         v8.16b, v9.16b
849            mov         v9.16b, v10.16b
850            mov         v10.16b, v11.16b
851.endm/*}}}*/
852
853.macro hconv4_25/*{{{*/
            /* hconv4_25 -- one step of the horizontal convolution, unrolled
             * for radii up to 25.  The convolve4_<r> wrappers hand the
             * matching hconv4_<r> macro to mainloop as its `core`.
             *
             * v14 and v15 are four-lane 32-bit accumulators, seeded from the
             * high half of v25 and the low half of v26 multiplied by v0.h[0].
             * The tap chain then adds, for each distance k = r .. 1, the
             * samples k positions either side of each seed, multiplied by
             * coefficient lane k (v0.h[1] .. v3.h[1]).  A position here is
             * four 16-bit lanes, i.e. half a register.
             *
             * Input:
             *      x5 -- r, used as the index into the jump table
             *      x9 -- pointer into the memory buffer holding the part of
             *            the window that does not fit in registers
             *      v0-v3 -- coefficients
             *      v17-v31, v4-v11 -- register part of the window, in the
             *                         order of the shift chain at the end
             * Output:
             *      v15.8b -- eight narrowed result bytes
             * Modifies:
             *      x9 (advanced by 16 with wrap), x12, x16, v12, v13, v14,
             *      v15, the 16 bytes of memory at the incoming x9, and window
             *      registers v17-v31, v4-v10
             */
854            umull2      v14.4s, v25.8h, v0.h[0]
855            umull       v15.4s, v26.4h, v0.h[0]
856
            /* Computed branch: the table at 100 holds signed 16-bit offsets
             * relative to 100 itself, indexed by r.  Entry 0 (-4) resolves to
             * the `br` immediately before the table.
             * NOTE(review): that makes r == 0 branch to itself; presumably
             * this macro is never reached with r == 0 -- confirm.
             */
857            adr         x16, 100f
858            ldrsh       x12, [x16, x5, LSL #1]
859            add         x12, x12, x16
860            br          x12
861   100:     .hword -4
862            .hword 101f-100b
863            .hword 102f-100b
864            .hword 103f-100b
865            .hword 104f-100b
866            .hword 105f-100b
867            .hword 106f-100b
868            .hword 107f-100b
869            .hword 108f-100b
870            .hword 109f-100b
871            .hword 110f-100b
872            .hword 111f-100b
873            .hword 112f-100b
874            .hword 113f-100b
875            .hword 114f-100b
876            .hword 115f-100b
877            .hword 116f-100b
878            .hword 117f-100b
879            .hword 118f-100b
880            .hword 119f-100b
881            .hword 120f-100b
882            .hword 121f-100b
883            .hword 122f-100b
884            .hword 123f-100b
885            .hword 124f-100b
886            .hword 125f-100b
            /* Realign before the instruction stream resumes after the
             * halfword table.
             */
887            .align 4
888
            /* Unrolled taps, widest first; each label falls through to the
             * next, so entering at 1kk accumulates taps kk down to 1.
             *
             * Taps 18 .. 25 take their samples on the v17 side of the window
             * from memory at x9 instead of from registers.  An offset is added
             * to x9 and bit 6 of the result is cleared, so addresses wrap
             * within 64 bytes provided x9 lies in the lower half of a
             * 128-byte-aligned block (convolve4_<r> aligns the buffer that
             * way).  Where the two halves start at an odd multiple of 8 they
             * are loaded separately into v12 and v13 with the wrap re-applied
             * in between, presumably because the second half may lie past
             * the wrap point.  Tap 18 takes one half from memory and the
             * other from v17.
             */
889    125:    ld1         {v12.8h}, [x9]
890            umlal       v14.4s, v12.4h, v3.h[1]
891            umlal2      v15.4s, v12.8h, v3.h[1]
892            umlal       v14.4s, v10.4h, v3.h[1]
893            umlal2      v15.4s, v10.8h, v3.h[1]
894    124:    add         x12, x9, #0x08
895            bic         x12, x12, #0x40
896            ld1         {v12.4h}, [x12], #8
897            bic         x12, x12, #0x40
898            ld1         {v13.4h}, [x12]
899            umlal       v14.4s, v12.4h, v3.h[0]
900            umlal       v15.4s, v13.4h, v3.h[0]
901            umlal2      v14.4s, v9.8h,  v3.h[0]
902            umlal       v15.4s, v10.4h, v3.h[0]
903    123:    add         x12, x9, #0x10
904            bic         x12, x12, #0x40
905            ld1         {v12.8h}, [x12]
906            umlal       v14.4s, v12.4h, v2.h[7]
907            umlal2      v15.4s, v12.8h, v2.h[7]
908            umlal       v14.4s, v9.4h,  v2.h[7]
909            umlal2      v15.4s, v9.8h,  v2.h[7]
910    122:    add         x12, x9, #0x18
911            bic         x12, x12, #0x40
912            ld1         {v12.4h}, [x12], #8
913            bic         x12, x12, #0x40
914            ld1         {v13.4h}, [x12]
915            umlal       v14.4s, v12.4h, v2.h[6]
916            umlal       v15.4s, v13.4h, v2.h[6]
917            umlal2      v14.4s, v8.8h,  v2.h[6]
918            umlal       v15.4s, v9.4h,  v2.h[6]
919    121:    add         x12, x9, #0x20
920            bic         x12, x12, #0x40
921            ld1         {v12.8h}, [x12]
922            umlal       v14.4s, v12.4h, v2.h[5]
923            umlal2      v15.4s, v12.8h, v2.h[5]
924            umlal       v14.4s, v8.4h,  v2.h[5]
925            umlal2      v15.4s, v8.8h,  v2.h[5]
926    120:    add         x12, x9, #0x28
927            bic         x12, x12, #0x40
928            ld1         {v12.4h}, [x12], #8
929            bic         x12, x12, #0x40
930            ld1         {v13.4h}, [x12]
931            umlal       v14.4s, v12.4h, v2.h[4]
932            umlal       v15.4s, v13.4h, v2.h[4]
933            umlal2      v14.4s, v7.8h,  v2.h[4]
934            umlal       v15.4s, v8.4h,  v2.h[4]
935    119:    add         x12, x9, #0x30
936            bic         x12, x12, #0x40
937            ld1         {v12.8h}, [x12]
938            umlal       v14.4s, v12.4h, v2.h[3]
939            umlal2      v15.4s, v12.8h, v2.h[3]
940            umlal       v14.4s, v7.4h,  v2.h[3]
941            umlal2      v15.4s, v7.8h,  v2.h[3]
942    118:    add         x12, x9, #0x38
943            bic         x12, x12, #0x40
944            ld1         {v12.4h}, [x12]
945            umlal       v14.4s, v12.4h, v2.h[2]
946            umlal       v15.4s, v17.4h, v2.h[2]
947            umlal2      v14.4s, v6.8h,  v2.h[2]
948            umlal       v15.4s, v7.4h,  v2.h[2]
949    117:    umlal       v14.4s, v17.4h, v2.h[1]
950            umlal2      v15.4s, v17.8h, v2.h[1]
951            umlal       v14.4s, v6.4h,  v2.h[1]
952            umlal2      v15.4s, v6.8h,  v2.h[1]
953    116:    umlal2      v14.4s, v17.8h, v2.h[0]
954            umlal       v15.4s, v18.4h, v2.h[0]
955            umlal2      v14.4s, v5.8h,  v2.h[0]
956            umlal       v15.4s, v6.4h,  v2.h[0]
957    115:    umlal       v14.4s, v18.4h, v1.h[7]
958            umlal2      v15.4s, v18.8h, v1.h[7]
959            umlal       v14.4s, v5.4h,  v1.h[7]
960            umlal2      v15.4s, v5.8h,  v1.h[7]
961    114:    umlal2      v14.4s, v18.8h, v1.h[6]
962            umlal       v15.4s, v19.4h, v1.h[6]
963            umlal2      v14.4s, v4.8h,  v1.h[6]
964            umlal       v15.4s, v5.4h,  v1.h[6]
965    113:    umlal       v14.4s, v19.4h, v1.h[5]
966            umlal2      v15.4s, v19.8h, v1.h[5]
967            umlal       v14.4s, v4.4h,  v1.h[5]
968            umlal2      v15.4s, v4.8h,  v1.h[5]
969    112:    umlal2      v14.4s, v19.8h, v1.h[4]
970            umlal       v15.4s, v20.4h, v1.h[4]
971            umlal2      v14.4s, v31.8h, v1.h[4]
972            umlal       v15.4s, v4.4h,  v1.h[4]
973    111:    umlal       v14.4s, v20.4h, v1.h[3]
974            umlal2      v15.4s, v20.8h, v1.h[3]
975            umlal       v14.4s, v31.4h, v1.h[3]
976            umlal2      v15.4s, v31.8h, v1.h[3]
977    110:    umlal2      v14.4s, v20.8h, v1.h[2]
978            umlal       v15.4s, v21.4h, v1.h[2]
979            umlal2      v14.4s, v30.8h, v1.h[2]
980            umlal       v15.4s, v31.4h, v1.h[2]
981    109:    umlal       v14.4s, v21.4h, v1.h[1]
982            umlal2      v15.4s, v21.8h, v1.h[1]
983            umlal       v14.4s, v30.4h, v1.h[1]
984            umlal2      v15.4s, v30.8h, v1.h[1]
985    108:    umlal2      v14.4s, v21.8h, v1.h[0]
986            umlal       v15.4s, v22.4h, v1.h[0]
987            umlal2      v14.4s, v29.8h, v1.h[0]
988            umlal       v15.4s, v30.4h, v1.h[0]
989    107:    umlal       v14.4s, v22.4h, v0.h[7]
990            umlal2      v15.4s, v22.8h, v0.h[7]
991            umlal       v14.4s, v29.4h, v0.h[7]
992            umlal2      v15.4s, v29.8h, v0.h[7]
993    106:    umlal2      v14.4s, v22.8h, v0.h[6]
994            umlal       v15.4s, v23.4h, v0.h[6]
995            umlal2      v14.4s, v28.8h, v0.h[6]
996            umlal       v15.4s, v29.4h, v0.h[6]
997    105:    umlal       v14.4s, v23.4h, v0.h[5]
998            umlal2      v15.4s, v23.8h, v0.h[5]
999            umlal       v14.4s, v28.4h, v0.h[5]
1000            umlal2      v15.4s, v28.8h, v0.h[5]
1001    104:    umlal2      v14.4s, v23.8h, v0.h[4]
1002            umlal       v15.4s, v24.4h, v0.h[4]
1003            umlal2      v14.4s, v27.8h, v0.h[4]
1004            umlal       v15.4s, v28.4h, v0.h[4]
1005    103:    umlal       v14.4s, v24.4h, v0.h[3]
1006            umlal2      v15.4s, v24.8h, v0.h[3]
1007            umlal       v14.4s, v27.4h, v0.h[3]
1008            umlal2      v15.4s, v27.8h, v0.h[3]
1009    102:    umlal2      v14.4s, v24.8h, v0.h[2]
1010            umlal       v15.4s, v25.4h, v0.h[2]
1011            umlal2      v14.4s, v26.8h, v0.h[2]
1012            umlal       v15.4s, v27.4h, v0.h[2]
1013    101:    umlal       v14.4s, v25.4h, v0.h[1]
1014            umlal2      v15.4s, v25.8h, v0.h[1]
1015            umlal       v14.4s, v26.4h, v0.h[1]
1016            umlal2      v15.4s, v26.8h, v0.h[1]
1017
            /* Narrow with rounding and unsigned saturation: the 32-bit sums
             * are shifted right by 16 into v14.8h, then by FRACTION_BITS
             * into the eight bytes of v15.8b.
             */
1018            uqrshrn     v14.4h, v14.4s, #16
1019            uqrshrn2    v14.8h, v15.4s, #16
1020            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
1021
            /* Write v17 out to the buffer at x9 and advance x9 by 16 with the
             * same bit-6 wrap, then slide the register window along by one
             * register; v10 takes the value currently in v11.
             */
1022            st1         {v17.16b}, [x9], #16
1023            bic         x9, x9, #0x40
1024            mov         v17.16b, v18.16b
1025            mov         v18.16b, v19.16b
1026            mov         v19.16b, v20.16b
1027            mov         v20.16b, v21.16b
1028            mov         v21.16b, v22.16b
1029            mov         v22.16b, v23.16b
1030            mov         v23.16b, v24.16b
1031            mov         v24.16b, v25.16b
1032            mov         v25.16b, v26.16b
1033            mov         v26.16b, v27.16b
1034            mov         v27.16b, v28.16b
1035            mov         v28.16b, v29.16b
1036            mov         v29.16b, v30.16b
1037            mov         v30.16b, v31.16b
1038            mov         v31.16b, v4.16b
1039            mov         v4.16b, v5.16b
1040            mov         v5.16b, v6.16b
1041            mov         v6.16b, v7.16b
1042            mov         v7.16b, v8.16b
1043            mov         v8.16b, v9.16b
1044            mov         v9.16b, v10.16b
1045            mov         v10.16b, v11.16b
1046.endm/*}}}*/
1047
1048/* Dedicated function wrapper for the fetch macro, for the cases where
1049 * performance isn't that important, to keep code size down.
1050 */
1051PRIVATE(fetch_generic_asm)
            /* Out-of-line expansion of the `fetch` macro with its default
             * arguments, reached with `bl` from prefetch, prefetch_one and
             * mainloop.  x10 and x11 are saved and restored around the
             * expansion (prefetch keeps its fill start/stop indices there;
             * presumably fetch uses them as scratch -- confirm against the
             * macro).  Those callers read the fetched data from v10/v11.
             */
1052            stp         x10, x11, [sp, #-16]!
1053            fetch
1054            ldp         x10, x11, [sp], #16
1055            ret
1056END(fetch_generic_asm)
1057
1058/* Given values in v10 and v11, and an index in x11, sweep the (x11&15)th value
1059 * across to fill the rest of the register pair.  Used for filling the right
1060 * hand edge of the window when starting too close to the right hand edge of
1061 * the image.
1062 * Also returns a dup-ed copy of the last element in v12 for the tail-fill
1063 * case (this happens incidentally in the common path, but must be done
1064 * deliberately in the fast-out path).
1065 */
PRIVATE(prefetch_clampright1)
            /* Right-edge clamp for 16-bit elements.
             *
             * Input:
             *      x11 -- stop index; only (x11 & 15) is used
             *      v10, v11 -- sixteen 16-bit elements
             * Output:
             *      v12 -- every lane holds the replicated element
             *      v10, v11 -- when (x11 & 15) != 0, the element at index
             *                  (x11 & 15) - 1 is copied over all later lanes;
             *                  otherwise unchanged
             * Modifies:
             *      x12, flags, 64 bytes of scratch below sp
             */
            ands        x12, x11, #0xf
            b.ne        .Lclampright1_fill

            /* Nothing to overwrite: just hand back the last element. */
            dup         v12.8h, v11.h[7]
            ret

.Lclampright1_fill:
            /* Spill the pair, broadcast the chosen halfword over the 32
             * bytes starting at its own slot, then reload the pair.
             */
            sub         sp, sp, #64
            sub         x12, x12, #1
            st1         {v10.8h, v11.8h}, [sp]
            add         x12, sp, x12, LSL #1
            ld1r        {v12.8h}, [x12]
            st1         {v12.8h}, [x12], #16
            st1         {v12.8h}, [x12]
            ld1         {v10.8h, v11.8h}, [sp]
            add         sp, sp, #64
            ret
END(prefetch_clampright1)
1082
PRIVATE(prefetch_clampright4)
            /* Right-edge clamp for groups of four 16-bit elements (64 bits).
             *
             * Input:
             *      x11 -- stop index; only (x11 & 15) is used
             *      v10, v11 -- sixteen 16-bit elements
             * Output:
             *      v12 -- both 64-bit halves hold the replicated group
             *      v10, v11 -- when (x11 & 15) != 0, the 64-bit group starting
             *                  at index (x11 & 15) - 4 is copied over all
             *                  later lanes; otherwise unchanged
             * Modifies:
             *      x12, flags, 64 bytes of scratch below sp
             */
            ands        x12, x11, #0xf
            b.ne        .Lclampright4_fill

            /* Nothing to overwrite: just hand back the last group. */
            dup         v12.2d, v11.d[1]
            ret

.Lclampright4_fill:
            /* Spill the pair, broadcast the chosen 64-bit group over the 32
             * bytes starting at its own slot, then reload the pair.
             */
            sub         sp, sp, #64
            sub         x12, x12, #4
            st1         {v10.8h, v11.8h}, [sp]
            add         x12, sp, x12, LSL #1
            ld1r        {v12.2d}, [x12]
            st1         {v12.8h}, [x12], #16
            st1         {v12.8h}, [x12]
            ld1         {v10.8h, v11.8h}, [sp]
            add         sp, sp, #64
            ret
END(prefetch_clampright4)
1099
1100
1101/* Helpers for prefetch, below.
1102 */
.macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi
            /* Deliver one pair of source registers (qsa, qsb) according to
             * `store`:
             *      0 -- copy qsa into qa and qsb into qb
             *      1 -- clear bit 6 of x9, store qsa at x9 (post-increment
             *           16) and copy qsb into qb
             *      2 -- store both sources at x9 (post-increment 32 in total)
             * Any other value emits nothing.  qsb_hi is accepted but not
             * referenced.
             */
  .if \store == 0
            mov         \qa, \qsa
            mov         \qb, \qsb
  .elseif \store == 1
            bic         x9, x9, #0x40
            st1         {\qsa}, [x9], #16
            mov         \qb, \qsb
  .elseif \store == 2
    /* A single two-register st1 is only used when the sources differ. */
    .ifnc \qsa,\qsb
            st1         {\qsa,\qsb}, [x9], #32
    .else
            st1         {\qsa}, [x9], #16
            st1         {\qsb}, [x9], #16
    .endif
  .endif
.endm
1120
1121.macro prefetch_one  qa, qb, rem, c, store=0, step=1
            /* Emit the code that fills one 16-element slot of the window
             * during prefetch.  Successive expansions are chained through
             * the numeric labels 1-4: each `Nf` reference that is not
             * satisfied inside this expansion binds to label N of the next
             * one.
             *
             * Arguments:
             *      qa, qb -- destination registers, passed to prefetch_out
             *      rem    -- sets i = (need - 16) - rem, where `need` is the
             *                symbol set by the enclosing prefetch; the slot
             *                is compared against i + 16.  Nothing at all is
             *                emitted when i < 0.
             *      c      -- not referenced in this macro
             *      store  -- passed to prefetch_out
             *      step   -- suffix of the prefetch_clampright routine to call
             *
             * Paths:
             *      1 -- x10 >= i + 16: deliver the pad value v9, continue at
             *           label 1 of the next slot.
             *      2 -- x11 > i + 16: deliver v10/v11, fetch the next block,
             *           continue at label 2 of the next slot.
             *      3 -- otherwise: clamp v10/v11 with prefetch_clampright,
             *           deliver them, then branch to the instruction that
             *           follows label 4 of the next slot.
             *      4 -- holds a single branch; the instruction after it
             *           delivers the pad value v12.  `4f+4` relies on that
             *           branch being exactly four bytes long.
             * The expansion with rem == 0 ends the chain: it defines labels
             * 1-4 on a nop, so the pending forward references land there
             * and `4f+4` lands just after it.
             */
1122.set i, (need - 16) - \rem
1123.if i >= 0
11241:          cmp         x10, #i+16
1125            blo         2f
1126            prefetch_out \qa, \qb, \store, v9.16b, v9.16b, v9.d[1]
1127            b           1f
11282:          cmp         x11, #i+16
1129            bls         3f
1130            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
1131            bl          fetch_generic_asm
1132            b           2f
11333:          bl          prefetch_clampright\step
1134            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
11354:          b           4f+4
1136           //v12 contains pad word from prefetch_clampright call
1137            prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
1138  .if \rem > 0
1139            b           4f+4
1140  .else
11411:
11422:
11433:
11444:          nop
1145  .endif
1146.endif
1147.endm
1148
1149/* Fill the convolution window with context data.  The aim here is to load
1150 * exactly rlf + rrt columns, and in the main loop to read as many columns as
1151 * will be written.  This is complicated by the need to handle cases when the
1152 * input starts very close to the left or right (or both) edges of the image,
1153 * and where these do not fall on 16-byte boundaries.
1154 *
1155 * Input:
1156 *      x1 -- src
1157 *      x2 -- pitch
1158 *      x3 -- count
1159 *      x4 -- inlen
1160 *      x5 -- r
1161 *      x6 -- rup
1162 *      x7 -- rdn
1163 *      x8 -- rlf
1164 *      x9 -- buffer (if needed)
1165 *      x13 = -pitch
1166 *      x15 = top-row in
1167 *      x19 = bottom-row in
1168 * Output:
1169 *      x1 += rlf + min(count, rrt)
1170 * Modifies:
1171 *      x10 -- fill start index in the window
1172 *      x11 -- fill stop index in the window
1173 *      x12 -- scratch
1174 */
1175.macro prefetch step=1, max_r=25
            /* need = window size: 2 * max_r * step, rounded up to a multiple
             * of 16.  The symbol is also read by prefetch_one.
             */
1176.set need, ((\max_r + \max_r) * \step + 15) & ~15
            /* x10 = need - max_r * step - rlf * step: the fill start index.
             * Both arms load the same constant; they differ only in the
             * scaling applied to rlf (x8).
             */
1177  .if \step == 1
1178            mov         x10, #need - (\max_r * \step)
1179            sub         x10, x10, x8
1180  .else
1181            mov         x10, #need - (\max_r * \step)
1182            sub         x10, x10, x8, LSL #2
1183  .endif
            /* x11 = min(x10 + inlen, need): the fill stop index.  The csel
             * zeroes the excess when x10 + inlen is above need (unsigned).
             */
1184            add         x11, x10, x4
1185            subs        x11, x11, #need
1186            csel        x11, xzr, x11, hi
1187            add         x11, x11, #need
1188
            /* Fetch the first block into v10/v11 and keep a copy of its
             * first element, replicated across v9, as the pad value.
             */
1189            bl          fetch_generic_asm
1190  .if \step == 1
1191            dup         v9.8h, v10.h[0]
1192  .else
1193            dup         v9.2d, v10.d[0]
1194  .endif
            /* When x10 is not a multiple of 16, move the fetched data up by
             * (x10 & 15) lanes, filling from v9: v10/v11 are spilled with 32
             * bytes of v9 directly below them and reloaded from
             * 2 * (x10 & 15) bytes lower.  The three source pointers are
             * then moved back by (x10 & 15) and x10 is rounded down to a
             * multiple of 16.
             */
1195            ands        x12, x10, #15
1196            beq         2f
1197            sub         sp, sp, #32
1198            st1         {v10.8h,v11.8h}, [sp]
1199            sub         x12, sp, x12, LSL #1
1200            sub         sp, sp, #16
1201            st1         {v9.8h}, [sp]
1202            sub         sp, sp, #16
1203            st1         {v9.8h}, [sp]
1204            ld1         {v10.8h,v11.8h}, [x12]
1205            add         sp, sp, #64
1206            sub         x1, x1, x10
1207            sub         x15, x15, x10
1208            sub         x19, x19, x10
1209            bic         x10, x10, #15
1210            add         x1, x1, x10
1211            add         x15, x15, x10
1212            add         x19, x19, x10
12132:
            /* One prefetch_one per 16-element slot, from the far end of the
             * window inwards.  Slots that lie outside `need` emit no code.
             * With store=2 and store=1 the data goes to the buffer at x9
             * rather than to registers.
             */
1214  .if \step > 1
1215            /* it's only in the uchar2 and uchar4 cases where the register file
1216             * is insufficient (given MAX_R <= 25).
1217             */
1218            prefetch_one xx, xx, 192, c=\max_r, step=\step, store=2
1219            prefetch_one xx, xx, 176, c=\max_r, step=\step, store=2
1220            prefetch_one xx,      v17.16b, 160, c=\max_r, step=\step, store=1
1221            prefetch_one v18.16b, v19.16b, 144, c=\max_r, step=\step, store=0
1222            prefetch_one v20.16b, v21.16b, 128, c=\max_r, step=\step, store=0
1223            prefetch_one v22.16b, v23.16b, 112, c=\max_r, step=\step, store=0
1224            prefetch_one v24.16b, v25.16b,  96, c=\max_r, step=\step, store=0
1225            prefetch_one v26.16b, v27.16b,  80, c=\max_r, step=\step, store=0
1226            prefetch_one v28.16b, v29.16b,  64, c=\max_r, step=\step, store=0
1227  .endif
1228            prefetch_one v30.16b, v31.16b,  48, c=\max_r, step=\step, store=0
1229            prefetch_one v4.16b,  v5.16b,   32, c=\max_r, step=\step, store=0
1230            prefetch_one v6.16b,  v7.16b,   16, c=\max_r, step=\step, store=0
1231            prefetch_one v8.16b,  v9.16b,    0, c=\max_r, step=\step, store=0
1232
            /* inlen = max(inlen - (rlf * step + max_r * step), 0); the csel
             * zeroes x4 when the subtraction borrows.
             */
1233  .if \step == 1
1234            add         x10, x8, #\max_r * \step
1235  .else
1236            lsl         x10, x8, #2
1237            add         x10, x10, #\max_r * \step
1238  .endif
1239            subs        x4, x4, x10
1240            csel        x4, xzr, x4, lo
1241.endm
1242
1243/* The main loop.
1244 *
1245 * Input:
1246 *      x0 = dst
1247 *      x1 = src
1248 *      x2 = pitch
1249 *      x3 = count
1250 *      x4 = inlen
1251 *      x5 = r
1252 *      x6 = rup
1253 *      x7 = rdn
1254 *      x9 = buffer
1255 *      x13 = -pitch
1256 *      x15 = top-row in
1257 *      x19 = bottom-row in
1258 * Modifies
1259 *      x8 = fetch code pointer
1260 */
1261.macro mainloop core, step=1, max_r=25, labelc="", labelnc=""
            /* Emits the body of one convolve routine: the steady-state loop,
             * the handling of a final partial fetch, and the output tail.
             *
             * Arguments:
             *      core    -- horizontal convolution macro to expand; every
             *                 expansion here is followed by a store of v15.8b
             *      step    -- only selects which lanes are replicated when
             *                 padding (.8h / h[7] for 1, .2d / d[1] otherwise)
             *      max_r   -- forwarded to fetch
             *      labelc, labelnc -- labels forwarded to fetch; x8 is
             *                 computed relative to them below
             */

            /* x8 = labelnc - r * 40 (r * 32 + r * 8).
             * NOTE(review): the multiplier presumably matches the code size
             * per radius step in the non-clamping part of fetch -- confirm
             * against the fetch macro.
             */
1262            adrp        x8, \labelnc
1263            add         x8, x8, #:lo12:\labelnc
1264            sub         x8, x8, x5, LSL #5
1265            sub         x8, x8, x5, LSL #3
            /* Z ends up set only when r == rup and r == rdn: ccmp compares r
             * with rdn if the first compare was equal, and otherwise forces
             * NZCV to 0.
             */
1266            cmp         x5, x6
1267            ccmp        x5, x7, #0, eq
1268            beq         5f
1269
1270            /* if (r != rup || r != rdn) then the address-clamping table should
1271             * be used rather than the short-cut version.
1272             */
            /* x8 = labelc - r * 56 (r * 64 - r * 8). */
1273            adrp        x8, \labelc
1274            add         x8, x8, #:lo12:\labelc
1275            sub         x8, x8, x5, LSL #6
1276            add         x8, x8, x5, LSL #3
1277            b           5f
            /* Steady state, entered at label 5 below: runs for as long as x4
             * is at least 16.  Each pass writes 16 bytes and takes 16 off x3.
             */
1278            .align  4
12793:          fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
1280
1281            /* For each call to fetch two are made to \core.  It would be
1282             * preferable to have twice the work done in \core.
1283             */
1284            \core
1285            st1         {v15.8b}, [x0], #8
1286            \core
1287            st1         {v15.8b}, [x0], #8
1288
1289            sub         x3, x3, #16
12905:          subs        x4, x4, #16
1291            bhs         3b
            /* Undo the last subtraction.  If nothing at all is left, pad
             * v10/v11 with the last element of v9 and go to the output tail.
             */
1292            adds        x4, x4, #16
1293            bne         1f
1294  .if \step==1
1295            dup         v10.8h, v9.h[7]
1296            dup         v11.8h, v9.h[7]
1297  .else
1298            dup         v10.2d, v9.d[1]
1299            dup         v11.2d, v9.d[1]
1300  .endif
1301            b           4f
1302
            /* 1 .. 15 left: move the three source pointers by (x4 - 16),
             * presumably so that the final 16-wide fetch ends exactly at the
             * end of the input, then fetch once more.
             */
13031:          sub         x1, x1, #16
1304            sub         x15, x15, #16
1305            sub         x19, x19, #16
1306            add         x1, x1, x4
1307            add         x15, x15, x4
1308            add         x19, x19, x4
1309            bl          fetch_generic_asm
1310
            /* v12 = last element of the fetched data, used as padding. */
1311  .if \step==1
1312            dup         v12.8h, v11.h[7]
1313  .else
1314            dup         v12.2d, v11.d[1]
1315  .endif
            /* x4 = -x4, so its low four bits are (16 - remaining).  Shift
             * the v10:v11:v12 sequence down by that many 16-bit lanes, one
             * binary digit at a time (8, 4, 2, then 1 lane); this drops the
             * lanes that were re-read and pulls padding in from v12.
             */
1316            sub         x4, xzr, x4
1317            tbz         x4, #3, 1f
1318            mov         v10.16b, v11.16b
1319            mov         v11.16b, v12.16b
13201:          tbz         x4, #2, 1f
1321            ext         v10.16b, v10.16b, v11.16b, #4*2
1322            ext         v11.16b, v11.16b, v12.16b, #4*2
13231:          tbz         x4, #1, 1f
1324            ext         v10.16b, v10.16b, v11.16b, #2*2
1325            ext         v11.16b, v11.16b, v12.16b, #2*2
13261:          tbz         x4, #0, 4f
1327            ext         v10.16b, v10.16b, v11.16b, #1*2
1328            ext         v11.16b, v11.16b, v12.16b, #1*2
            /* Output tail: x3 is the number of bytes still to be written.
             * After each \core, v11 is refilled with its own last element so
             * that only padding enters the window from here on.
             */
13294:          cbz         x3, 5f
13303:          \core
1331  .if \step==1
1332            dup         v11.8h, v11.h[7]
1333  .else
1334            dup         v11.2d, v11.d[1]
1335  .endif
            /* Fewer than 8 bytes left: go to the partial store.  Otherwise
             * store all 8 and stop if that was exactly the remainder (st1
             * leaves the flags from subs intact).
             */
1336            subs        x3, x3, #8
1337            blo         4f
1338            st1         {v15.8b}, [x0], #8
1339            beq         5f
1340            b           3b
            /* x3 is now (remaining - 8); its low three bits still equal the
             * remaining byte count.  Store 4, 2 and 1 bytes as those bits
             * dictate, rotating v15 each time so the next bytes sit in lane 0.
             */
13414:          tbz         x3, #2, 1f
1342            st1         {v15.s}[0], [x0], #4
1343            ext         v15.8b, v15.8b, v15.8b, #4
13441:          tbz         x3, #1, 1f
1345            st1         {v15.h}[0], [x0], #2
1346            ext         v15.8b, v15.8b, v15.8b, #2
13471:          tbz         x3, #0, 5f
1348            st1         {v15.b}[0], [x0], #1
1349            ext         v15.8b, v15.8b, v15.8b, #1
13505:          nop
1351.endm
1352
1353.irep r, TUNED_LIST1, 25
            /* Instantiates one convolve1_<r> routine for every radius in
             * TUNED_LIST1 (not defined in this part of the file) and for 25.
             * Each routine is prefetch followed by mainloop with
             * core=hconv1_<r>, and expects the register set-up listed in the
             * comments above those two macros.  x29/x30 are pushed because
             * the expanded body contains `bl` instructions, which overwrite
             * x30; x30 is restored before the final ret.
             */
1354PRIVATE(convolve1_\r)
1355            stp         x29,x30, [sp, #-16]!
1356
1357            prefetch    step=1, max_r=\r
1358
1359            mainloop    core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
1360
1361            ldp         x29,x30, [sp], #16
1362            ret
1363END(convolve1_\r)
1364.endr
1365
1366.irep r, TUNED_LIST4, 25
            /* Instantiates one convolve4_<r> routine for every radius in
             * TUNED_LIST4 (not defined in this part of the file) and for 25.
             * Each routine is prefetch followed by mainloop with step=4 and
             * core=hconv4_<r>.
             *
             * Frame: x12 = sp - 0x40 and x9 = x12 rounded down to a multiple
             * of 128, so the 64 bytes at x9 all lie below the incoming sp.
             * sp is moved to x9 and the pair (x12, x30) is pushed beneath
             * the buffer.  On exit x12 is reloaded from that slot and the
             * incoming sp is recovered as x12 + 0x40.  x30 is saved because
             * the expanded body contains `bl` instructions.
             */
1367PRIVATE(convolve4_\r)
1368            sub         x12, sp, #0x040
1369            bic         x9, x12, #0x07f
1370            mov         sp, x9
1371            stp         x12,x30, [sp, #-16]!
1372
1373            /* x9 now points to a buffer on the stack whose address has the low
1374             * 7 bits clear.  This allows easy address calculation in the
1375             * wrap-around cases.
1376             */
1377
1378
1379            prefetch    step=4, max_r=\r
1380
1381            mainloop    core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
1382
1383            ldp         x12,x30, [sp]
1384            add         sp, x12, #0x40
1385            ret
1386END(convolve4_\r)
1387.endr
1388
1389/* void rsdIntrinsicBlurU1_K(
1390 *                  void *out,      // x0
1391 *                  void *in,       // x1
1392 *                  size_t w,       // x2
1393 *                  size_t h,       // x3
1394 *                  size_t p,       // x4
1395 *                  size_t x,       // x5
1396 *                  size_t y,       // x6
1397 *                  size_t count,   // x7
1398 *                  size_t r,       // [sp]
1399 *                  uint16_t *tab); // [sp,#8]
1400 */
ENTRY(rsdIntrinsicBlurU1_K)
            /* Entry point for one byte per element.  Converts the C
             * arguments into the register layout used by the convolve1_<r>
             * routines and dispatches on r:
             *      x1 = in + x - rlf           x2 = pitch
             *      x3 = count                  x4 = rlf + rrt + count
             *      x5 = r                      x6 = rup = min(y, r)
             *      x7 = rdn = min(h - y - 1, r)
             *      x8 = rlf = min(x, r)        x9 = rrt = min(w - x - count, r)
             *      x13 = -pitch
             *      x15 = x1 - pitch * rup      x19 = x1 + pitch * rdn
             *      v0-v3 = the first 32 halfwords of tab
             * All minimums are unsigned.
             */

            /* Frame: x19/x30 on top, then the low halves of v8-v15. */
            stp         x19, x30, [sp, #-16]!
            sub         sp, sp, #64
            add         x8, sp, #32
            st1         {v8.1d, v9.1d, v10.1d, v11.1d}, [sp]
            st1         {v12.1d, v13.1d, v14.1d, v15.1d}, [x8]

            /* Stack arguments sit 80 bytes above the new sp. */
            ldr         x12, [sp, #88]          // tab
            mov         x8, x5                  // x
            ldr         w5, [sp, #80]           // r

            sub         x10, x3, x6             // h - y
            sub         x9, x2, x8              // w - x
            mov         x3, x7                  // count
            mov         x2, x4                  // pitch
            sub         x9, x9, x3              // w - x - count
            sub         x7, x10, #1             // h - y - 1
            add         x1, x1, x8              // in + x

            /* Clamp the four margins to r. */
            cmp         x6, x5
            csel        x6, x6, x5, lo          // rup
            cmp         x7, x5
            csel        x7, x7, x5, lo          // rdn
            cmp         x8, x5
            csel        x8, x8, x5, lo          // rlf
            cmp         x9, x5
            csel        x9, x9, x5, lo          // rrt

            sub         x1, x1, x8              // start rlf elements early
            add         x4, x8, x9
            neg         x13, x2                 // -pitch
            add         x4, x4, x3              // inlen
            msub        x15, x2, x6, x1         // top-row in
            madd        x19, x2, x7, x1         // bottom-row in

            ld1         {v0.8h, v1.8h}, [x12], #32
            ld1         {v2.8h, v3.8h}, [x12], #32

            /* Tail-branch to the first routine whose radius covers r; it
             * returns through x30 to the epilogue below.
             */
            adr         x30, .Lblur_u1_return
  .irep r, TUNED_LIST1
            cmp         x5, #\r
            b.ls        convolve1_\r
  .endr
            b           convolve1_25

.Lblur_u1_return:
            ld1         {v8.1d, v9.1d, v10.1d, v11.1d}, [sp], #32
            ld1         {v12.1d, v13.1d, v14.1d, v15.1d}, [sp], #32
            ldp         x19, x30, [sp], #16
            ret
END(rsdIntrinsicBlurU1_K)
1453
1454/* void rsdIntrinsicBlurU4_K(
1455 *                  void *out,      // x0
1456 *                  void *in,       // x1
1457 *                  size_t w,       // x2
1458 *                  size_t h,       // x3
1459 *                  size_t p,       // x4
1460 *                  size_t x,       // x5
1461 *                  size_t y,       // x6
1462 *                  size_t count,   // x7
1463 *                  size_t r,       // [sp]
1464 *                  uint16_t *tab); // [sp,#8]
1465 */
ENTRY(rsdIntrinsicBlurU4_K)
            /* Entry point for four bytes per element.  Converts the C
             * arguments into the register layout used by the convolve4_<r>
             * routines and dispatches on r:
             *      x1 = in + 4 * (x - rlf)     x2 = pitch
             *      x3 = 4 * count              x4 = 4 * (rlf + rrt) + x3
             *      x5 = r                      x6 = rup = min(y, r)
             *      x7 = rdn = min(h - y - 1, r)
             *      x8 = rlf = min(x, r)        x9 = rrt = min(w - x - count, r)
             *      x13 = -pitch
             *      x15 = x1 - pitch * rup      x19 = x1 + pitch * rdn
             *      v0-v3 = the first 32 halfwords of tab
             * All minimums are unsigned.
             */

            /* Frame: x19/x30 on top, then the low halves of v8-v15. */
            stp         x19, x30, [sp, #-16]!
            sub         sp, sp, #64
            add         x8, sp, #32
            st1         {v8.1d, v9.1d, v10.1d, v11.1d}, [sp]
            st1         {v12.1d, v13.1d, v14.1d, v15.1d}, [x8]

            /* Stack arguments sit 80 bytes above the new sp. */
            ldr         x12, [sp, #88]          // tab
            mov         x8, x5                  // x
            ldr         w5, [sp, #80]           // r

            sub         x10, x3, x6             // h - y
            sub         x9, x2, x8              // w - x
            mov         x3, x7                  // count
            mov         x2, x4                  // pitch
            sub         x9, x9, x3              // w - x - count
            sub         x7, x10, #1             // h - y - 1
            add         x1, x1, x8, LSL #2      // in + 4 * x

            /* Clamp the four margins to r. */
            cmp         x6, x5
            csel        x6, x6, x5, lo          // rup
            cmp         x7, x5
            csel        x7, x7, x5, lo          // rdn
            cmp         x8, x5
            csel        x8, x8, x5, lo          // rlf
            cmp         x9, x5
            csel        x9, x9, x5, lo          // rrt

            lsl         x3, x3, #2              // count in bytes
            sub         x1, x1, x8, LSL #2      // start rlf elements early
            add         x4, x8, x9
            neg         x13, x2                 // -pitch
            add         x4, x3, x4, LSL #2      // inlen in bytes
            msub        x15, x2, x6, x1         // top-row in
            madd        x19, x2, x7, x1         // bottom-row in

            ld1         {v0.8h, v1.8h}, [x12], #32
            ld1         {v2.8h, v3.8h}, [x12], #32

            /* Tail-branch to the first routine whose radius covers r; it
             * returns through x30 to the epilogue below.
             */
            adr         x30, .Lblur_u4_return
  .irep r, TUNED_LIST4
            cmp         x5, #\r
            b.ls        convolve4_\r
  .endr
            b           convolve4_25

.Lblur_u4_return:
            ld1         {v8.1d, v9.1d, v10.1d, v11.1d}, [sp], #32
            ld1         {v12.1d, v13.1d, v14.1d, v15.1d}, [sp], #32
            ldp         x19, x30, [sp], #16
            ret
END(rsdIntrinsicBlurU4_K)
1519