rsCpuIntrinsics_advsimd_Blur.S revision ea76eb386a2d851d50be69ebeb7ae593f84a5be9
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Function-boundary helpers.
 *   ENTRY(f)   -- switch to .text, align, export f and open its body.
 *   PRIVATE(f) -- same, but without the .globl so f stays file-local.
 *   END(f)     -- record the size of f as the distance from its label.
 * On this target `.align 4` is a power-of-two request, i.e. 16 bytes.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;

/* Extra fractional bits carried in the 16-bit intermediate between the two
 * passes: the fetch macro narrows its 32-bit sums by 16 - FRACTION_BITS and
 * the hconv macros remove the remaining FRACTION_BITS in their last narrowing
 * shift.
 */
.set FRACTION_BITS, 7
/* Default for the max_r argument of the fetch macro. */
.set MAX_R, 25


/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`:
 *
 *      .set cc, 1
 *      ifcc    add x0, x0, #1      -- assembled
 *      .set cc, 0
 *      ifcc    add x0, x0, #1      -- dropped
 *
 * The whole remainder of the line is taken as a single vararg and emitted
 * unchanged when the assembler symbol `cc` is non-zero.  `cc` must have been
 * set before the first use.
 */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm

/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top or
 * bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch directly
 * into the relevant part of the code for an arbitrary convolution radius.  Two
 * variants of the loop are produced; one eliminates the clamping code for a
 * slight speed advantage.
 *
 * When reg is left at its default (x12) the macro computes the entry address
 * itself and always enters the clamping variant.  Where the macro is called
 * with any other reg, the specified register is taken to contain a
 * pre-calculated pointer into one of the two loops.
 *
 * Layout invariant: in the clamping variant every tap is exactly 14
 * instructions (56 bytes), which is what the `1f - r*64 + r*8` entry-address
 * arithmetic relies on.  The two `nop`s in the tap body are part of that count
 * (they sit where the disabled prfm hints were), so they must not be removed
 * without changing the arithmetic as well.
 *
 * Input:
 *      x1 -- src (centre row; 16 bytes are read and the pointer advanced)
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup  (upper taps with index above rup re-read the row at x15)
 *      x7 -- rdn  (lower taps with index above rdn re-read the row at x19)
 *      reg -- entry address, only when reg is not x12
 *      q0-q3 -- coefficient table (tap i uses halfword i, counted across q0..)
 *      x13 = -pitch
 *      x15 = top-row in     (outermost upper row; walked towards the centre)
 *      x19 = bottom-row in  (outermost lower row; walked towards the centre)
 * Output:
 *      x1 += 16
 *      x15 += 16
 *      x19 += 16
 *      q10,q11 -- 16 convolved columns (16-bit, FRACTION_BITS extra precision)
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      reg (x12 by default) = branch target
 *      q12-q15 = temporary sums
 *      q16 = temporary (widened sum of the upper and lower row)
 *      condition flags (clamping variant only)
 *      assembler symbol cc (left at 0 on exit)
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  /* cc selects whether the entry address is computed here (default reg). */
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            /* Centre row: load, widen to 16 bits, start the four 32-bit sums
             * with the centre coefficient.  The address arithmetic for the
             * computed branch is interleaved with the multiplies.
             */
            ld1         {v15.16b}, [x1], #16
            mov         x10, x15

            uxtl        v14.8h, v15.8b
//            prfm        PLDL1KEEP,[x1, #16] // TODO: confirm
            uxtl2       v15.8h, v15.16b
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  /* Emit the unrolled taps twice: first with row clamping (cc = 1), then
   * without (cc = 0).  Taps run from the outermost index down to 1.
   */
  .irp rowclamp, 1, 0
    .set cc, \rowclamp
    .align 4
    .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
        .set i, \dreg * 8 + \lane
        .if 0 < i && i <= \max_r
            /* Tap i: add the row i above to the row i below, then multiply
             * the pair by coefficient i.  In the clamping variant a pointer
             * is reset to its start row while i is still beyond rup / rdn.
             */
            ld1         {v10.16b}, [x10], x2
    ifcc    cmp         x6, #i
            ld1         {v11.16b}, [x11], x13
    ifcc    csel        x10, x15, x10, lo
            uaddl       v16.8h, v10.8b, v11.8b
    ifcc    cmp         x7, #i
            uaddl2      v11.8h, v10.16b, v11.16b
    ifcc    csel        x11, x19, x11, lo
            umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
            umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
//            prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
nop
            umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
//            prfm        PLDL1KEEP,[x11, #32] // TODO: confirm
nop
            umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
        .endif
    .endr ; .endr ; .endr
    .if \rowclamp == 1
        /* End of the clamping variant: skip over the non-clamping copy. */
        1: \labelc :
            b           2f
    .else
        2: \labelnc :
    .endif
  .endr

            /* Narrow the 32-bit sums to 16 bits, keeping FRACTION_BITS of
             * extra precision, and step the row pointers to the next 16
             * columns.
             */
            uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/

/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * more data that won't be used and it means that rotating the window involves
 * more mov operations.
 *
 * When the buffer gets too big the buffer at [x9] is used.
 *
 * Every variant dispatches through a table of 16-bit offsets placed right
 * after the branch: entry r holds the distance from the table to the code for
 * tap r.  Entry 0 is -4, which resolves to the branch instruction itself, so
 * the macros are presumably never run with r == 0 -- TODO confirm with caller.
 *
 * Input:
 *      x5 -- r (index into the offset table)
 *      q0-q3 -- coefficient table
 *      q4-q11 -- convolution window
 *      x9 -- pointer to additional convolution window data
 * Output:
 *      x9 -- updated buffer pointer (if used)
 *      low 64 bits of q15 (v15.8b) -- result to be stored
 * Modifies:
 *      x12 -- temp buffer pointer / branch target
 *      x16 -- address of the offset table
 *      q12-q13 -- temporaries for load and vext operations.
 *      q14-q15 -- intermediate sums
 */
/* NOTE(review): presumably the list of radius variants provided below for the
 * single-channel case; its consumer is not in this part of the file.
 */
#define TUNED_LIST1 8, 16
/* Horizontal pass, taps one 16-bit sample apart, radius up to 8.
 * Window: v8 = samples -8..-1, v9 = centre vector, v10 = samples +8..+15
 * (relative to v9.h[0]); v11 is only shifted in at the end.
 */
.macro hconv1_8/*{{{*/
            umull       v14.4s, v9.4h, v0.h[0]
            umull2      v15.4s, v9.8h, v0.h[0]

            /* Enter the unrolled taps at tap r. */
            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .align      4
    /* Tap 8 is a whole vector away on both sides, so no ext is needed. */
    108:    umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
            umlal       v14.4s, v10.4h, v1.h[0]
            umlal2      v15.4s, v10.8h, v1.h[0]
    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    /* Tap 4 is exactly half a vector away: use the high half of one
     * register and the low half of the next instead of an ext.
     */
    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
            umlal2      v14.4s, v9.8h, v0.h[4]
            umlal       v15.4s, v10.4h, v0.h[4]
    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* 32 -> 16 -> 8 bits with rounding and saturation; the last shift
             * removes the FRACTION_BITS left in by the vertical pass.
             */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by one vector (8 samples). */
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Horizontal pass, taps one 16-bit sample apart, radius up to 16.
 *
 * Window (relative to the first output sample, v8.h[0]):
 *      v6 = -16..-9, v7 = -8..-1, v8 = centre vector, v9 = +8..+15,
 *      v10 = +16..+23; v11 is only shifted in at the end.
 * Input:
 *      x5 -- r (index into the offset table; entry 0 resolves to the branch
 *            itself)
 *      q0-q2 -- coefficients (tap i uses halfword i, counted across q0..)
 * Output:
 *      low 64 bits of q15 (v15.8b) -- eight result bytes
 * Modifies:
 *      x12, x16 -- branch target and table address
 *      q12-q13 -- ext temporaries
 *      q14-q15 -- sums
 *      q6-q10 -- window, rotated by one vector
 */
.macro hconv1_16/*{{{*/
            umull       v14.4s, v8.4h, v0.h[0]
            umull2      v15.4s, v8.8h, v0.h[0]

            /* Enter the unrolled taps at tap r. */
            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .align 4
    /* Taps 16 and 8 are whole vectors away and taps 12 and 4 are half
     * vectors away, so those four need no ext.
     */
    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
            //ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v6.4h, v2.h[0]
            umlal2      v15.4s, v6.8h, v2.h[0]
            umlal       v14.4s, v10.4h, v2.h[0]
            umlal2      v15.4s, v10.8h, v2.h[0]
    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v6.8h, v1.h[4]
            umlal       v15.4s, v7.4h, v1.h[4]
            umlal2      v14.4s, v9.8h, v1.h[4]
            umlal       v15.4s, v10.4h, v1.h[4]
    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
            //ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v7.4h, v1.h[0]
            umlal2      v15.4s, v7.8h, v1.h[0]
            umlal       v14.4s, v9.4h, v1.h[0]
            umlal2      v15.4s, v9.8h, v1.h[0]
    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
            //ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal2      v14.4s, v7.8h, v0.h[4]
            umlal       v15.4s, v8.4h, v0.h[4]
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* 32 -> 16 -> 8 bits with rounding and saturation; the last shift
             * removes the FRACTION_BITS left in by the vertical pass.
             */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by one vector (8 samples). */
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Horizontal pass, taps one 16-bit sample apart, radius up to 25.
 *
 * The centre of the window is not vector-aligned here: the first output
 * sample is v6.h[7].  Relative to it the window holds
 *      v3.h[6..7] = -25..-24   (v3.h[0..1] are coefficients 24 and 25)
 *      v4 = -23..-16, v5 = -15..-8, v6 = -7..0, v7 = +1..+8,
 *      v8 = +9..+16,  v9 = +17..+24, v10.h[0] = +25
 * and v11 is only shifted in at the end.
 *
 * Input:
 *      x5 -- r (index into the offset table; entry 0 resolves to the branch
 *            itself)
 *      q0-q3 -- coefficients (tap i uses halfword i, counted across q0..)
 * Output:
 *      low 64 bits of q15 (v15.8b) -- eight result bytes
 * Modifies:
 *      x12, x16 -- branch target and table address
 *      q12-q13 -- ext temporaries
 *      q14-q15 -- sums
 *      upper half of q3, q4-q10 -- window, rotated by one vector
 */
.macro hconv1_25/*{{{*/
            ext         v12.16b, v6.16b, v7.16b, #7*2
            umull       v14.4s, v12.4h, v0.h[0]
            umull2      v15.4s, v12.8h, v0.h[0]

            /* Enter the unrolled taps at tap r. */
            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .hword 117f-100b
            .hword 118f-100b
            .hword 119f-100b
            .hword 120f-100b
            .hword 121f-100b
            .hword 122f-100b
            .hword 123f-100b
            .hword 124f-100b
            .hword 125f-100b
            .align 4
    125:    ext         v12.16b, v3.16b, v4.16b, #6*2
            ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v13.4h, v3.h[1]
            umlal2      v15.4s, v13.8h, v3.h[1]
    124:    ext         v12.16b, v3.16b, v4.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal2      v15.4s, v12.8h, v3.h[0]
            umlal       v14.4s, v13.4h, v3.h[0]
            umlal2      v15.4s, v13.8h, v3.h[0]
    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v13.4h, v2.h[7]
            umlal2      v15.4s, v13.8h, v2.h[7]
    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal2      v15.4s, v12.8h, v2.h[6]
            umlal       v14.4s, v13.4h, v2.h[6]
            umlal2      v15.4s, v13.8h, v2.h[6]
    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v13.4h, v2.h[5]
            umlal2      v15.4s, v13.8h, v2.h[5]
    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal2      v15.4s, v12.8h, v2.h[4]
            umlal       v14.4s, v13.4h, v2.h[4]
            umlal2      v15.4s, v13.8h, v2.h[4]
    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v13.4h, v2.h[3]
            umlal2      v15.4s, v13.8h, v2.h[3]
    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal2      v15.4s, v12.8h, v2.h[2]
            umlal       v14.4s, v13.4h, v2.h[2]
            umlal2      v15.4s, v13.8h, v2.h[2]
    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v12.4h, v2.h[1]
            umlal2      v15.4s, v12.8h, v2.h[1]
            umlal       v14.4s, v13.4h, v2.h[1]
            umlal2      v15.4s, v13.8h, v2.h[1]
    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v2.h[0]
            umlal2      v15.4s, v12.8h, v2.h[0]
            umlal       v14.4s, v13.4h, v2.h[0]
            umlal2      v15.4s, v13.8h, v2.h[0]
    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[4]
            umlal2      v15.4s, v12.8h, v1.h[4]
            umlal       v14.4s, v13.4h, v1.h[4]
            umlal2      v15.4s, v13.8h, v1.h[4]
    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #0*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
            ext         v13.16b, v7.16b, v8.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[0]
            umlal2      v15.4s, v12.8h, v1.h[0]
            umlal       v14.4s, v13.4h, v1.h[0]
            umlal2      v15.4s, v13.8h, v1.h[0]
    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
            ext         v13.16b, v7.16b, v8.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v7.16b, v8.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v7.16b, v8.16b, #4*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v7.16b, v8.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[4]
            umlal2      v15.4s, v12.8h, v0.h[4]
            umlal       v14.4s, v13.4h, v0.h[4]
            umlal2      v15.4s, v13.8h, v0.h[4]
    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
            ext         v13.16b, v7.16b, v8.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v7.16b, v8.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v7.16b, v8.16b, #0*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* 32 -> 16 -> 8 bits with rounding and saturation; the last shift
             * removes the FRACTION_BITS left in by the vertical pass.
             */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by one vector (8 samples).
             *
             * Only the upper half of v3 belongs to the window, and taps 124
             * and 125 read v3.h[6..7] as the two samples sitting directly
             * before v4.h[0].  After the shift those must be the last samples
             * of the old v4, so the upper half of v4 (d[1]) is copied; taking
             * d[0] would feed v4.h[2..3] into those taps instead.  The lower
             * half of v3 (coefficients 24 and 25) is left untouched.
             */
            ins         v3.d[1], v4.d[1]
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* NOTE(review): presumably the list of radius variants provided below for the
 * four-channel case; its consumer is not in this part of the file.
 */
#define TUNED_LIST4 6, 12
/* Horizontal pass, taps four 16-bit samples (half a vector) apart, radius up
 * to 6.  Each vector therefore holds two tap positions, and odd taps straddle
 * two registers: they are formed from the high half of one register and the
 * low half of the next, accumulated with umlal2 / umlal.
 *
 * Window, in tap positions relative to the first output position (low half of
 * v7):
 *      v4 = -6,-5   v5 = -4,-3   v6 = -2,-1   v7 = 0,+1
 *      v8 = +2,+3   v9 = +4,+5   v10 = +6,+7
 * and v11 is only shifted in at the end.
 *
 * Input:
 *      x5 -- r (index into the offset table; entry 0 resolves to the branch
 *            itself)
 *      q0 -- coefficients 0..6
 * Output:
 *      low 64 bits of q15 (v15.8b) -- eight result bytes
 * Modifies:
 *      x12, x16 -- branch target and table address
 *      q14-q15 -- sums
 *      q4-q10 -- window, rotated by one vector
 */
.macro hconv4_6/*{{{*/
            umull       v14.4s, v7.4h, v0.h[0]
            umull2      v15.4s, v7.8h, v0.h[0]

            /* Enter the unrolled taps at tap r. */
            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .align      4
    106:    umlal       v14.4s, v4.4h,  v0.h[6]
            umlal2      v15.4s, v4.8h,  v0.h[6]
            umlal       v14.4s, v10.4h, v0.h[6]
            umlal2      v15.4s, v10.8h, v0.h[6]
    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
            umlal       v15.4s, v5.4h, v0.h[5]
            umlal2      v14.4s, v9.8h, v0.h[5]
            umlal       v15.4s, v10.4h, v0.h[5]
    104:    umlal       v14.4s, v5.4h, v0.h[4]
            umlal2      v15.4s, v5.8h, v0.h[4]
            umlal       v14.4s, v9.4h, v0.h[4]
            umlal2      v15.4s, v9.8h, v0.h[4]
    103:    umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
            umlal2      v14.4s, v8.8h, v0.h[3]
            umlal       v15.4s, v9.4h, v0.h[3]
    102:    umlal       v14.4s, v6.4h, v0.h[2]
            umlal2      v15.4s, v6.8h, v0.h[2]
            umlal       v14.4s, v8.4h, v0.h[2]
            umlal2      v15.4s, v8.8h, v0.h[2]
    101:    umlal2      v14.4s, v6.8h, v0.h[1]
            umlal       v15.4s, v7.4h, v0.h[1]
            umlal2      v14.4s, v7.8h, v0.h[1]
            umlal       v15.4s, v8.4h, v0.h[1]

            /* 32 -> 16 -> 8 bits with rounding and saturation; the last shift
             * removes the FRACTION_BITS left in by the vertical pass.
             */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by one vector (two tap positions). */
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Horizontal pass, taps four 16-bit samples (half a vector) apart, radius up
 * to 12.
 *
 * The right-hand side of the window lives in registers: v4 holds tap
 * positions 0,+1 and each following register two more, up to v10 = +12,+13;
 * v11 is only shifted in at the end.  The left-hand side no longer fits, so
 * it is read back from the buffer at x9: tap k is loaded from 8*k bytes
 * behind x9.
 *
 * The buffer is addressed as a ring: every pointer is formed as
 * `x9 + (0x200 - 8*k)` followed by `bic #0x200`, and pointers that are
 * post-incremented are masked again before the next load.
 * NOTE(review): this only wraps correctly if the ring is 0x200 bytes long and
 * placed so that clearing address bit 9 maps back into it -- confirm against
 * the code that sets up x9.
 *
 * Input:
 *      x5 -- r (index into the offset table; entry 0 resolves to the branch
 *            itself)
 *      x9 -- ring buffer position (next slot to be written)
 *      q0-q1 -- coefficients 0..12
 * Output:
 *      x9 -- advanced by 16 bytes, wrapped
 *      low 64 bits of q15 (v15.8b) -- eight result bytes
 * Modifies:
 *      x12 -- branch target, then buffer read pointer
 *      x16 -- table address
 *      q12-q13 -- data loaded from the buffer
 *      q14-q15 -- sums
 *      q4-q10 -- window, rotated by one vector
 */
.macro hconv4_12/*{{{*/
            umull       v14.4s, v4.4h, v0.h[0]
            umull2      v15.4s, v4.8h, v0.h[0]

            /* Enter the unrolled taps at tap r. */
            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .align 4
    /* Even taps read one whole 16-byte slot; odd taps read two 8-byte
     * halves, re-masking the pointer in between in case the pair straddles
     * the wrap point.
     */
    112:    add         x12, x9, #0x1a0
            bic         x12, x12, #0x200
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v1.h[4]
            umlal2      v15.4s, v12.8h, v1.h[4]
            umlal       v14.4s, v10.4h, v1.h[4]
            umlal2      v15.4s, v10.8h, v1.h[4]
    111:    add         x12, x9, #0x1a8
            bic         x12, x12, #0x200
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x200
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal       v15.4s, v13.4h, v1.h[3]
            umlal2      v14.4s, v9.8h, v1.h[3]
            umlal       v15.4s, v10.4h, v1.h[3]
    110:    add         x12, x9, #0x1b0
            bic         x12, x12, #0x200
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v9.4h, v1.h[2]
            umlal2      v15.4s, v9.8h, v1.h[2]
    109:    add         x12, x9, #0x1b8
            bic         x12, x12, #0x200
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x200
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal       v15.4s, v13.4h, v1.h[1]
            umlal2      v14.4s, v8.8h, v1.h[1]
            umlal       v15.4s, v9.4h, v1.h[1]
    108:    add         x12, x9, #0x1c0
            bic         x12, x12, #0x200
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v1.h[0]
            umlal2      v15.4s, v12.8h, v1.h[0]
            umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
    107:    add         x12, x9, #0x1c8
            bic         x12, x12, #0x200
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x200
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal       v15.4s, v13.4h, v0.h[7]
            umlal2      v14.4s, v7.8h, v0.h[7]
            umlal       v15.4s, v8.4h, v0.h[7]
    106:    add         x12, x9, #0x1d0
            bic         x12, x12, #0x200
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v7.4h, v0.h[6]
            umlal2      v15.4s, v7.8h, v0.h[6]
    105:    add         x12, x9, #0x1d8
            bic         x12, x12, #0x200
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x200
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal       v15.4s, v13.4h, v0.h[5]
            umlal2      v14.4s, v6.8h, v0.h[5]
            umlal       v15.4s, v7.4h, v0.h[5]
    104:    add         x12, x9, #0x1e0
            bic         x12, x12, #0x200
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v0.h[4]
            umlal2      v15.4s, v12.8h, v0.h[4]
            umlal       v14.4s, v6.4h, v0.h[4]
            umlal2      v15.4s, v6.8h, v0.h[4]
    103:    add         x12, x9, #0x1e8
            bic         x12, x12, #0x200
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x200
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal       v15.4s, v13.4h, v0.h[3]
            umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
    102:    add         x12, x9, #0x1f0
            bic         x12, x12, #0x200
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v5.4h, v0.h[2]
            umlal2      v15.4s, v5.8h, v0.h[2]
    /* Tap 1: only the first left-hand position comes from the buffer; the
     * second one is the low half of v4, which is still in registers.
     */
    101:    add         x12, x9, #0x1f8
            bic         x12, x12, #0x200
            ld1         {v12.4h}, [x12]
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal       v15.4s, v4.4h,  v0.h[1]
            umlal2      v14.4s, v4.8h,  v0.h[1]
            umlal       v15.4s, v5.4h, v0.h[1]

            /* 32 -> 16 -> 8 bits with rounding and saturation; the last shift
             * removes the FRACTION_BITS left in by the vertical pass.
             */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Retire the oldest register into the ring (it becomes tap -1 and
             * -2 for the next call), wrap the write pointer, then slide the
             * register window along by one vector.
             */
            st1         {v4.16b}, [x9], #16
            bic         x9, x9, #0x200
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* hconv4_25: one horizontal convolution step of the 4-channel variant, with
 * up to 25 taps on either side of the centre.
 *
 * The centre pair of samples is read from x9 + 0x198 and multiplied by
 * v0.h[0] into the 32-bit accumulators v14/v15.  x5 then indexes the table of
 * halfword offsets at label 100 and the code branches into the unrolled tap
 * sequence below (labels 125 down to 101).  Execution falls through from the
 * selected label to 101, so that tap and every smaller one is accumulated.
 * Tap n uses coefficient lane n of the sequence v0.h[0..7], v1.h[0..7],
 * v2.h[0..7], v3.h[0..1].
 *
 * Every memory address is formed as x9 + offset and then has bit 9 (0x200)
 * cleared.  For taps 113-125 the second sample of each pair is taken from
 * registers v4-v10 instead of being loaded.
 *
 * The sums are narrowed with rounding and saturation (by 16 bits, then by
 * FRACTION_BITS) into v15.8b.  Finally v4 is stored at [x9], x9 is advanced by
 * 16 with bit 9 cleared again, and v5-v11 are moved down into v4-v10.
 *
 * Scratch: x12, x16, v12, v13, v16, v17.
 */
756.macro hconv4_25/*{{{*/
757            add         x12, x9, #0x198
758            bic         x12, x12, #0x200
759            ld1         {v12.4h}, [x12], #8
760            bic         x12, x12, #0x200
761            ld1         {v13.4h}, [x12]
762            umull       v14.4s, v12.4h, v0.h[0]
763            umull       v15.4s, v13.4h, v0.h[0]
764
            /* Computed branch: load the signed halfword at 100 + 2 * x5 and
             * add it to the address of label 100.
             * NOTE(review): entry 0 is -4, which is the address of the br
             * instruction itself -- presumably x5 == 0 never reaches this
             * macro; confirm against the callers.
             */
765            adr         x16, 100f
766            ldrsh       x12, [x16, x5, LSL #1]
767            add         x12, x12, x16
768            br          x12
769   100:     .hword -4
770            .hword 101f-100b
771            .hword 102f-100b
772            .hword 103f-100b
773            .hword 104f-100b
774            .hword 105f-100b
775            .hword 106f-100b
776            .hword 107f-100b
777            .hword 108f-100b
778            .hword 109f-100b
779            .hword 110f-100b
780            .hword 111f-100b
781            .hword 112f-100b
782            .hword 113f-100b
783            .hword 114f-100b
784            .hword 115f-100b
785            .hword 116f-100b
786            .hword 117f-100b
787            .hword 118f-100b
788            .hword 119f-100b
789            .hword 120f-100b
790            .hword 121f-100b
791            .hword 122f-100b
792            .hword 123f-100b
793            .hword 124f-100b
794            .hword 125f-100b
795            .align 4
            /* Taps 25..13: first sample(s) loaded from the buffer, second
             * sample(s) taken from the register window v4-v10.
             */
796    125:    add         x12, x9, #0x0d0
797            bic         x12, x12, #0x200
798            ld1         {v12.8h}, [x12]
799            umlal       v14.4s, v12.4h, v3.h[1]
800            umlal2      v15.4s, v12.8h, v3.h[1]
801            umlal       v14.4s, v10.4h, v3.h[1]
802            umlal2      v15.4s, v10.8h, v3.h[1]
803    124:    add         x12, x9, #0x0d8
804            bic         x12, x12, #0x200
805            ld1         {v12.4h}, [x12], #8
806            bic         x12, x12, #0x200
807            ld1         {v13.4h}, [x12]
808            umlal       v14.4s, v12.4h, v3.h[0]
809            umlal       v15.4s, v13.4h, v3.h[0]
810            umlal2      v14.4s, v9.8h, v3.h[0]
811            umlal       v15.4s, v10.4h, v3.h[0]
812    123:    add         x12, x9, #0x0e0
813            bic         x12, x12, #0x200
814            ld1         {v12.8h}, [x12]
815            umlal       v14.4s, v12.4h, v2.h[7]
816            umlal2      v15.4s, v12.8h, v2.h[7]
817            umlal       v14.4s, v9.4h, v2.h[7]
818            umlal2      v15.4s, v9.8h, v2.h[7]
819    122:    add         x12, x9, #0x0e8
820            bic         x12, x12, #0x200
821            ld1         {v12.4h}, [x12], #8
822            bic         x12, x12, #0x200
823            ld1         {v13.4h}, [x12]
824            umlal       v14.4s, v12.4h, v2.h[6]
825            umlal       v15.4s, v13.4h, v2.h[6]
826            umlal2      v14.4s, v8.8h, v2.h[6]
827            umlal       v15.4s, v9.4h, v2.h[6]
828    121:    add         x12, x9, #0x0f0
829            bic         x12, x12, #0x200
830            ld1         {v12.8h}, [x12]
831            umlal       v14.4s, v12.4h, v2.h[5]
832            umlal2      v15.4s, v12.8h, v2.h[5]
833            umlal       v14.4s, v8.4h, v2.h[5]
834            umlal2      v15.4s, v8.8h, v2.h[5]
835    120:    add         x12, x9, #0x0f8
836            bic         x12, x12, #0x200
837            ld1         {v12.4h}, [x12], #8
838            bic         x12, x12, #0x200
839            ld1         {v13.4h}, [x12]
840            umlal       v14.4s, v12.4h, v2.h[4]
841            umlal       v15.4s, v13.4h, v2.h[4]
842            umlal2      v14.4s, v7.8h, v2.h[4]
843            umlal       v15.4s, v8.4h, v2.h[4]
844    119:    add         x12, x9, #0x100
845            bic         x12, x12, #0x200
846            ld1         {v12.8h}, [x12]
847            umlal       v14.4s, v12.4h, v2.h[3]
848            umlal2      v15.4s, v12.8h, v2.h[3]
849            umlal       v14.4s, v7.4h, v2.h[3]
850            umlal2      v15.4s, v7.8h, v2.h[3]
851    118:    add         x12, x9, #0x108
852            bic         x12, x12, #0x200
853            ld1         {v12.4h}, [x12], #8
854            bic         x12, x12, #0x200
855            ld1         {v13.4h}, [x12]
856            umlal       v14.4s, v12.4h, v2.h[2]
857            umlal       v15.4s, v13.4h, v2.h[2]
858            umlal2      v14.4s, v6.8h, v2.h[2]
859            umlal       v15.4s, v7.4h, v2.h[2]
860    117:    add         x12, x9, #0x110
861            bic         x12, x12, #0x200
862            ld1         {v12.8h}, [x12]
863            umlal       v14.4s, v12.4h, v2.h[1]
864            umlal2      v15.4s, v12.8h, v2.h[1]
865            umlal       v14.4s, v6.4h, v2.h[1]
866            umlal2      v15.4s, v6.8h, v2.h[1]
867    116:    add         x12, x9, #0x118
868            bic         x12, x12, #0x200
869            ld1         {v12.4h}, [x12], #8
870            bic         x12, x12, #0x200
871            ld1         {v13.4h}, [x12]
872            umlal       v14.4s, v12.4h, v2.h[0]
873            umlal       v15.4s, v13.4h, v2.h[0]
874            umlal2      v14.4s, v5.8h, v2.h[0]
875            umlal       v15.4s, v6.4h, v2.h[0]
876    115:    add         x12, x9, #0x120
877            bic         x12, x12, #0x200
878            ld1         {v12.8h}, [x12]
879            umlal       v14.4s, v12.4h, v1.h[7]
880            umlal2      v15.4s, v12.8h, v1.h[7]
881            umlal       v14.4s, v5.4h, v1.h[7]
882            umlal2      v15.4s, v5.8h, v1.h[7]
883    114:    add         x12, x9, #0x128
884            bic         x12, x12, #0x200
885            ld1         {v12.4h}, [x12], #8
886            bic         x12, x12, #0x200
887            ld1         {v13.4h}, [x12]
888            umlal       v14.4s, v12.4h, v1.h[6]
889            umlal       v15.4s, v13.4h, v1.h[6]
890            umlal2      v14.4s, v4.8h,  v1.h[6]
891            umlal       v15.4s, v5.4h, v1.h[6]
892    113:    add         x12, x9, #0x130
893            bic         x12, x12, #0x200
894            ld1         {v12.8h}, [x12]
895            umlal       v14.4s, v12.4h, v1.h[5]
896            umlal2      v15.4s, v12.8h, v1.h[5]
897            umlal       v14.4s, v4.4h,  v1.h[5]
898            umlal2      v15.4s, v4.8h,  v1.h[5]
            /* Tap 12: the last tap that still uses v4 (low half); its other
             * three samples come from the buffer.
             */
899    112:    add         x12, x9, #0x138
900            bic         x12, x12, #0x200
901            ld1         {v12.4h}, [x12], #8
902            bic         x12, x12, #0x200
903            ld1         {v16.4h}, [x12]
904                                            add         x12, x9, #0x1f8
905                                            bic         x12, x12, #0x200
906                                            ld1         {v13.4h}, [x12]
907            umlal       v14.4s, v12.4h, v1.h[4]
908            umlal       v15.4s, v16.4h, v1.h[4]
909            umlal       v14.4s, v13.4h, v1.h[4]   // Could be d7, without the load, right?
910            umlal       v15.4s, v4.4h,  v1.h[4]
            /* Taps 11..1: all samples are loaded from the buffer.  The
             * right-indented instructions fetch the second sample(s) of each
             * pair.
             */
911    111:    add         x12, x9, #0x140
912            bic         x12, x12, #0x200
913            ld1         {v12.8h}, [x12]
914                                            add         x12, x9, #0x1f0
915                                            bic         x12, x12, #0x200
916                                            ld1         {v13.8h}, [x12]
917            umlal       v14.4s, v12.4h, v1.h[3]
918            umlal2      v15.4s, v12.8h, v1.h[3]
919            umlal       v14.4s, v13.4h, v1.h[3]
920            umlal2      v15.4s, v13.8h, v1.h[3]
921    110:    add         x12, x9, #0x148
922            bic         x12, x12, #0x200
923            ld1         {v12.4h}, [x12], #8
924            bic         x12, x12, #0x200
925            ld1         {v16.4h}, [x12]
926                                            add         x12, x9, #0x1e8
927                                            bic         x12, x12, #0x200
928                                            ld1         {v13.4h}, [x12], #8
929                                            bic         x12, x12, #0x200
930                                            ld1         {v17.4h}, [x12]
931            umlal       v14.4s, v12.4h, v1.h[2]
932            umlal       v15.4s, v16.4h, v1.h[2]
933            umlal       v14.4s, v13.4h, v1.h[2]
934            umlal       v15.4s, v17.4h, v1.h[2]
935    109:    add         x12, x9, #0x150
936            bic         x12, x12, #0x200
937            ld1         {v12.8h}, [x12]
938                                            add         x12, x9, #0x1e0
939                                            bic         x12, x12, #0x200
940                                            ld1         {v13.8h}, [x12]
941            umlal       v14.4s, v12.4h, v1.h[1]
942            umlal2      v15.4s, v12.8h, v1.h[1]
943            umlal       v14.4s, v13.4h, v1.h[1]
944            umlal2      v15.4s, v13.8h, v1.h[1]
945    108:    add         x12, x9, #0x158
946            bic         x12, x12, #0x200
947            ld1         {v12.4h}, [x12], #8
948            bic         x12, x12, #0x200
949            ld1         {v16.4h}, [x12]
950                                            add         x12, x9, #0x1d8
951                                            bic         x12, x12, #0x200
952                                            ld1         {v13.4h}, [x12], #8
953                                            bic         x12, x12, #0x200
954                                            ld1         {v17.4h}, [x12]
955            umlal       v14.4s, v12.4h, v1.h[0]
956            umlal       v15.4s, v16.4h, v1.h[0]
957            umlal       v14.4s, v13.4h, v1.h[0]
958            umlal       v15.4s, v17.4h, v1.h[0]
959    107:    add         x12, x9, #0x160
960            bic         x12, x12, #0x200
961            ld1         {v12.8h}, [x12]
962                                            add         x12, x9, #0x1d0
963                                            bic         x12, x12, #0x200
964                                            ld1         {v13.8h}, [x12]
965            umlal       v14.4s, v12.4h, v0.h[7]
966            umlal2      v15.4s, v12.8h, v0.h[7]
967            umlal       v14.4s, v13.4h, v0.h[7]
968            umlal2      v15.4s, v13.8h, v0.h[7]
969    106:    add         x12, x9, #0x168
970            bic         x12, x12, #0x200
971            ld1         {v12.4h}, [x12], #8
972            bic         x12, x12, #0x200
973            ld1         {v16.4h}, [x12]
974                                            add         x12, x9, #0x1c8
975                                            bic         x12, x12, #0x200
976                                            ld1         {v13.4h}, [x12], #8
977                                            bic         x12, x12, #0x200
978                                            ld1         {v17.4h}, [x12]
979            umlal       v14.4s, v12.4h, v0.h[6]
980            umlal       v15.4s, v16.4h, v0.h[6]
981            umlal       v14.4s, v13.4h, v0.h[6]
982            umlal       v15.4s, v17.4h, v0.h[6]
983    105:    add         x12, x9, #0x170
984            bic         x12, x12, #0x200
985            ld1         {v12.8h}, [x12]
986                                            add         x12, x9, #0x1c0
987                                            bic         x12, x12, #0x200
988                                            ld1         {v13.8h}, [x12]
989            umlal       v14.4s, v12.4h, v0.h[5]
990            umlal2      v15.4s, v12.8h, v0.h[5]
991            umlal       v14.4s, v13.4h, v0.h[5]
992            umlal2      v15.4s, v13.8h, v0.h[5]
993    104:    add         x12, x9, #0x178
994            bic         x12, x12, #0x200
995            ld1         {v12.4h}, [x12], #8
996            bic         x12, x12, #0x200
997            ld1         {v16.4h}, [x12]
998                                            add         x12, x9, #0x1b8
999                                            bic         x12, x12, #0x200
1000                                            ld1         {v13.4h}, [x12], #8
1001                                            bic         x12, x12, #0x200
1002                                            ld1         {v17.4h}, [x12]
1003            umlal       v14.4s, v12.4h, v0.h[4]
1004            umlal       v15.4s, v16.4h, v0.h[4]
1005            umlal       v14.4s, v13.4h, v0.h[4]
1006            umlal       v15.4s, v17.4h, v0.h[4]
1007    103:    add         x12, x9, #0x180
1008            bic         x12, x12, #0x200
1009            ld1         {v12.8h}, [x12]
1010                                            add         x12, x9, #0x1b0
1011                                            bic         x12, x12, #0x200
1012                                            ld1         {v13.8h}, [x12]
1013            umlal       v14.4s, v12.4h, v0.h[3]
1014            umlal2      v15.4s, v12.8h, v0.h[3]
1015            umlal       v14.4s, v13.4h, v0.h[3]
1016            umlal2      v15.4s, v13.8h, v0.h[3]
1017    102:    add         x12, x9, #0x188
1018            bic         x12, x12, #0x200
1019            ld1         {v12.4h}, [x12], #8
1020            bic         x12, x12, #0x200
1021            ld1         {v16.4h}, [x12]
1022                                            add         x12, x9, #0x1a8
1023                                            bic         x12, x12, #0x200
1024                                            ld1         {v13.4h}, [x12], #8
1025                                            bic         x12, x12, #0x200
1026                                            ld1         {v17.4h}, [x12]
1027            umlal       v14.4s, v12.4h, v0.h[2]
1028            umlal       v15.4s, v16.4h, v0.h[2]
1029            umlal       v14.4s, v13.4h, v0.h[2]
1030            umlal       v15.4s, v17.4h, v0.h[2]
1031    101:    add         x12, x9, #0x190
1032            bic         x12, x12, #0x200
1033            ld1         {v12.8h}, [x12], #16
1034            bic         x12, x12, #0x200
1035            ld1         {v13.8h}, [x12]
1036            umlal       v14.4s, v12.4h, v0.h[1]
1037            umlal2      v15.4s, v12.8h, v0.h[1]
1038            umlal       v14.4s, v13.4h, v0.h[1]
1039            umlal2      v15.4s, v13.8h, v0.h[1]
1040
            /* Narrow the two 4 x 32-bit sums to eight bytes in v15.8b. */
1041            uqrshrn     v14.4h, v14.4s, #16
1042            uqrshrn2    v14.8h, v15.4s, #16
1043            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
1044
            /* Spill the oldest register of the window to the buffer, step
             * x9 on by 16 bytes (bit 9 cleared), and slide v5-v11 down by
             * one register.
             */
1045            st1         {v4.16b}, [x9], #16
1046            bic         x9, x9, #0x200
1047            mov         v4.16b, v5.16b
1048            mov         v5.16b, v6.16b
1049            mov         v6.16b, v7.16b
1050            mov         v7.16b, v8.16b
1051            mov         v8.16b, v9.16b
1052            mov         v9.16b, v10.16b
1053            mov         v10.16b, v11.16b
1054.endm/*}}}*/
1055
1056/* Dedicated function wrapper for the fetch macro, for the cases where
1057 * performance isn't that important, to keep code size down.
1058 */
/* fetch_generic_asm: out-of-line expansion of the fetch macro with its default
 * arguments.  x10 and x11 are pushed before the expansion and popped after
 * it, so both hold their entry values on return.  Called with bl; returns
 * through x30.
 */
1059PRIVATE(fetch_generic_asm)
1060            stp         x10, x11, [sp, #-16]!   // keep x10/x11 across the fetch expansion
1061            fetch
1062            ldp         x10, x11, [sp], #16
1063            ret
1064END(fetch_generic_asm)
1065
1066/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value
1067 * across to fill the rest of the register pair.  Used for filling the right
1068 * hand edge of the window when starting too close to the right hand edge of
1069 * the image.
1070 */
/* prefetch_clamp1: right-hand-edge padding helper for the step=1 case.
 *
 * In:      x11 = index (negated on entry, restored before returning)
 *          v10, v11 = data to be padded
 * Out:     v10, v11 updated, v12 = the replicated pad halfword in all lanes
 *          x1 reduced by 16/8/4/2 for each of bits 3/2/1/0 set in -x11;
 *          x15 and x19 are moved by the same amount as x1
 * Scratch: v13 (select mask)
 */
1071PRIVATE(prefetch_clamp1)
1072            sub         x11, xzr, x11           // x11 = -x11; the bit tests below use the negated value
1073            sub         x15, x15, x1            // hold x15 and x19 relative to x1 while x1 changes
1074            sub         x19, x19, x1
1075            tbz         x11, #3, 1f
1076            mov         v11.16b, v10.16b
1077            sub         x1, x1, #16
10781:          mov         v12.16b, v11.16b
1079            movi        v13.8b, #0xff           // byte mask: eight 0xff bytes
1080            tbz         x11, #2, 1f
1081            ext         v12.16b, v12.16b, v12.16b, #4*2    // rotate v12 by 4 halfwords
1082            sub         x1, x1, #8
1083            shl         v13.2d, v13.2d, #32     // clear the low four mask bytes
10841:          tbz         x11, #1, 1f
1085            ext         v12.16b, v12.16b, v12.16b, #6*2    // rotate v12 by 6 halfwords
1086            sub         x1, x1, #4
1087            shl         v13.2d, v13.2d, #16     // clear two more mask bytes
10881:          tbz         x11, #0, 1f
1089            ext         v12.16b, v12.16b, v12.16b, #7*2    // rotate v12 by 7 halfwords
1090            sub         x1, x1, #2
1091            shl         v13.2d, v13.2d, #8      // clear one more mask byte
10921:          dup         v12.8h, v12.h[6]        // broadcast the selected halfword
1093            sxtl        v13.8h, v13.8b          // widen the byte mask to halfword lanes
1094            bif         v11.16b, v12.16b, v13.16b   // take v12 wherever the mask is zero
10951:          tbz         x11, #3, 1f
1096            mov         v10.16b, v11.16b
1097            mov         v11.16b, v12.16b
10981:          sub         x11, xzr, x11           // restore x11
1099            add         x15, x15, x1            // rebase x15 and x19 on the adjusted x1
1100            add         x19, x19, x1
1101            ret
1102END(prefetch_clamp1)
1103
/* prefetch_clamp4: right-hand-edge padding helper for the step=4 case; the
 * counterpart of prefetch_clamp1 working on 64-bit elements.
 *
 * In:      x11 = index (negated on entry, restored before returning)
 *          v10, v11 = data to be padded
 * Out:     v10, v11 updated, v12 = the pad element in both 64-bit lanes
 *          x1 reduced by 16 if bit 3 of -x11 is set and by 8 if bit 2 is set;
 *          x15 and x19 are moved by the same amount as x1
 */
1104PRIVATE(prefetch_clamp4)
1105            sub         x11, xzr, x11           // x11 = -x11; the bit tests below use the negated value
1106            sub         x15, x15, x1            // hold x15 and x19 relative to x1 while x1 changes
1107            sub         x19, x19, x1
1108            tbz         x11, #3, 1f
1109            sub         x1, x1, #16     // what's this?
1110            mov         v11.16b, v10.16b
11111:          dup         v12.2d, v11.d[1]        // default pad: upper element of v11
1112            tbz         x11, #2, 1f
1113            dup         v12.2d, v11.d[0]        // pad with the lower element instead
1114            sub         x1, x1, #8
1115            dup         v11.2d, v11.d[0]
11161:          tbz         x11, #3, 1f
1117            mov         v10.16b, v11.16b
1118            mov         v11.16b, v12.16b
11191:          sub         x11, xzr, x11           // restore x11
1120            add         x15, x15, x1            // rebase x15 and x19 on the adjusted x1
1121            add         x19, x19, x1
1122            ret
1123END(prefetch_clamp4)
1124
1125
1126/* Helpers for prefetch, below.
1127 */
/* prefetch_out: emit the output step for one 32-byte block of the window.
 *
 *      store > 0   store qsa and qsb at [x9], post-incrementing x9 by 32 in
 *                  total (two single-register stores when qsa and qsb name
 *                  the same register, otherwise one two-register store)
 *      store == 0  copy qsa into qa and qsb into qb
 *      store < 0   insert the element qsb_hi into qb only
 *
 * The number of instructions emitted therefore depends on the arguments.
 */
1128.macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi
1129  .if \store > 0
1130    .ifc \qsa,\qsb
1131            st1         {\qsa}, [x9], #16
1132            st1         {\qsb}, [x9], #16
1133    .else
1134            st1         {\qsa,\qsb}, [x9], #32
1135    .endif
1136  .elseif \store == 0
1137            mov         \qa, \qsa
1138            mov         \qb, \qsb
1139  .else
1140            ins         \qb, \qsb_hi
1141  .endif
1142.endm
1143
/* prefetch_one: fill one block of the window; expanded several times in a
 * row by the prefetch macro.  Code is only emitted when
 * i = (need - 16) - rem is non-negative.
 *
 * The numeric labels chain consecutive expansions together: a forward
 * reference (1f, 2f, 4f) from one expansion resolves to the same-numbered
 * label of the next expansion, and the expansion with rem == 0 defines all
 * four labels at its trailing nop to end the chain.
 *
 *   label 1  x10 >= i+16: output v9 for both registers, continue at the next
 *            expansion's label 1.
 *   label 2  otherwise, if x11 > i+16: output v10/v11, call
 *            fetch_generic_asm, continue at the next expansion's label 2.
 *   label 3  otherwise: call prefetch_clamp<step>, output v10/v11, then fall
 *            into label 4.
 *   label 4  branch to the instruction one past the next label 4, i.e. the
 *            next expansion's v12 output, so every remaining block is
 *            filled from v12.
 */
1144.macro prefetch_one  qa, qb, rem, c, store=0, step=1
1145.set i, (need - 16) - \rem
1146.if i >= 0
11471:          cmp         x10, #i+16
1148            blo         2f
1149            prefetch_out \qa, \qb, \store, v9.16b, v9.16b, v9.d[1]
1150            b           1f
11512:          cmp         x11, #i+16
1152            bls         3f
1153            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
1154            bl          fetch_generic_asm
1155            b           2f
11563:          bl          prefetch_clamp\step
1157            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
11584:          b           4f+4
1159           //v12 contains pad word from prefetch_clamp call
1160            prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
1161  .if \rem > 0
1162            b           4f+4
1163  .else
11641:
11652:
11663:
11674:          nop
1168  .endif
1169.endif
1170.endm
1171
1172/* Fill the convolution window with context data.  The aim here is to load
1173 * exactly rlf + rrt columns, and in the main loop to read as many columns as
1174 * will be written.  This is complicated by the need to handle cases when the
1175 * input starts very close to the left or right (or both) edges of the image,
1176 * and where these do not fall on 16-byte boundaries.
1177 *
1178 * Input:
1179 *      x1 -- src
1180 *      x2 -- pitch
1181 *      x3 -- count
1182 *      x4 -- inlen
1183 *      x5 -- r
1184 *      x6 -- rup
1185 *      x7 -- rdn
1186 *      x8 -- rlf
1187 *      x9 -- buffer (if needed)
1188 *      x13 = -pitch
1189 *      x15 = top-row in
1190 *      x19 = bottom-row in
1191 * Output:
1192 *      x1 += rlf + min(count, rrt)
1193 * Modifies:
1194 *      x10 -- fill start index in the window
1195 *      x11 -- fill stop index in the window
1196 *      x12 -- scratch
1197 */
/* prefetch expansion, in order:
 *   1. need = 2 * max_r * step rounded up to a multiple of 16.
 *      x10 = need - max_r * step - x8 (x8 is scaled by 4 when step != 1),
 *      x11 = min(x10 + x4, need).
 *   2. One fetch_generic_asm call, then v9 is set to the first element of
 *      v10 replicated (halfword for step 1, 64-bit element otherwise).
 *   3. If x10 is not a multiple of 16, v10/v11 are shifted up with v9 shifted
 *      in below them, and x1, x15, x19 are each reduced by (x10 & 15).
 *   4. A run of prefetch_one expansions: stores to [x9] when step > 1, the
 *      upper half of v3 when step == 1, then v4/v5, v6/v7 and v8/v9.
 *   5. x4 has x8 (times 4 when step != 1) + max_r * step subtracted, and is
 *      set to zero if that subtraction borrows.
 */
1198.macro prefetch step=1, max_r=25
1199.set need, ((\max_r + \max_r) * \step + 15) & ~15
1200  .if \step == 1
1201            mov         x10, #need - (\max_r * \step)
1202            sub         x10, x10, x8
1203  .else
1204            mov         x10, #need - (\max_r * \step)
1205            sub         x10, x10, x8, LSL #2
1206  .endif
            /* x11 = min(x10 + x4, need) */
1207            add         x11, x10, x4
1208            subs        x11, x11, #need
1209            csel        x11, xzr, x11, hi
1210            add         x11, x11, #need
1211
1212            bl          fetch_generic_asm
1213  .if \step == 1
1214            dup         v9.8h, v10.h[0]
1215  .else
1216            dup         v9.2d, v10.d[0]
1217  .endif
            /* Skip the realignment when x10 is already a multiple of 16. */
1218            tst         x10, #15
1219            beq         2f
1220            sub         x12, xzr, x10
1221            tbz         x10, #3, 1f
1222            mov         v11.16b, v10.16b
1223            mov         v10.16b, v9.16b
12241:          tbz         x12, #2, 1f
1225            ext         v11.16b, v10.16b, v11.16b, #4*2
1226            ext         v10.16b, v9.16b, v10.16b, #4*2
1227  .if \step == 1
1228  1:        tbz         x12, #1, 1f
1229            ext         v11.16b, v10.16b, v11.16b, #2*2
1230            ext         v10.16b, v9.16b, v10.16b, #2*2
1231  1:        tbz         x12, #0, 1f
1232            ext         v11.16b, v10.16b, v11.16b, #1*2
1233            ext         v10.16b, v9.16b, v10.16b, #1*2
1234  .endif
            /* Net effect of the next seven instructions: x1, x15 and x19 are
             * each reduced by (x10 & 15), and x10 is rounded down to a
             * multiple of 16.
             */
12351:          sub         x1, x1, x10
1236            sub         x15, x15, x10
1237            sub         x19, x19, x10
1238            bic         x10, x10, #15
1239            add         x1, x1, x10
1240            add         x15, x15, x10
1241            add         x19, x19, x10
12422:
1243  .if \step > 1
1244            /* it's only in the uchar2 and uchar4 cases where the register file
1245             * is insufficient (given MAX_R <= 25).
1246             */
1247            prefetch_one xx, xx, 192, c=\max_r, step=\step, store=1
1248            prefetch_one xx, xx, 176, c=\max_r, step=\step, store=1
1249            prefetch_one xx, xx, 160, c=\max_r, step=\step, store=1
1250            prefetch_one xx, xx, 144, c=\max_r, step=\step, store=1
1251            prefetch_one xx, xx, 128, c=\max_r, step=\step, store=1
1252            prefetch_one xx, xx, 112, c=\max_r, step=\step, store=1
1253            prefetch_one xx, xx,  96, c=\max_r, step=\step, store=1
1254            prefetch_one xx, xx,  80, c=\max_r, step=\step, store=1
1255            prefetch_one xx, xx,  64, c=\max_r, step=\step, store=1
1256            prefetch_one xx, xx,  48, c=\max_r, step=\step, store=1
1257  .else
1258            /* q3 normally contains the coefficient table, but it's not fully
1259             * used.  In the uchar1, r=25 case the other half of q3 is used for
1260             * the last two window taps to avoid falling out to memory.
1261             */
1262            prefetch_one xx,  v3.d[1], 48, c=\max_r, step=\step, store=-1
1263  .endif
1264            prefetch_one v4.16b, v5.16b, 32, c=\max_r, step=\step, store=0
1265            prefetch_one v6.16b, v7.16b, 16, c=\max_r, step=\step, store=0
1266            prefetch_one v8.16b, v9.16b,  0, c=\max_r, step=\step, store=0
1267
1268  .if \step == 1
1269            add         x10, x8, #\max_r * \step
1270  .else
1271            lsl         x10, x8, #2
1272            add         x10, x10, #\max_r * \step
1273  .endif
            /* x4 -= x10, clamped to zero if the subtraction borrows. */
1274            subs        x4, x4, x10
1275            csel        x4, xzr, x4, lo
1276.endm
1277
1278/* The main loop.
1279 *
1280 * Input:
1281 *      x0 = dst
1282 *      x1 = src
1283 *      x2 = pitch
1284 *      x3 = count
1285 *      x4 = inlen
1286 *      x5 = r
1287 *      x6 = rup
1288 *      x7 = rdn
1289 *      x9 = buffer
1290 *      x13 = -pitch
1291 *      x15 = top-row in
1292 *      x19 = bottom-row in
1293 * Modifies
1294 *      x8 = fetch code pointer
1295 */
/* mainloop expansion, in order:
 *   - x8 is set to labelnc - 40 * x5.  Unless x5 equals both x6 and x7 it is
 *     then replaced by labelc - 56 * x5.  x8 is handed to the fetch macro as
 *     its reg argument.
 *   - Bulk loop (label 3 / first label 5): while x4 >= 16, one fetch and two
 *     core expansions, each followed by an 8-byte store of v15 to [x0];
 *     x3 drops by 16 and x4 by 16 per iteration.
 *   - Remainder: when 0 < x4 < 16 the three row pointers are moved back by
 *     16 - x4, one more fetch_generic_asm call is made, and v10/v11 are
 *     shifted so that copies of the last element of v11 fill the top.  When
 *     x4 == 0, v10 and v11 are filled with the last element of v9 instead.
 *   - Tail loop (second label 3): one core expansion per 8 output bytes while
 *     x3 remains, re-filling v11 with its own last element each time.  A
 *     final partial group is written as 4-, 2- and 1-byte stores selected by
 *     bits 2, 1 and 0 of x3.
 */
1296.macro mainloop core, step=1, max_r=25, labelc="", labelnc=""
1297            adrp        x8, \labelnc
1298            add         x8, x8, #:lo12:\labelnc
1299            sub         x8, x8, x5, LSL #5
1300            sub         x8, x8, x5, LSL #3
1301            cmp         x5, x6
1302            ccmp        x5, x7, #0, eq
1303            beq         5f
1304
1305            /* if (r != rup || r != rdn) then the address-clamping table should
1306             * be used rather than the short-cut version.
1307             */
1308            adrp        x8, \labelc
1309            add         x8, x8, #:lo12:\labelc
1310            sub         x8, x8, x5, LSL #6
1311            add         x8, x8, x5, LSL #3
1312            b           5f
1313            .align  4
13143:          fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8
1315
1316            /* For each call to fetch two are made to \core.  It would be
1317             * preferable to have twice the work done in \core.
1318             */
1319            \core
1320            st1         {v15.8b}, [x0], #8
1321            \core
1322            st1         {v15.8b}, [x0], #8
1323
1324            sub         x3, x3, #16
13255:          subs        x4, x4, #16
1326            bhs         3b
            /* Undo the final subtraction; x4 is now the leftover count. */
1327            adds        x4, x4, #16
1328            bne         1f
1329  .if \step==1
1330            dup         v10.8h, v9.h[7]
1331            dup         v11.8h, v9.h[7]
1332  .else
1333            dup         v10.2d, v9.d[1]
1334            dup         v11.2d, v9.d[1]
1335  .endif
1336            b           4f
1337
            /* Move the three row pointers back by 16 - x4 before the last
             * fetch.
             */
13381:          sub         x1, x1, #16
1339            sub         x15, x15, #16
1340            sub         x19, x19, #16
1341            add         x1, x1, x4
1342            add         x15, x15, x4
1343            add         x19, x19, x4
1344            bl          fetch_generic_asm
1345
1346  .if \step==1
1347            dup         v12.8h, v11.h[7]
1348  .else
1349            dup         v12.2d, v11.d[1]
1350  .endif
            /* Shift v10/v11 down, one power of two per bit of -x4, pulling
             * the pad value in from v12.
             */
1351            sub         x4, xzr, x4
1352            tbz         x4, #3, 1f
1353            mov         v10.16b, v11.16b
1354            mov         v11.16b, v12.16b
13551:          tbz         x4, #2, 1f
1356            ext         v10.16b, v10.16b, v11.16b, #4*2
1357            ext         v11.16b, v11.16b, v12.16b, #4*2
13581:          tbz         x4, #1, 1f
1359            ext         v10.16b, v10.16b, v11.16b, #2*2
1360            ext         v11.16b, v11.16b, v12.16b, #2*2
13611:          tbz         x4, #0, 4f
1362            ext         v10.16b, v10.16b, v11.16b, #1*2
1363            ext         v11.16b, v11.16b, v12.16b, #1*2
13644:          cbz         x3, 5f
13653:          \core
1366  .if \step==1
1367            dup         v11.8h, v11.h[7]
1368  .else
1369            dup         v11.2d, v11.d[1]
1370  .endif
1371            subs        x3, x3, #8
1372            blo         4f
1373            st1         {v15.8b}, [x0], #8
1374            beq         5f
1375            b           3b
            /* Fewer than 8 bytes left: write 4, 2 and 1 bytes as selected by
             * bits 2, 1 and 0 of x3.
             */
13764:          tbz         x3, #2, 1f
1377            st1         {v15.s}[0], [x0], #4
1378            ext         v15.16b, v15.16b, v15.16b, #4*2
13791:          tbz         x3, #1, 1f
1380            st1         {v15.h}[0], [x0], #2
1381            ext         v15.16b, v15.16b, v15.16b, #2*2
13821:          tbz         x3, #0, 5f
1383            st1         {v15.b}[0], [x0], #1
1384            ext         v15.16b, v15.16b, v15.16b, #1*2
13855:          nop
1386.endm
1387
/* Generate one convolve1_<r> routine for every radius in TUNED_LIST1, plus
 * one for 25.  Each routine saves x29/x30, expands prefetch and mainloop for
 * step=1 with core hconv1_<r>, restores x29/x30 and returns through x30.
 */
1388.irep r, TUNED_LIST1, 25
1389PRIVATE(convolve1_\r)
1390            stp         x29,x30, [sp, #-16]!    // the expansions below contain bl, which overwrites x30
1391
1392            prefetch    step=1, max_r=\r
1393
1394            mainloop    core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
1395
1396            ldp         x29,x30, [sp], #16
1397            ret
1398END(convolve1_\r)
1399.endr
1400
/* Generate one convolve4_<r> routine for every radius in TUNED_LIST4, plus
 * one for 25.  Each routine carves an aligned scratch buffer out of the
 * stack, points x9 at it, expands prefetch and mainloop for step=4 with core
 * hconv4_<r>, then restores sp and returns through x30.
 */
1401.irep r, TUNED_LIST4, 25
1402PRIVATE(convolve4_\r)
1403            sub         x12, sp, #0x200         // x12 = entry sp - 0x200
1404            bic         x9, x12, #0x3fc         // clear bits 2-9 of that address
1405            mov         sp, x9
1406            stp         x12,x30, [sp, #-16]!    // x12 is kept so that sp can be rebuilt on exit
1407
1408            /* x9 now points to a buffer on the stack whose address has the low
1409             * 10 bits clear.  This allows easy address calculation in the
1410             * wrap-around cases.
1411             */
1412
1413
1414            prefetch    step=4, max_r=\r
1415
1416            mainloop    core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
1417
1418            ldp         x12,x30, [sp]
1419            add         sp, x12, #0x200         // back to the entry sp
1420            ret
1421END(convolve4_\r)
1422.endr
1423
1424/* void rsdIntrinsicBlurU1_K(
1425 *                  void *out,      // x0
1426 *                  void *in,       // x1
1427 *                  size_t w,       // x2
1428 *                  size_t h,       // x3
1429 *                  size_t p,       // x4
1430 *                  size_t x,       // x5
1431 *                  size_t y,       // x6
1432 *                  size_t count,   // x7
1433 *                  size_t r,       // [sp]
1434 *                  uint16_t *tab); // [sp,#8]
1435 */
/* Register state built here before branching to convolve1_<r>:
 *      x0  = out + x
 *      x1  = in + x - rlf
 *      x2  = pitch
 *      x3  = count
 *      x4  = inlen = rlf + rrt + count
 *      x5  = r
 *      x6  = rup = min(y, r)
 *      x7  = rdn = min(h - y - 1, r)
 *      x8  = rlf = min(x, r)
 *      x9  = rrt = min(w - x - count, r)
 *      x13 = -pitch
 *      x15 = x1 - pitch * rup
 *      x19 = x1 + pitch * rdn
 *      v0-v3 = the first 32 halfwords of tab
 *
 * x19/x30 and the low halves of v8-v15 are saved on entry and restored at
 * label 1.  x30 is pointed at label 1 by hand, so the convolve routine that
 * is branched to returns there.
 */
1436ENTRY(rsdIntrinsicBlurU1_K)
1437            stp         x19,x30, [sp, #-16]!
1438            sub         x8, sp, #32
1439            sub         sp, sp, #64
1440            st1         {v8.1d - v11.1d}, [sp]
1441            st1         {v12.1d - v15.1d}, [x8]
1442            mov         x8, x5        // x
1443            ldr         w5, [sp,#80]  // r
1444            sub         x9, x2, x8
1445            sub         x10, x3, x6
1446            mov         x2, x4        // pitch
1447            mov         x3, x7        // count
1448            sub         x7, x10, #1
1449            sub         x9, x9, x3
1450
1451            ldr         x12, [sp, #88] // tab
1452
1453            add         x0, x0, x8
1454            add         x1, x1, x8
1455
            /* Clamp each of the four margins to at most r. */
1456            cmp         x6, x5
1457            csel        x6, x5, x6, hs
1458            cmp         x7, x5
1459            csel        x7, x5, x7, hs
1460            cmp         x8, x5
1461            csel        x8, x5, x8, hs
1462            cmp         x9, x5
1463            csel        x9, x5, x9, hs          // rrt keeps its own value (x9, not x8) when below r
1464
1465            add         x4, x8, x9
1466            add         x4, x4, x3
1467
1468            sub         x1, x1, x8
1469
1470            sub         x13, xzr, x2
1471            msub        x15, x2, x6, x1
1472            madd        x19, x2, x7, x1
1473
1474            ld1         {v0.8h,v1.8h}, [x12], #32
1475            ld1         {v2.8h,v3.8h}, [x12], #32
1476
            /* Branch to the first convolve1_<r> whose radius is >= r; fall
             * back to convolve1_25.
             */
1477            adr         x30, 1f
1478  .irep r, TUNED_LIST1
1479            cmp         x5, #\r
1480            bls         convolve1_\r
1481  .endr
1482            b           convolve1_25
1483
14841:          ld1         {v8.1d - v11.1d}, [sp], #32
1485            ld1         {v12.1d - v15.1d}, [sp], #32
1486            ldp         x19,x30, [sp], #16
1487            ret
1488END(rsdIntrinsicBlurU1_K)
1489
/* void rsdIntrinsicBlurU4_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 *
 * C-callable entry point for the blur where x and count are scaled by four
 * when applied to the pointers (four bytes per pixel).  Preserves the
 * callee-saved registers it overwrites, converts the C arguments into the
 * register layout listed here, then branches to a convolve4_* routine
 * selected by r.
 *
 * Register state when control reaches convolve4_* (all minimums unsigned):
 *      x0  -- out + 4 * x
 *      x1  -- in + 4 * x - 4 * min(x, r)
 *      x2  -- p (pitch)
 *      x3  -- 4 * count
 *      x4  -- 4 * count + 4 * (min(x, r) + min(w - x - count, r))
 *      x5  -- r (loaded as 32 bits, zero-extended)
 *      x6  -- min(y, r)
 *      x7  -- min(h - y - 1, r)
 *      x8  -- min(x, r)
 *      x9  -- min(w - x - count, r)
 *      x10 -- h - y
 *      x12 -- tab + 64 bytes
 *      x13 -- -pitch
 *      x15 -- x1 - pitch * x6
 *      x19 -- x1 + pitch * x7
 *      x30 -- address of the epilogue at local label 1
 *      v0-v3 -- the first 32 halfwords of tab
 */
ENTRY(rsdIntrinsicBlurU4_K)
            // Prologue: 16 bytes for x19/x30 plus 64 bytes for d8-d15.
            stp         x19, x30, [sp, #-16]!
            sub         sp, sp, #64
            add         x8, sp, #32
            st1         {v8.1d, v9.1d, v10.1d, v11.1d}, [sp]
            st1         {v12.1d, v13.1d, v14.1d, v15.1d}, [x8]

            // Stack arguments now live 80 bytes above sp.
            mov         x8, x5                  // keep x; x5 becomes r
            ldr         w5, [sp, #80]           // r
            ldr         x12, [sp, #88]          // tab

            // Distances from the requested span to the image edges.
            sub         x9, x2, x8              // w - x
            sub         x9, x9, x7              // w - x - count
            sub         x10, x3, x6             // h - y
            mov         x2, x4                  // pitch
            lsl         x3, x7, #2              // count in bytes
            sub         x7, x10, #1             // h - y - 1

            // Advance both pointers to pixel x (four bytes each).
            add         x0, x0, x8, LSL #2
            add         x1, x1, x8, LSL #2

            // Limit every margin to r: keep the margin when it is below r,
            // otherwise take r.
            cmp         x6, x5
            csel        x6, x6, x5, lo          // up
            cmp         x7, x5
            csel        x7, x7, x5, lo          // down
            cmp         x8, x5
            csel        x8, x8, x5, lo          // left
            cmp         x9, x5
            csel        x9, x9, x5, lo          // right

            // Rewind the source by the left margin and size the row span.
            sub         x1, x1, x8, LSL #2
            add         x4, x8, x9
            add         x4, x3, x4, LSL #2

            // Row addressing derived from the pitch and vertical margins.
            neg         x13, x2                 // -pitch
            msub        x15, x2, x6, x1         // x1 - pitch * up
            madd        x19, x2, x7, x1         // x1 + pitch * down

            // Pull the first 64 bytes of tab into v0-v3.
            ld1         {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64

            // Hand-made return address: the convolve routines are reached by
            // ordinary branches and are expected to come back via x30.
            adr         x30, 1f
            // First TUNED_LIST4 entry (in list order) that is >= r wins.
  .irep r, TUNED_LIST4
            cmp         x5, #\r
            bls         convolve4_\r
  .endr
            // r exceeds every listed entry: fall back to the radius-25 code.
            b           convolve4_25

            // Epilogue: undo the prologue and return to the C caller.
1:          ld1         {v8.1d, v9.1d, v10.1d, v11.1d}, [sp], #32
            ld1         {v12.1d, v13.1d, v14.1d, v15.1d}, [sp], #32
            ldp         x19, x30, [sp], #16
            ret
END(rsdIntrinsicBlurU4_K)
1556