/*
 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author:  Taekyun Kim (tkq.kim@samsung.com)
 */

/*
 * This file contains scaled bilinear scanline functions implemented
 * using the older bilinear macro template by Siarhei Siamashka.
 *
 * << General scanline function procedure >>
 *  1. bilinear interpolate source pixels
 *  2. load mask pixels
 *  3. load destination pixels
 *  4. duplicate mask to fill whole register
 *  5. interleave source & destination pixels
 *  6. apply mask to source pixels
 *  7. combine source & destination pixels
 *  8. deinterleave final result
 *  9. store destination pixels
 *
 * All registers with a single number (i.e. src0, tmp0) are 64-bit registers.
 * Registers with double numbers (src01, dst01) are 128-bit registers.
 * All temp registers can be used freely outside the code blocks.
 * The register aliases (.req symbols) OUT and MASK are assumed to be defined
 * by the caller of these macro blocks.
 *
 * Remarks
 *  There can be lots of pipeline stalls inside a code block and between code
 *  blocks. Further optimization will be done with new macro templates using
 *  the head/tail_head/tail scheme.
 */
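/*
 * For reference, a rough scalar C sketch of the per-pixel work outlined
 * above (illustrative only, never assembled; the helper name and parameters
 * are invented for this sketch).  The vertical weights satisfy
 * wt + wb == (1 << BILINEAR_INTERPOLATION_BITS) and the horizontal weight
 * wx is the fractional part of X reduced to the same number of bits:
 *
 *   #include <stdint.h>
 *
 *   static inline uint8_t
 *   bilinear_channel (uint8_t tl, uint8_t tr,      // top-left/right samples
 *                     uint8_t bl, uint8_t br,      // bottom-left/right samples
 *                     uint32_t wt, uint32_t wb,    // vertical weights
 *                     uint32_t wx, uint32_t bits)  // horizontal weight, precision
 *   {
 *       uint32_t left  = tl * wt + bl * wb;  // vertical blend (vmull/vmlal.u8)
 *       uint32_t right = tr * wt + br * wb;
 *       // horizontal blend (vshll/vmlsl/vmlal.u16), narrowed by vshrn/vmovn
 *       return (uint8_t)(((left << bits) - left * wx + right * wx) >> (2 * bits));
 *   }
 *
 * The interpolated source is then optionally multiplied by the mask and
 * combined with the destination; see the mask and combine macros below.
 */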

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.eabi_attribute 10, 0
.eabi_attribute 12, 0
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/*
 * Bilinear macros from pixman-arm-neon-asm.S
 */

/* Supplementary macro for setting function attributes */
.macro pixman_asm_function fname
    .func fname
    .global fname
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
.endm

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */

.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {reg1}, [TMP1], STRIDE
    vld1.32   {reg2}, [TMP1]
.endm
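/*
 * X and UX are 16.16 fixed-point values; the integer part of X selects the
 * horizontal source position and STRIDE (set to BOTTOM - TOP by the function
 * template further below) redirects the second load to the bottom scanline.
 * Roughly, in C terms (illustrative only, little-endian, byte addresses):
 *
 *   top_pair    = *(const uint64_t *)(TOP + (X >> 16) * 4);
 *   bottom_pair = *(const uint64_t *)(TOP + STRIDE + (X >> 16) * 4);
 *   X += UX;
 */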

.macro bilinear_load_0565 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    vld1.32   {reg2[0]}, [TMP1], STRIDE
    vld1.32   {reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm

.macro bilinear_load_and_vertical_interpolate_two_8888 \
                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.endm

.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {acc2lo[1]}, [TMP1]
    vld1.32   {acc2hi[1]}, [TMP2]
    convert_0565_to_x888 acc2, reg3, reg2, reg1
    vzip.u8   reg1, reg3
    vzip.u8   reg2, reg4
    vzip.u8   reg3, reg4
    vzip.u8   reg1, reg2
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {xacc2lo[1]}, [TMP1]
    vld1.32   {xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
    vzip.u8   xreg1, xreg3
    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
    vzip.u8   xreg2, xreg4
    vld1.32   {yacc2lo[1]}, [TMP1]
    vzip.u8   xreg3, xreg4
    vld1.32   {yacc2hi[1]}, [TMP2]
    vzip.u8   xreg1, xreg2
    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
    vmull.u8  xacc1, xreg1, d28
    vzip.u8   yreg1, yreg3
    vmlal.u8  xacc1, xreg2, d29
    vzip.u8   yreg2, yreg4
    vmull.u8  xacc2, xreg3, d28
    vzip.u8   yreg3, yreg4
    vmlal.u8  xacc2, xreg4, d29
    vzip.u8   yreg1, yreg2
    vmull.u8  yacc1, yreg1, d28
    vmlal.u8  yacc1, yreg2, d29
    vmull.u8  yacc2, yreg3, d28
    vmlal.u8  yacc2, yreg4, d29
.endm

.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
    vst1.32   {d0, d1}, [OUT]!
.elseif numpix == 2
    vst1.32   {d0}, [OUT]!
.elseif numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    .error bilinear_store_8888 numpix is unsupported
.endif
.endm

.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp.u8 d0, d1
    vuzp.u8 d2, d3
    vuzp.u8 d1, d3
    vuzp.u8 d0, d2
    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
.if numpix == 4
    vst1.16   {d2}, [OUT]!
.elseif numpix == 2
    vst1.32   {d2[0]}, [OUT]!
.elseif numpix == 1
    vst1.16   {d2[0]}, [OUT]!
.else
    .error bilinear_store_0565 numpix is unsupported
.endif
.endm

/*
 * Macros for loading mask pixels into register 'mask'.
 * The vdup must be done somewhere else.
 */
.macro bilinear_load_mask_x numpix, mask
.endm

.macro bilinear_load_mask_8 numpix, mask
.if numpix == 4
    vld1.32     {mask[0]}, [MASK]!
.elseif numpix == 2
    vld1.16     {mask[0]}, [MASK]!
.elseif numpix == 1
    vld1.8      {mask[0]}, [MASK]!
.else
    .error bilinear_load_mask_8 numpix is unsupported
.endif
    pld         [MASK, #prefetch_offset]
.endm

.macro bilinear_load_mask mask_fmt, numpix, mask
    bilinear_load_mask_&mask_fmt numpix, mask
.endm


/*
 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
 * The interleave should be done somewhere else.
 */
.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.if numpix == 4
    vld1.32     {dst0, dst1}, [OUT]
.elseif numpix == 2
    vld1.32     {dst0}, [OUT]
.elseif numpix == 1
    vld1.32     {dst0[0]}, [OUT]
.else
    .error bilinear_load_dst_8888 numpix is unsupported
.endif
    pld         [OUT, #(prefetch_offset * 4)]
.endm

.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
.endm

/*
 * Macros for duplicating a partially loaded mask to fill the entire register.
 * We will apply the mask to interleaved source pixels, that is
 *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * So we need to duplicate the loaded mask across the whole register.
 *
 * For the two-pixel case
 *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * We can do some optimizations for this, including the last pixel cases.
 */
.macro bilinear_duplicate_mask_x numpix, mask
.endm

.macro bilinear_duplicate_mask_8 numpix, mask
.if numpix == 4
    vdup.32     mask, mask[0]
.elseif numpix == 2
    vdup.16     mask, mask[0]
.elseif numpix == 1
    vdup.8      mask, mask[0]
.else
    .error bilinear_duplicate_mask_8 is unsupported
.endif
.endm

.macro bilinear_duplicate_mask mask_fmt, numpix, mask
    bilinear_duplicate_mask_&mask_fmt numpix, mask
.endm

/*
 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
 * The interleave should be done when a mask is enabled or the operator is 'over'.
 */
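/*
 * Two successive vuzp.8 passes turn the per-pixel byte layout into the
 * planar one described above, e.g. for four a8r8g8b8 pixels P0..P3 with
 * channels c0..c3 each:
 *
 *   before:  reg0 = c0P0 c1P0 c2P0 c3P0 c0P1 c1P1 c2P1 c3P1
 *            reg1 = c0P2 c1P2 c2P2 c3P2 c0P3 c1P3 c2P3 c3P3
 *   after:   reg0 = c0P0 c0P1 c0P2 c0P3 c1P0 c1P1 c1P2 c1P3
 *            reg1 = c2P0 c2P1 c2P2 c2P3 c3P0 c3P1 c3P2 c3P3
 */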
.macro bilinear_interleave src0, src1, dst0, dst1
    vuzp.8      src0, src1
    vuzp.8      dst0, dst1
    vuzp.8      src0, src1
    vuzp.8      dst0, dst1
.endm

.macro bilinear_interleave_src_dst_x_src \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_x_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst_x_add \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_8_src \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst_8_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst_8_add \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst \
                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave_src_dst_&mask_fmt&_&op \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm


/*
 * Macros for applying masks to src pixels (see the combine_mask_u() function).
 * src and dst should be in interleaved form.
 * The mask register should be in the form (m0, m1, m2, m3).
 */
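/*
 * Per byte this computes (approximately, with rounding) src * mask / 255
 * using the usual two-step trick: with t = src * mask (16 bit),
 *
 *   result = (t + ((t + 128) >> 8) + 128) >> 8
 *
 * which is what the vmull.u8 + vrshr.u16 #8 + vraddhn.u16 sequence below
 * implements.
 */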
.macro bilinear_apply_mask_to_src_x \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67
.endm

.macro bilinear_apply_mask_to_src_8 \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    vmull.u8        tmp01, src0, mask
    vmull.u8        tmp23, src1, mask
    /* bubbles */
    vrshr.u16       tmp45, tmp01, #8
    vrshr.u16       tmp67, tmp23, #8
    /* bubbles */
    vraddhn.u16     src0, tmp45, tmp01
    vraddhn.u16     src1, tmp67, tmp23
.endm

.macro bilinear_apply_mask_to_src \
                mask_fmt, numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    bilinear_apply_mask_to_src_&mask_fmt \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67
.endm


/*
 * Macros for combining src and destination pixels.
 * Whether to interleave depends on the operator 'op'.
 */
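/*
 * With premultiplied a8r8g8b8 pixels the two non-trivial operators are,
 * per channel:
 *
 *   OVER: dst = src + dst * (255 - src_alpha) / 255   (saturating add)
 *   ADD:  dst = saturate (src + dst)
 *
 * The division by 255 uses the same rounding trick as the mask multiply
 * above, and src_alpha is broadcast from the interleaved alpha lanes
 * (vdup.32 of src1[1]).
 */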
.macro bilinear_combine_src \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8
.endm

.macro bilinear_combine_over \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vdup.32     tmp8, src1[1]
    /* bubbles */
    vmvn.8      tmp8, tmp8
    /* bubbles */
    vmull.u8    tmp01, dst0, tmp8
    /* bubbles */
    vmull.u8    tmp23, dst1, tmp8
    /* bubbles */
    vrshr.u16   tmp45, tmp01, #8
    vrshr.u16   tmp67, tmp23, #8
    /* bubbles */
    vraddhn.u16 dst0, tmp45, tmp01
    vraddhn.u16 dst1, tmp67, tmp23
    /* bubbles */
    vqadd.u8    src01, dst01, src01
.endm

.macro bilinear_combine_add \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vqadd.u8    src01, dst01, src01
.endm

.macro bilinear_combine \
                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    bilinear_combine_&op \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8
.endm

/*
 * Macros for final deinterleaving of destination pixels if needed.
 */
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
    vuzp.8      dst0, dst1
    /* bubbles */
    vuzp.8      dst0, dst1
.endm

.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
.endm


.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_&src_fmt d0, d1, d2
    bilinear_load_mask mask_fmt, 1, d4
    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    bilinear_duplicate_mask mask_fmt, 1, d4
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_interleave_src_dst \
                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                mask_fmt, 1, d0, d1, q0, d4, \
                q3, q8, q10, q11
    bilinear_combine \
                op, 1, d0, d1, q0, d18, d19, q9, \
                q3, q8, q10, q11, d5
    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
    bilinear_store_&dst_fmt 1, q2, q3
.endm

.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    bilinear_load_mask mask_fmt, 2, d4
    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 2, d4
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_interleave_src_dst \
                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                mask_fmt, 2, d0, d1, q0, d4, \
                q3, q8, q10, q11
    bilinear_combine \
                op, 2, d0, d1, q0, d18, d19, q9, \
                q3, q8, q10, q11, d5
    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
    bilinear_store_&dst_fmt 2, q2, q3
.endm

.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9,  d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    bilinear_load_mask mask_fmt, 4, d22
    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
    pld       [TMP1, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 4, d22
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_interleave_src_dst \
                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
    bilinear_apply_mask_to_src \
                mask_fmt, 4, d0, d1, q0, d22, \
                q3, q8, q9, q10
    bilinear_combine \
                op, 4, d0, d1, q0, d2, d3, q1, \
                q3, q8, q9, q10, d23
    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
    bilinear_store_&dst_fmt 4, q2, q3
.endm

.set BILINEAR_FLAG_USE_MASK,            1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS,   2
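/*
 * BILINEAR_FLAG_USE_MASK selects the three-argument (source, mask,
 * destination) register layout below and makes the generated function expect
 * a mask pointer; BILINEAR_FLAG_USE_ALL_NEON_REGS additionally saves and
 * restores d8-d15 so that the code blocks may clobber the callee-saved NEON
 * registers.
 */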

/*
 * Main template macro for generating NEON optimized bilinear scanline functions.
 *
 * The bilinear scanline generator macro takes the following arguments:
 *  fname                      - name of the function to generate
 *  src_fmt                    - source color format (8888 or 0565)
 *  dst_fmt                    - destination color format (8888 or 0565)
 *  src/dst_bpp_shift          - (1 << bpp_shift) is the size of a src/dst pixel in bytes
 *  process_last_pixel         - code block that interpolates one pixel and does not
 *                               update the horizontal weight
 *  process_two_pixels         - code block that interpolates two pixels and updates
 *                               the horizontal weight
 *  process_four_pixels        - code block that interpolates four pixels and updates
 *                               the horizontal weight
 *  process_pixblock_head      - head part of the middle loop
 *  process_pixblock_tail      - tail part of the middle loop
 *  process_pixblock_tail_head - tail_head part of the middle loop
 *  pixblock_size              - number of pixels processed in a single middle loop iteration
 *  prefetch_distance          - prefetch in the source image by that many pixels ahead
 */
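/*
 * Judging from the register setup below, the generated functions are called
 * from C with roughly the following argument order (the dst pointer type
 * depends on dst_fmt; x and ux are 16.16 fixed-point values):
 *
 *   fname (dst, [mask,] top, bottom, wt, wb, x, ux, width)
 *
 * The emitted code first processes one/two/(four) pixels until the
 * destination is suitably aligned, then runs the pixblock_size middle loop
 * built from the head / tail_head / tail blocks, and finally handles the
 * remaining trailing pixels.
 */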

.macro generate_bilinear_scanline_func \
        fname, \
        src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
        bilinear_process_last_pixel, \
        bilinear_process_two_pixels, \
        bilinear_process_four_pixels, \
        bilinear_process_pixblock_head, \
        bilinear_process_pixblock_tail, \
        bilinear_process_pixblock_tail_head, \
        pixblock_size, \
        prefetch_distance, \
        flags

pixman_asm_function fname
.if pixblock_size == 8
.elseif pixblock_size == 4
.else
    .error unsupported pixblock size
.endif

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    OUT       .req    r0
    TOP       .req    r1
    BOTTOM    .req    r2
    WT        .req    r3
    WB        .req    r4
    X         .req    r5
    UX        .req    r6
    WIDTH     .req    ip
    TMP1      .req    r3
    TMP2      .req    r4
    PF_OFFS   .req    r7
    TMP3      .req    r8
    TMP4      .req    r9
    STRIDE    .req    r2

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}
.else
    OUT       .req      r0
    MASK      .req      r1
    TOP       .req      r2
    BOTTOM    .req      r3
    WT        .req      r4
    WB        .req      r5
    X         .req      r6
    UX        .req      r7
    WIDTH     .req      ip
    TMP1      .req      r4
    TMP2      .req      r5
    PF_OFFS   .req      r8
    TMP3      .req      r9
    TMP4      .req      r10
    STRIDE    .req      r3

    .set prefetch_offset, prefetch_distance

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9, r10, ip}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WT, WB, X, UX, WIDTH}
.endif

    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpush     {d8-d15}
.endif

    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f

    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment  */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_process_last_pixel
    sub       WIDTH, WIDTH, #1
0:
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    beq       0f
    bilinear_process_two_pixels
    sub       WIDTH, WIDTH, #2
0:
.if pixblock_size == 8
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    beq       0f
    bilinear_process_four_pixels
    sub       WIDTH, WIDTH, #4
0:
.endif
    subs      WIDTH, WIDTH, #pixblock_size
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_process_pixblock_head
    subs      WIDTH, WIDTH, #pixblock_size
    blt       5f
0:
    bilinear_process_pixblock_tail_head
    subs      WIDTH, WIDTH, #pixblock_size
    bge       0b
5:
    bilinear_process_pixblock_tail
1:
.if pixblock_size == 8
    tst       WIDTH, #4
    beq       2f
    bilinear_process_four_pixels
2:
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       2f
    bilinear_process_two_pixels
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_process_last_pixel
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    pop       {r4, r5, r6, r7, r8, r9}
.else
    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
.endif
    bx        lr

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
    .unreq    MASK
.endif

.endfunc

.endm

/* src_8888_8_8888 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head
.endm

/* src_8888_8_0565 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head
.endm

/* src_0565_8_x888 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head
.endm

/* src_0565_8_0565 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
.endm

/* over_8888_8888 */
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_pixblock_head
    mov         TMP1, X, asr #16
    add         X, X, UX
    add         TMP1, TOP, TMP1, asl #2
    mov         TMP2, X, asr #16
    add         X, X, UX
    add         TMP2, TOP, TMP2, asl #2

    vld1.32     {d22}, [TMP1], STRIDE
    vld1.32     {d23}, [TMP1]
    mov         TMP3, X, asr #16
    add         X, X, UX
    add         TMP3, TOP, TMP3, asl #2
    vmull.u8    q8, d22, d28
    vmlal.u8    q8, d23, d29

    vld1.32     {d22}, [TMP2], STRIDE
    vld1.32     {d23}, [TMP2]
    mov         TMP4, X, asr #16
    add         X, X, UX
    add         TMP4, TOP, TMP4, asl #2
    vmull.u8    q9, d22, d28
    vmlal.u8    q9, d23, d29

    vld1.32     {d22}, [TMP3], STRIDE
    vld1.32     {d23}, [TMP3]
    vmull.u8    q10, d22, d28
    vmlal.u8    q10, d23, d29

    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16   q0, d16, d30
    vmlal.u16   q0, d17, d30

    pld         [TMP4, PF_OFFS]
    vld1.32     {d16}, [TMP4], STRIDE
    vld1.32     {d17}, [TMP4]
    pld         [TMP4, PF_OFFS]
    vmull.u8    q11, d16, d28
    vmlal.u8    q11, d17, d29

    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16   q1, d18, d31
    vmlal.u16   q1, d19, d31
    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16    q12, q12, q13
.endm

.macro bilinear_over_8888_8888_process_pixblock_tail
    vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16   q2, d20, d30
    vmlal.u16   q2, d21, d30
    vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16   q3, d22, d31
    vmlal.u16   q3, d23, d31
    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32     {d2, d3}, [OUT, :128]
    pld         [OUT, #(prefetch_offset * 4)]
    vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16   d6, q0
    vmovn.u16   d7, q2
    vuzp.8      d6, d7
    vuzp.8      d2, d3
    vuzp.8      d6, d7
    vuzp.8      d2, d3
    vdup.32     d4, d7[1]
    vmvn.8      d4, d4
    vmull.u8    q11, d2, d4
    vmull.u8    q2, d3, d4
    vrshr.u16   q1, q11, #8
    vrshr.u16   q10, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q10, q2
    vqadd.u8    q3, q1, q3
    vuzp.8      d6, d7
    vuzp.8      d6, d7
    vadd.u16    q12, q12, q13
    vst1.32     {d6, d7}, [OUT, :128]!
.endm

.macro bilinear_over_8888_8888_process_pixblock_tail_head
                                            vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
    mov         TMP1, X, asr #16
    add         X, X, UX
    add         TMP1, TOP, TMP1, asl #2
                                            vmlsl.u16   q2, d20, d30
    mov         TMP2, X, asr #16
    add         X, X, UX
    add         TMP2, TOP, TMP2, asl #2
                                            vmlal.u16   q2, d21, d30
                                            vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32     {d20}, [TMP1], STRIDE
                                            vmlsl.u16   q3, d22, d31
                                            vmlal.u16   q3, d23, d31
    vld1.32     {d21}, [TMP1]
    vmull.u8    q8, d20, d28
    vmlal.u8    q8, d21, d29
                                            vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
                                            vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
                                            vld1.32     {d2, d3}, [OUT, :128]
                                            pld         [OUT, PF_OFFS]
                                            vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32     {d22}, [TMP2], STRIDE
                                            vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
                                            vmovn.u16   d6, q0
    vld1.32     {d23}, [TMP2]
    vmull.u8    q9, d22, d28
    mov         TMP3, X, asr #16
    add         X, X, UX
    add         TMP3, TOP, TMP3, asl #2
    mov         TMP4, X, asr #16
    add         X, X, UX
    add         TMP4, TOP, TMP4, asl #2
    vmlal.u8    q9, d23, d29
                                            vmovn.u16   d7, q2
    vld1.32     {d22}, [TMP3], STRIDE
                                            vuzp.8      d6, d7
                                            vuzp.8      d2, d3
                                            vuzp.8      d6, d7
                                            vuzp.8      d2, d3
                                            vdup.32     d4, d7[1]
    vld1.32     {d23}, [TMP3]
                                            vmvn.8      d4, d4
    vmull.u8    q10, d22, d28
    vmlal.u8    q10, d23, d29
                                            vmull.u8    q11, d2, d4
                                            vmull.u8    q2, d3, d4
    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16   q0, d16, d30
                                            vrshr.u16   q1, q11, #8
    vmlal.u16   q0, d17, d30
                                            vrshr.u16   q8, q2, #8
                                            vraddhn.u16 d2, q1, q11
                                            vraddhn.u16 d3, q8, q2
    pld         [TMP4, PF_OFFS]
    vld1.32     {d16}, [TMP4], STRIDE
                                            vqadd.u8    q3, q1, q3
    vld1.32     {d17}, [TMP4]
    pld         [TMP4, PF_OFFS]
    vmull.u8    q11, d16, d28
    vmlal.u8    q11, d17, d29
                                            vuzp.8      d6, d7
    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
                                            vuzp.8      d6, d7
    vmlsl.u16   q1, d18, d31
                                            vadd.u16    q12, q12, q13
    vmlal.u16   q1, d19, d31
    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16    q12, q12, q13
                                            vst1.32     {d6, d7}, [OUT, :128]!
.endm

/* over_8888_8_8888 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_head
    mov         TMP1, X, asr #16
    add         X, X, UX
    add         TMP1, TOP, TMP1, asl #2
    vld1.32     {d0}, [TMP1], STRIDE
    mov         TMP2, X, asr #16
    add         X, X, UX
    add         TMP2, TOP, TMP2, asl #2
    vld1.32     {d1}, [TMP1]
    mov         TMP3, X, asr #16
    add         X, X, UX
    add         TMP3, TOP, TMP3, asl #2
    vld1.32     {d2}, [TMP2], STRIDE
    mov         TMP4, X, asr #16
    add         X, X, UX
    add         TMP4, TOP, TMP4, asl #2
    vld1.32     {d3}, [TMP2]
    vmull.u8    q2, d0, d28
    vmull.u8    q3, d2, d28
    vmlal.u8    q2, d1, d29
    vmlal.u8    q3, d3, d29
    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16   q0, d4, d30
    vmlsl.u16   q1, d6, d31
    vmlal.u16   q0, d5, d30
    vmlal.u16   q1, d7, d31
    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32     {d2}, [TMP3], STRIDE
    vld1.32     {d3}, [TMP3]
    pld         [TMP4, PF_OFFS]
    vld1.32     {d4}, [TMP4], STRIDE
    vld1.32     {d5}, [TMP4]
    pld         [TMP4, PF_OFFS]
    vmull.u8    q3, d2, d28
    vmlal.u8    q3, d3, d29
    vmull.u8    q1, d4, d28
    vmlal.u8    q1, d5, d29
    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32     {d22[0]}, [MASK]!
    pld         [MASK, #prefetch_offset]
    vadd.u16    q12, q12, q13
    vmovn.u16   d16, q0
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail
    vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
    vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16   q9, d6, d30
    vmlsl.u16   q10, d2, d31
    vmlal.u16   q9, d7, d30
    vmlal.u16   q10, d3, d31
    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16    q12, q12, q13
    vdup.32     d22, d22[0]
    vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16   d17, q9
    vld1.32     {d18, d19}, [OUT, :128]
    pld         [OUT, PF_OFFS]
    vuzp.8      d16, d17
    vuzp.8      d18, d19
    vuzp.8      d16, d17
    vuzp.8      d18, d19
    vmull.u8    q10, d16, d22
    vmull.u8    q11, d17, d22
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
    vrshrn.u16  d16, q10, #8
    vrshrn.u16  d17, q11, #8
    vdup.32     d22, d17[1]
    vmvn.8      d22, d22
    vmull.u8    q10, d18, d22
    vmull.u8    q11, d19, d22
    vrshr.u16   q9, q10, #8
    vrshr.u16   q0, q11, #8
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q0, q11
    vqadd.u8    q9, q8, q9
    vuzp.8      d18, d19
    vuzp.8      d18, d19
    vst1.32     {d18, d19}, [OUT, :128]!
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
                                            vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
    mov         TMP1, X, asr #16
    add         X, X, UX
    add         TMP1, TOP, TMP1, asl #2
                                            vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
    vld1.32     {d0}, [TMP1], STRIDE
    mov         TMP2, X, asr #16
    add         X, X, UX
    add         TMP2, TOP, TMP2, asl #2
                                            vmlsl.u16   q9, d6, d30
                                            vmlsl.u16   q10, d2, d31
    vld1.32     {d1}, [TMP1]
    mov         TMP3, X, asr #16
    add         X, X, UX
    add         TMP3, TOP, TMP3, asl #2
                                            vmlal.u16   q9, d7, d30
                                            vmlal.u16   q10, d3, d31
    vld1.32     {d2}, [TMP2], STRIDE
    mov         TMP4, X, asr #16
    add         X, X, UX
    add         TMP4, TOP, TMP4, asl #2
                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
                                            vadd.u16    q12, q12, q13
    vld1.32     {d3}, [TMP2]
                                            vdup.32     d22, d22[0]
                                            vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
                                            vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmull.u8    q2, d0, d28
    vmull.u8    q3, d2, d28
                                            vmovn.u16   d17, q9
                                            vld1.32     {d18, d19}, [OUT, :128]
                                            pld         [OUT, #(prefetch_offset * 4)]
    vmlal.u8    q2, d1, d29
    vmlal.u8    q3, d3, d29
                                            vuzp.8      d16, d17
                                            vuzp.8      d18, d19
    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
                                            vuzp.8      d16, d17
                                            vuzp.8      d18, d19
    vmlsl.u16   q0, d4, d30
    vmlsl.u16   q1, d6, d31
                                            vmull.u8    q10, d16, d22
                                            vmull.u8    q11, d17, d22
    vmlal.u16   q0, d5, d30
    vmlal.u16   q1, d7, d31
                                            vrsra.u16   q10, q10, #8
                                            vrsra.u16   q11, q11, #8
    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
                                            vrshrn.u16  d16, q10, #8
                                            vrshrn.u16  d17, q11, #8
    vld1.32     {d2}, [TMP3], STRIDE
                                            vdup.32     d22, d17[1]
    vld1.32     {d3}, [TMP3]
                                            vmvn.8      d22, d22
    pld         [TMP4, PF_OFFS]
    vld1.32     {d4}, [TMP4], STRIDE
                                            vmull.u8    q10, d18, d22
                                            vmull.u8    q11, d19, d22
    vld1.32     {d5}, [TMP4]
    pld         [TMP4, PF_OFFS]
    vmull.u8    q3, d2, d28
                                            vrshr.u16   q9, q10, #8
                                            vrshr.u16   q15, q11, #8
    vmlal.u8    q3, d3, d29
    vmull.u8    q1, d4, d28
                                            vraddhn.u16 d18, q9, q10
                                            vraddhn.u16 d19, q15, q11
    vmlal.u8    q1, d5, d29
    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
                                            vqadd.u8    q9, q8, q9
    vld1.32     {d22[0]}, [MASK]!
                                            vuzp.8      d18, d19
    vadd.u16    q12, q12, q13
                                            vuzp.8      d18, d19
    vmovn.u16   d16, q0
                                            vst1.32     {d18, d19}, [OUT, :128]!
.endm

/* add_8888_8888 */
.macro bilinear_add_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_pixblock_head
    bilinear_add_8888_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail_head
    bilinear_add_8888_8888_process_pixblock_tail
    bilinear_add_8888_8888_process_pixblock_head
.endm

/* add_8888_8_8888 */
.macro bilinear_add_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_head
    bilinear_add_8888_8_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
    bilinear_add_8888_8_8888_process_pixblock_tail
    bilinear_add_8888_8_8888_process_pixblock_head
.endm


/* Bilinear scanline functions */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_src_8888_8_8888_process_last_pixel, \
    bilinear_src_8888_8_8888_process_two_pixels, \
    bilinear_src_8888_8_8888_process_four_pixels, \
    bilinear_src_8888_8_8888_process_pixblock_head, \
    bilinear_src_8888_8_8888_process_pixblock_tail, \
    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
    8888, 0565, 2, 1, \
    bilinear_src_8888_8_0565_process_last_pixel, \
    bilinear_src_8888_8_0565_process_two_pixels, \
    bilinear_src_8888_8_0565_process_four_pixels, \
    bilinear_src_8888_8_0565_process_pixblock_head, \
    bilinear_src_8888_8_0565_process_pixblock_tail, \
    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
    0565, 8888, 1, 2, \
    bilinear_src_0565_8_x888_process_last_pixel, \
    bilinear_src_0565_8_x888_process_two_pixels, \
    bilinear_src_0565_8_x888_process_four_pixels, \
    bilinear_src_0565_8_x888_process_pixblock_head, \
    bilinear_src_0565_8_x888_process_pixblock_tail, \
    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
    0565, 0565, 1, 1, \
    bilinear_src_0565_8_0565_process_last_pixel, \
    bilinear_src_0565_8_0565_process_two_pixels, \
    bilinear_src_0565_8_0565_process_four_pixels, \
    bilinear_src_0565_8_0565_process_pixblock_head, \
    bilinear_src_0565_8_0565_process_pixblock_tail, \
    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8888_process_last_pixel, \
    bilinear_over_8888_8888_process_two_pixels, \
    bilinear_over_8888_8888_process_four_pixels, \
    bilinear_over_8888_8888_process_pixblock_head, \
    bilinear_over_8888_8888_process_pixblock_tail, \
    bilinear_over_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8_8888_process_last_pixel, \
    bilinear_over_8888_8_8888_process_two_pixels, \
    bilinear_over_8888_8_8888_process_four_pixels, \
    bilinear_over_8888_8_8888_process_pixblock_head, \
    bilinear_over_8888_8_8888_process_pixblock_tail, \
    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8888_process_last_pixel, \
    bilinear_add_8888_8888_process_two_pixels, \
    bilinear_add_8888_8888_process_four_pixels, \
    bilinear_add_8888_8888_process_pixblock_head, \
    bilinear_add_8888_8888_process_pixblock_tail, \
    bilinear_add_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8_8888_process_last_pixel, \
    bilinear_add_8888_8_8888_process_two_pixels, \
    bilinear_add_8888_8_8888_process_four_pixels, \
    bilinear_add_8888_8_8888_process_pixblock_head, \
    bilinear_add_8888_8_8888_process_pixblock_tail, \
    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK