1/*
2 * Copyright © 2009 Nokia Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
24 */
25
26/*
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
31 *
32 * You may want to have a look at the comments for following functions:
33 *  - pixman_composite_over_8888_0565_asm_neon
34 *  - pixman_composite_over_n_8_0565_asm_neon
35 */
36
37/* Prevent the stack from becoming executable for no reason... */
38#if defined(__linux__) && defined(__ELF__)
39.section .note.GNU-stack,"",%progbits
40#endif
41
42    .text
43    .fpu neon
44    .arch armv7a
45    .object_arch armv4
46    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
48    .arm
49    .altmacro
50    .p2align 2
51
52#include "pixman-private.h"
53#include "pixman-arm-neon-asm.h"
54
55/* Global configuration options and preferences */
56
57/*
58 * The code can optionally make use of unaligned memory accesses to improve
59 * performance of handling leading/trailing pixels for each scanline.
60 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
61 * example in linux if unaligned memory accesses are not configured to
62 * generate.exceptions.
63 */
64.set RESPECT_STRICT_ALIGNMENT, 1
65
66/*
67 * Set default prefetch type. There is a choice between the following options:
68 *
69 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
70 * as NOP to workaround some HW bugs or for whatever other reason)
71 *
72 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
73 * advanced prefetch intruduces heavy overhead)
74 *
75 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
76 * which can run ARM and NEON instructions simultaneously so that extra ARM
77 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
78 *
79 * Note: some types of function can't support advanced prefetch and fallback
80 *       to simple one (those which handle 24bpp pixels)
81 */
82.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
83
84/* Prefetch distance in pixels for simple prefetch */
85.set PREFETCH_DISTANCE_SIMPLE, 64
86
87/*
88 * Implementation of pixman_composite_over_8888_0565_asm_neon
89 *
90 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
91 * performs OVER compositing operation. Function fast_composite_over_8888_0565
92 * from pixman-fast-path.c does the same in C and can be used as a reference.
93 *
94 * First we need to have some NEON assembly code which can do the actual
95 * operation on the pixels and provide it to the template macro.
96 *
97 * Template macro quite conveniently takes care of emitting all the necessary
98 * code for memory reading and writing (including quite tricky cases of
99 * handling unaligned leading/trailing pixels), so we only need to deal with
100 * the data in NEON registers.
101 *
102 * NEON registers allocation in general is recommented to be the following:
103 * d0,  d1,  d2,  d3  - contain loaded source pixel data
104 * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
105 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
106 * d28, d29, d30, d31 - place for storing the result (destination pixels)
107 *
108 * As can be seen above, four 64-bit NEON registers are used for keeping
109 * intermediate pixel data and up to 8 pixels can be processed in one step
110 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
111 *
112 * This particular function uses the following registers allocation:
113 * d0,  d1,  d2,  d3  - contain loaded source pixel data
114 * d4,  d5            - contain loaded destination pixels (they are needed)
115 * d28, d29           - place for storing the result (destination pixels)
116 */
117
118/*
119 * Step one. We need to have some code to do some arithmetics on pixel data.
120 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
121 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
122 * perform all the needed calculations and write the result to {d28, d29}.
123 * The rationale for having two macros and not just one will be explained
124 * later. In practice, any single monolitic function which does the work can
125 * be split into two parts in any arbitrary way without affecting correctness.
126 *
127 * There is one special trick here too. Common template macro can optionally
128 * make our life a bit easier by doing R, G, B, A color components
129 * deinterleaving for 32bpp pixel formats (and this feature is used in
130 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
131 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
132 * actually use d0 register for blue channel (a vector of eight 8-bit
133 * values), d1 register for green, d2 for red and d3 for alpha. This
134 * simple conversion can be also done with a few NEON instructions:
135 *
136 * Packed to planar conversion:
137 *  vuzp.8 d0, d1
138 *  vuzp.8 d2, d3
139 *  vuzp.8 d1, d3
140 *  vuzp.8 d0, d2
141 *
142 * Planar to packed conversion:
143 *  vzip.8 d0, d2
144 *  vzip.8 d1, d3
145 *  vzip.8 d2, d3
146 *  vzip.8 d0, d1
147 *
148 * But pixel can be loaded directly in planar format using VLD4.8 NEON
149 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
150 * desirable, that's why deinterleaving is optional.
151 *
152 * But anyway, here is the code:
153 */
154.macro pixman_composite_over_8888_0565_process_pixblock_head
155    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
156       and put data into d6 - red, d7 - green, d30 - blue */
157    vshrn.u16   d6, q2, #8
158    vshrn.u16   d7, q2, #3
159    vsli.u16    q2, q2, #5
160    vsri.u8     d6, d6, #5
161    vmvn.8      d3, d3      /* invert source alpha */
162    vsri.u8     d7, d7, #6
163    vshrn.u16   d30, q2, #2
164    /* now do alpha blending, storing results in 8-bit planar format
165       into d16 - red, d19 - green, d18 - blue */
166    vmull.u8    q10, d3, d6
167    vmull.u8    q11, d3, d7
168    vmull.u8    q12, d3, d30
169    vrshr.u16   q13, q10, #8
170    vrshr.u16   q3, q11, #8
171    vrshr.u16   q15, q12, #8
172    vraddhn.u16 d20, q10, q13
173    vraddhn.u16 d23, q11, q3
174    vraddhn.u16 d22, q12, q15
175.endm
176
177.macro pixman_composite_over_8888_0565_process_pixblock_tail
178    /* ... continue alpha blending */
179    vqadd.u8    d16, d2, d20
180    vqadd.u8    q9, q0, q11
181    /* convert the result to r5g6b5 and store it into {d28, d29} */
182    vshll.u8    q14, d16, #8
183    vshll.u8    q8, d19, #8
184    vshll.u8    q9, d18, #8
185    vsri.u16    q14, q8, #5
186    vsri.u16    q14, q9, #11
187.endm
188
189/*
190 * OK, now we got almost everything that we need. Using the above two
191 * macros, the work can be done right. But now we want to optimize
192 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
193 * a lot from good code scheduling and software pipelining.
194 *
195 * Let's construct some code, which will run in the core main loop.
196 * Some pseudo-code of the main loop will look like this:
197 *   head
198 *   while (...) {
199 *     tail
200 *     head
201 *   }
202 *   tail
203 *
204 * It may look a bit weird, but this setup allows to hide instruction
205 * latencies better and also utilize dual-issue capability more
206 * efficiently (make pairs of load-store and ALU instructions).
207 *
208 * So what we need now is a '*_tail_head' macro, which will be used
209 * in the core main loop. A trivial straightforward implementation
210 * of this macro would look like this:
211 *
212 *   pixman_composite_over_8888_0565_process_pixblock_tail
213 *   vst1.16     {d28, d29}, [DST_W, :128]!
214 *   vld1.16     {d4, d5}, [DST_R, :128]!
215 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
216 *   pixman_composite_over_8888_0565_process_pixblock_head
217 *   cache_preload 8, 8
218 *
219 * Now it also got some VLD/VST instructions. We simply can't move from
220 * processing one block of pixels to the other one with just arithmetics.
221 * The previously processed data needs to be written to memory and new
222 * data needs to be fetched. Fortunately, this main loop does not deal
223 * with partial leading/trailing pixels and can load/store a full block
224 * of pixels in a bulk. Additionally, destination buffer is already
225 * 16 bytes aligned here (which is good for performance).
226 *
227 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
228 * are the aliases for ARM registers which are used as pointers for
229 * accessing data. We maintain separate pointers for reading and writing
230 * destination buffer (DST_R and DST_W).
231 *
232 * Another new thing is 'cache_preload' macro. It is used for prefetching
233 * data into CPU L2 cache and improve performance when dealing with large
234 * images which are far larger than cache size. It uses one argument
235 * (actually two, but they need to be the same here) - number of pixels
236 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
237 * details about this macro. Moreover, if good performance is needed
238 * the code from this macro needs to be copied into '*_tail_head' macro
239 * and mixed with the rest of code for optimal instructions scheduling.
240 * We are actually doing it below.
241 *
242 * Now after all the explanations, here is the optimized code.
243 * Different instruction streams (originaling from '*_head', '*_tail'
244 * and 'cache_preload' macro) use different indentation levels for
245 * better readability. Actually taking the code from one of these
246 * indentation levels and ignoring a few VLD/VST instructions would
247 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
248 * macro!
249 */
250
251#if 1
252
253.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
254        vqadd.u8    d16, d2, d20
255    vld1.16     {d4, d5}, [DST_R, :128]!
256        vqadd.u8    q9, q0, q11
257    vshrn.u16   d6, q2, #8
258    fetch_src_pixblock
259    vshrn.u16   d7, q2, #3
260    vsli.u16    q2, q2, #5
261        vshll.u8    q14, d16, #8
262                                    PF add PF_X, PF_X, #8
263        vshll.u8    q8, d19, #8
264                                    PF tst PF_CTL, #0xF
265    vsri.u8     d6, d6, #5
266                                    PF addne PF_X, PF_X, #8
267    vmvn.8      d3, d3
268                                    PF subne PF_CTL, PF_CTL, #1
269    vsri.u8     d7, d7, #6
270    vshrn.u16   d30, q2, #2
271    vmull.u8    q10, d3, d6
272                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
273    vmull.u8    q11, d3, d7
274    vmull.u8    q12, d3, d30
275                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
276        vsri.u16    q14, q8, #5
277                                    PF cmp PF_X, ORIG_W
278        vshll.u8    q9, d18, #8
279    vrshr.u16   q13, q10, #8
280                                    PF subge PF_X, PF_X, ORIG_W
281    vrshr.u16   q3, q11, #8
282    vrshr.u16   q15, q12, #8
283                                    PF subges PF_CTL, PF_CTL, #0x10
284        vsri.u16    q14, q9, #11
285                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
286    vraddhn.u16 d20, q10, q13
287    vraddhn.u16 d23, q11, q3
288                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
289    vraddhn.u16 d22, q12, q15
290        vst1.16     {d28, d29}, [DST_W, :128]!
291.endm
292
293#else
294
295/* If we did not care much about the performance, we would just use this... */
296.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
297    pixman_composite_over_8888_0565_process_pixblock_tail
298    vst1.16     {d28, d29}, [DST_W, :128]!
299    vld1.16     {d4, d5}, [DST_R, :128]!
300    fetch_src_pixblock
301    pixman_composite_over_8888_0565_process_pixblock_head
302    cache_preload 8, 8
303.endm
304
305#endif
306
307/*
308 * And now the final part. We are using 'generate_composite_function' macro
309 * to put all the stuff together. We are specifying the name of the function
310 * which we want to get, number of bits per pixel for the source, mask and
311 * destination (0 if unused, like mask in this case). Next come some bit
312 * flags:
313 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
314 *                             and written, for write-only buffer we would use
315 *                             FLAG_DST_WRITEONLY flag instead
316 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
317 *                             and separate color channels for 32bpp format.
318 * The next things are:
319 *  - the number of pixels processed per iteration (8 in this case, because
320 *    that's the maximum what can fit into four 64-bit NEON registers).
321 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
322 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
323 *    prefetch distance can be selected by running some benchmarks.
324 *
325 * After that we specify some macros, these are 'default_init',
326 * 'default_cleanup' here which are empty (but it is possible to have custom
327 * init/cleanup macros to be able to save/restore some extra NEON registers
328 * like d8-d15 or do anything else) followed by
329 * 'pixman_composite_over_8888_0565_process_pixblock_head',
330 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
331 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
332 * which we got implemented above.
333 *
334 * The last part is the NEON registers allocation scheme.
335 */
336generate_composite_function \
337    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
338    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
339    8, /* number of pixels, processed in a single block */ \
340    5, /* prefetch distance */ \
341    default_init, \
342    default_cleanup, \
343    pixman_composite_over_8888_0565_process_pixblock_head, \
344    pixman_composite_over_8888_0565_process_pixblock_tail, \
345    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
346    28, /* dst_w_basereg */ \
347    4,  /* dst_r_basereg */ \
348    0,  /* src_basereg   */ \
349    24  /* mask_basereg  */
350
351/******************************************************************************/
352
353.macro pixman_composite_over_n_0565_process_pixblock_head
354    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
355       and put data into d6 - red, d7 - green, d30 - blue */
356    vshrn.u16   d6, q2, #8
357    vshrn.u16   d7, q2, #3
358    vsli.u16    q2, q2, #5
359    vsri.u8     d6, d6, #5
360    vsri.u8     d7, d7, #6
361    vshrn.u16   d30, q2, #2
362    /* now do alpha blending, storing results in 8-bit planar format
363       into d16 - red, d19 - green, d18 - blue */
364    vmull.u8    q10, d3, d6
365    vmull.u8    q11, d3, d7
366    vmull.u8    q12, d3, d30
367    vrshr.u16   q13, q10, #8
368    vrshr.u16   q3, q11, #8
369    vrshr.u16   q15, q12, #8
370    vraddhn.u16 d20, q10, q13
371    vraddhn.u16 d23, q11, q3
372    vraddhn.u16 d22, q12, q15
373.endm
374
375.macro pixman_composite_over_n_0565_process_pixblock_tail
376    /* ... continue alpha blending */
377    vqadd.u8    d16, d2, d20
378    vqadd.u8    q9, q0, q11
379    /* convert the result to r5g6b5 and store it into {d28, d29} */
380    vshll.u8    q14, d16, #8
381    vshll.u8    q8, d19, #8
382    vshll.u8    q9, d18, #8
383    vsri.u16    q14, q8, #5
384    vsri.u16    q14, q9, #11
385.endm
386
387/* TODO: expand macros and do better instructions scheduling */
388.macro pixman_composite_over_n_0565_process_pixblock_tail_head
389    pixman_composite_over_n_0565_process_pixblock_tail
390    vld1.16     {d4, d5}, [DST_R, :128]!
391    vst1.16     {d28, d29}, [DST_W, :128]!
392    pixman_composite_over_n_0565_process_pixblock_head
393    cache_preload 8, 8
394.endm
395
396.macro pixman_composite_over_n_0565_init
397    add         DUMMY, sp, #ARGS_STACK_OFFSET
398    vld1.32     {d3[0]}, [DUMMY]
399    vdup.8      d0, d3[0]
400    vdup.8      d1, d3[1]
401    vdup.8      d2, d3[2]
402    vdup.8      d3, d3[3]
403    vmvn.8      d3, d3      /* invert source alpha */
404.endm
405
406generate_composite_function \
407    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
408    FLAG_DST_READWRITE, \
409    8, /* number of pixels, processed in a single block */ \
410    5, /* prefetch distance */ \
411    pixman_composite_over_n_0565_init, \
412    default_cleanup, \
413    pixman_composite_over_n_0565_process_pixblock_head, \
414    pixman_composite_over_n_0565_process_pixblock_tail, \
415    pixman_composite_over_n_0565_process_pixblock_tail_head, \
416    28, /* dst_w_basereg */ \
417    4,  /* dst_r_basereg */ \
418    0,  /* src_basereg   */ \
419    24  /* mask_basereg  */
420
421/******************************************************************************/
422
423.macro pixman_composite_src_8888_0565_process_pixblock_head
424    vshll.u8    q8, d1, #8
425    vshll.u8    q14, d2, #8
426    vshll.u8    q9, d0, #8
427.endm
428
429.macro pixman_composite_src_8888_0565_process_pixblock_tail
430    vsri.u16    q14, q8, #5
431    vsri.u16    q14, q9, #11
432.endm
433
434.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
435        vsri.u16    q14, q8, #5
436                                    PF add PF_X, PF_X, #8
437                                    PF tst PF_CTL, #0xF
438    fetch_src_pixblock
439                                    PF addne PF_X, PF_X, #8
440                                    PF subne PF_CTL, PF_CTL, #1
441        vsri.u16    q14, q9, #11
442                                    PF cmp PF_X, ORIG_W
443                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
444    vshll.u8    q8, d1, #8
445        vst1.16     {d28, d29}, [DST_W, :128]!
446                                    PF subge PF_X, PF_X, ORIG_W
447                                    PF subges PF_CTL, PF_CTL, #0x10
448    vshll.u8    q14, d2, #8
449                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
450    vshll.u8    q9, d0, #8
451.endm
452
453generate_composite_function \
454    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
455    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
456    8, /* number of pixels, processed in a single block */ \
457    10, /* prefetch distance */ \
458    default_init, \
459    default_cleanup, \
460    pixman_composite_src_8888_0565_process_pixblock_head, \
461    pixman_composite_src_8888_0565_process_pixblock_tail, \
462    pixman_composite_src_8888_0565_process_pixblock_tail_head
463
464/******************************************************************************/
465
466.macro pixman_composite_src_0565_8888_process_pixblock_head
467    vshrn.u16   d30, q0, #8
468    vshrn.u16   d29, q0, #3
469    vsli.u16    q0, q0, #5
470    vmov.u8     d31, #255
471    vsri.u8     d30, d30, #5
472    vsri.u8     d29, d29, #6
473    vshrn.u16   d28, q0, #2
474.endm
475
476.macro pixman_composite_src_0565_8888_process_pixblock_tail
477.endm
478
479/* TODO: expand macros and do better instructions scheduling */
480.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
481    pixman_composite_src_0565_8888_process_pixblock_tail
482    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
483    fetch_src_pixblock
484    pixman_composite_src_0565_8888_process_pixblock_head
485    cache_preload 8, 8
486.endm
487
488generate_composite_function \
489    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
490    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
491    8, /* number of pixels, processed in a single block */ \
492    10, /* prefetch distance */ \
493    default_init, \
494    default_cleanup, \
495    pixman_composite_src_0565_8888_process_pixblock_head, \
496    pixman_composite_src_0565_8888_process_pixblock_tail, \
497    pixman_composite_src_0565_8888_process_pixblock_tail_head
498
499/******************************************************************************/
500
501.macro pixman_composite_add_8_8_process_pixblock_head
502    vqadd.u8    q14, q0, q2
503    vqadd.u8    q15, q1, q3
504.endm
505
506.macro pixman_composite_add_8_8_process_pixblock_tail
507.endm
508
509.macro pixman_composite_add_8_8_process_pixblock_tail_head
510    fetch_src_pixblock
511                                    PF add PF_X, PF_X, #32
512                                    PF tst PF_CTL, #0xF
513    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
514                                    PF addne PF_X, PF_X, #32
515                                    PF subne PF_CTL, PF_CTL, #1
516        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
517                                    PF cmp PF_X, ORIG_W
518                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
519                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
520                                    PF subge PF_X, PF_X, ORIG_W
521                                    PF subges PF_CTL, PF_CTL, #0x10
522    vqadd.u8    q14, q0, q2
523                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
524                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
525    vqadd.u8    q15, q1, q3
526.endm
527
528generate_composite_function \
529    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
530    FLAG_DST_READWRITE, \
531    32, /* number of pixels, processed in a single block */ \
532    10, /* prefetch distance */ \
533    default_init, \
534    default_cleanup, \
535    pixman_composite_add_8_8_process_pixblock_head, \
536    pixman_composite_add_8_8_process_pixblock_tail, \
537    pixman_composite_add_8_8_process_pixblock_tail_head
538
539/******************************************************************************/
540
541.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
542    fetch_src_pixblock
543                                    PF add PF_X, PF_X, #8
544                                    PF tst PF_CTL, #0xF
545    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
546                                    PF addne PF_X, PF_X, #8
547                                    PF subne PF_CTL, PF_CTL, #1
548        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
549                                    PF cmp PF_X, ORIG_W
550                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
551                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
552                                    PF subge PF_X, PF_X, ORIG_W
553                                    PF subges PF_CTL, PF_CTL, #0x10
554    vqadd.u8    q14, q0, q2
555                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
556                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
557    vqadd.u8    q15, q1, q3
558.endm
559
560generate_composite_function \
561    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
562    FLAG_DST_READWRITE, \
563    8, /* number of pixels, processed in a single block */ \
564    10, /* prefetch distance */ \
565    default_init, \
566    default_cleanup, \
567    pixman_composite_add_8_8_process_pixblock_head, \
568    pixman_composite_add_8_8_process_pixblock_tail, \
569    pixman_composite_add_8888_8888_process_pixblock_tail_head
570
571generate_composite_function_single_scanline \
572    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
573    FLAG_DST_READWRITE, \
574    8, /* number of pixels, processed in a single block */ \
575    default_init, \
576    default_cleanup, \
577    pixman_composite_add_8_8_process_pixblock_head, \
578    pixman_composite_add_8_8_process_pixblock_tail, \
579    pixman_composite_add_8888_8888_process_pixblock_tail_head
580
581/******************************************************************************/
582
583.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
584    vmvn.8      d24, d3  /* get inverted alpha */
585    /* do alpha blending */
586    vmull.u8    q8, d24, d4
587    vmull.u8    q9, d24, d5
588    vmull.u8    q10, d24, d6
589    vmull.u8    q11, d24, d7
590.endm
591
592.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
593    vrshr.u16   q14, q8, #8
594    vrshr.u16   q15, q9, #8
595    vrshr.u16   q12, q10, #8
596    vrshr.u16   q13, q11, #8
597    vraddhn.u16 d28, q14, q8
598    vraddhn.u16 d29, q15, q9
599    vraddhn.u16 d30, q12, q10
600    vraddhn.u16 d31, q13, q11
601.endm
602
603.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
604    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
605        vrshr.u16   q14, q8, #8
606                                    PF add PF_X, PF_X, #8
607                                    PF tst PF_CTL, #0xF
608        vrshr.u16   q15, q9, #8
609        vrshr.u16   q12, q10, #8
610        vrshr.u16   q13, q11, #8
611                                    PF addne PF_X, PF_X, #8
612                                    PF subne PF_CTL, PF_CTL, #1
613        vraddhn.u16 d28, q14, q8
614        vraddhn.u16 d29, q15, q9
615                                    PF cmp PF_X, ORIG_W
616        vraddhn.u16 d30, q12, q10
617        vraddhn.u16 d31, q13, q11
618    fetch_src_pixblock
619                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
620    vmvn.8      d22, d3
621                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
622        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
623                                    PF subge PF_X, PF_X, ORIG_W
624    vmull.u8    q8, d22, d4
625                                    PF subges PF_CTL, PF_CTL, #0x10
626    vmull.u8    q9, d22, d5
627                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
628    vmull.u8    q10, d22, d6
629                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
630    vmull.u8    q11, d22, d7
631.endm
632
633generate_composite_function_single_scanline \
634    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
635    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
636    8, /* number of pixels, processed in a single block */ \
637    default_init, \
638    default_cleanup, \
639    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
640    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
641    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
642
643/******************************************************************************/
644
645.macro pixman_composite_over_8888_8888_process_pixblock_head
646    pixman_composite_out_reverse_8888_8888_process_pixblock_head
647.endm
648
649.macro pixman_composite_over_8888_8888_process_pixblock_tail
650    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
651    vqadd.u8    q14, q0, q14
652    vqadd.u8    q15, q1, q15
653.endm
654
655.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
656    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
657        vrshr.u16   q14, q8, #8
658                                    PF add PF_X, PF_X, #8
659                                    PF tst PF_CTL, #0xF
660        vrshr.u16   q15, q9, #8
661        vrshr.u16   q12, q10, #8
662        vrshr.u16   q13, q11, #8
663                                    PF addne PF_X, PF_X, #8
664                                    PF subne PF_CTL, PF_CTL, #1
665        vraddhn.u16 d28, q14, q8
666        vraddhn.u16 d29, q15, q9
667                                    PF cmp PF_X, ORIG_W
668        vraddhn.u16 d30, q12, q10
669        vraddhn.u16 d31, q13, q11
670        vqadd.u8    q14, q0, q14
671        vqadd.u8    q15, q1, q15
672    fetch_src_pixblock
673                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
674    vmvn.8      d22, d3
675                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
676        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
677                                    PF subge PF_X, PF_X, ORIG_W
678    vmull.u8    q8, d22, d4
679                                    PF subges PF_CTL, PF_CTL, #0x10
680    vmull.u8    q9, d22, d5
681                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
682    vmull.u8    q10, d22, d6
683                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
684    vmull.u8    q11, d22, d7
685.endm
686
687generate_composite_function \
688    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
689    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
690    8, /* number of pixels, processed in a single block */ \
691    5, /* prefetch distance */ \
692    default_init, \
693    default_cleanup, \
694    pixman_composite_over_8888_8888_process_pixblock_head, \
695    pixman_composite_over_8888_8888_process_pixblock_tail, \
696    pixman_composite_over_8888_8888_process_pixblock_tail_head
697
698generate_composite_function_single_scanline \
699    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
700    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
701    8, /* number of pixels, processed in a single block */ \
702    default_init, \
703    default_cleanup, \
704    pixman_composite_over_8888_8888_process_pixblock_head, \
705    pixman_composite_over_8888_8888_process_pixblock_tail, \
706    pixman_composite_over_8888_8888_process_pixblock_tail_head
707
708/******************************************************************************/
709
710.macro pixman_composite_over_n_8888_process_pixblock_head
711    /* deinterleaved source pixels in {d0, d1, d2, d3} */
712    /* inverted alpha in {d24} */
713    /* destination pixels in {d4, d5, d6, d7} */
714    vmull.u8    q8, d24, d4
715    vmull.u8    q9, d24, d5
716    vmull.u8    q10, d24, d6
717    vmull.u8    q11, d24, d7
718.endm
719
720.macro pixman_composite_over_n_8888_process_pixblock_tail
721    vrshr.u16   q14, q8, #8
722    vrshr.u16   q15, q9, #8
723    vrshr.u16   q2, q10, #8
724    vrshr.u16   q3, q11, #8
725    vraddhn.u16 d28, q14, q8
726    vraddhn.u16 d29, q15, q9
727    vraddhn.u16 d30, q2, q10
728    vraddhn.u16 d31, q3, q11
729    vqadd.u8    q14, q0, q14
730    vqadd.u8    q15, q1, q15
731.endm
732
733.macro pixman_composite_over_n_8888_process_pixblock_tail_head
734        vrshr.u16   q14, q8, #8
735        vrshr.u16   q15, q9, #8
736        vrshr.u16   q2, q10, #8
737        vrshr.u16   q3, q11, #8
738        vraddhn.u16 d28, q14, q8
739        vraddhn.u16 d29, q15, q9
740        vraddhn.u16 d30, q2, q10
741        vraddhn.u16 d31, q3, q11
742    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
743        vqadd.u8    q14, q0, q14
744                                    PF add PF_X, PF_X, #8
745                                    PF tst PF_CTL, #0x0F
746                                    PF addne PF_X, PF_X, #8
747                                    PF subne PF_CTL, PF_CTL, #1
748        vqadd.u8    q15, q1, q15
749                                    PF cmp PF_X, ORIG_W
750    vmull.u8    q8, d24, d4
751                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
752    vmull.u8    q9, d24, d5
753                                    PF subge PF_X, PF_X, ORIG_W
754    vmull.u8    q10, d24, d6
755                                    PF subges PF_CTL, PF_CTL, #0x10
756    vmull.u8    q11, d24, d7
757                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
758        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
759.endm
760
761.macro pixman_composite_over_n_8888_init
762    add         DUMMY, sp, #ARGS_STACK_OFFSET
763    vld1.32     {d3[0]}, [DUMMY]
764    vdup.8      d0, d3[0]
765    vdup.8      d1, d3[1]
766    vdup.8      d2, d3[2]
767    vdup.8      d3, d3[3]
768    vmvn.8      d24, d3  /* get inverted alpha */
769.endm
770
771generate_composite_function \
772    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
773    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
774    8, /* number of pixels, processed in a single block */ \
775    5, /* prefetch distance */ \
776    pixman_composite_over_n_8888_init, \
777    default_cleanup, \
778    pixman_composite_over_8888_8888_process_pixblock_head, \
779    pixman_composite_over_8888_8888_process_pixblock_tail, \
780    pixman_composite_over_n_8888_process_pixblock_tail_head
781
782/******************************************************************************/
783
784.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
785        vrshr.u16   q14, q8, #8
786                                    PF add PF_X, PF_X, #8
787                                    PF tst PF_CTL, #0xF
788        vrshr.u16   q15, q9, #8
789        vrshr.u16   q12, q10, #8
790        vrshr.u16   q13, q11, #8
791                                    PF addne PF_X, PF_X, #8
792                                    PF subne PF_CTL, PF_CTL, #1
793        vraddhn.u16 d28, q14, q8
794        vraddhn.u16 d29, q15, q9
795                                    PF cmp PF_X, ORIG_W
796        vraddhn.u16 d30, q12, q10
797        vraddhn.u16 d31, q13, q11
798        vqadd.u8    q14, q0, q14
799        vqadd.u8    q15, q1, q15
800    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
801    vmvn.8      d22, d3
802                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
803        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
804                                    PF subge PF_X, PF_X, ORIG_W
805    vmull.u8    q8, d22, d4
806                                    PF subges PF_CTL, PF_CTL, #0x10
807    vmull.u8    q9, d22, d5
808    vmull.u8    q10, d22, d6
809                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
810    vmull.u8    q11, d22, d7
811.endm
812
813.macro pixman_composite_over_reverse_n_8888_init
814    add         DUMMY, sp, #ARGS_STACK_OFFSET
815    vld1.32     {d7[0]}, [DUMMY]
816    vdup.8      d4, d7[0]
817    vdup.8      d5, d7[1]
818    vdup.8      d6, d7[2]
819    vdup.8      d7, d7[3]
820.endm
821
822generate_composite_function \
823    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
824    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
825    8, /* number of pixels, processed in a single block */ \
826    5, /* prefetch distance */ \
827    pixman_composite_over_reverse_n_8888_init, \
828    default_cleanup, \
829    pixman_composite_over_8888_8888_process_pixblock_head, \
830    pixman_composite_over_8888_8888_process_pixblock_tail, \
831    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
832    28, /* dst_w_basereg */ \
833    0,  /* dst_r_basereg */ \
834    4,  /* src_basereg   */ \
835    24  /* mask_basereg  */
836
837/******************************************************************************/
838
839.macro pixman_composite_over_8888_8_0565_process_pixblock_head
840    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
841    vmull.u8    q1,  d24, d9
842    vmull.u8    q6,  d24, d10
843    vmull.u8    q7,  d24, d11
844        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
845        vshrn.u16   d7,  q2, #3
846        vsli.u16    q2,  q2, #5
847    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
848    vrshr.u16   q9,  q1,  #8
849    vrshr.u16   q10, q6,  #8
850    vrshr.u16   q11, q7,  #8
851    vraddhn.u16 d0,  q0,  q8
852    vraddhn.u16 d1,  q1,  q9
853    vraddhn.u16 d2,  q6,  q10
854    vraddhn.u16 d3,  q7,  q11
855        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
856        vsri.u8     d7,  d7, #6
857    vmvn.8      d3,  d3
858        vshrn.u16   d30, q2, #2
859    vmull.u8    q8,  d3, d6     /* now do alpha blending */
860    vmull.u8    q9,  d3, d7
861    vmull.u8    q10, d3, d30
862.endm
863
864.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
865    /* 3 cycle bubble (after vmull.u8) */
866    vrshr.u16   q13, q8,  #8
867    vrshr.u16   q11, q9,  #8
868    vrshr.u16   q15, q10, #8
869    vraddhn.u16 d16, q8,  q13
870    vraddhn.u16 d27, q9,  q11
871    vraddhn.u16 d26, q10, q15
872    vqadd.u8    d16, d2,  d16
873    /* 1 cycle bubble */
874    vqadd.u8    q9,  q0,  q13
875    vshll.u8    q14, d16, #8    /* convert to 16bpp */
876    vshll.u8    q8,  d19, #8
877    vshll.u8    q9,  d18, #8
878    vsri.u16    q14, q8,  #5
879    /* 1 cycle bubble */
880    vsri.u16    q14, q9,  #11
881.endm
882
883.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
884    vld1.16     {d4, d5}, [DST_R, :128]!
885    vshrn.u16   d6,  q2,  #8
886    fetch_mask_pixblock
887    vshrn.u16   d7,  q2,  #3
888    fetch_src_pixblock
889    vmull.u8    q6,  d24, d10
890        vrshr.u16   q13, q8,  #8
891        vrshr.u16   q11, q9,  #8
892        vrshr.u16   q15, q10, #8
893        vraddhn.u16 d16, q8,  q13
894        vraddhn.u16 d27, q9,  q11
895        vraddhn.u16 d26, q10, q15
896        vqadd.u8    d16, d2,  d16
897    vmull.u8    q1,  d24, d9
898        vqadd.u8    q9,  q0,  q13
899        vshll.u8    q14, d16, #8
900    vmull.u8    q0,  d24, d8
901        vshll.u8    q8,  d19, #8
902        vshll.u8    q9,  d18, #8
903        vsri.u16    q14, q8,  #5
904    vmull.u8    q7,  d24, d11
905        vsri.u16    q14, q9,  #11
906
907    cache_preload 8, 8
908
909    vsli.u16    q2,  q2,  #5
910    vrshr.u16   q8,  q0,  #8
911    vrshr.u16   q9,  q1,  #8
912    vrshr.u16   q10, q6,  #8
913    vrshr.u16   q11, q7,  #8
914    vraddhn.u16 d0,  q0,  q8
915    vraddhn.u16 d1,  q1,  q9
916    vraddhn.u16 d2,  q6,  q10
917    vraddhn.u16 d3,  q7,  q11
918    vsri.u8     d6,  d6,  #5
919    vsri.u8     d7,  d7,  #6
920    vmvn.8      d3,  d3
921    vshrn.u16   d30, q2,  #2
922    vst1.16     {d28, d29}, [DST_W, :128]!
923    vmull.u8    q8,  d3,  d6
924    vmull.u8    q9,  d3,  d7
925    vmull.u8    q10, d3,  d30
926.endm
927
928generate_composite_function \
929    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
930    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
931    8, /* number of pixels, processed in a single block */ \
932    5, /* prefetch distance */ \
933    default_init_need_all_regs, \
934    default_cleanup_need_all_regs, \
935    pixman_composite_over_8888_8_0565_process_pixblock_head, \
936    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
937    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
938    28, /* dst_w_basereg */ \
939    4,  /* dst_r_basereg */ \
940    8,  /* src_basereg   */ \
941    24  /* mask_basereg  */
942
943/******************************************************************************/
944
945/*
946 * This function needs a special initialization of solid mask.
947 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
948 * offset, split into color components and replicated in d8-d11
949 * registers. Additionally, this function needs all the NEON registers,
950 * so it has to save d8-d15 registers which are callee saved according
951 * to ABI. These registers are restored from 'cleanup' macro. All the
952 * other NEON registers are caller saved, so can be clobbered freely
953 * without introducing any problems.
954 */
955.macro pixman_composite_over_n_8_0565_init
956    add         DUMMY, sp, #ARGS_STACK_OFFSET
957    vpush       {d8-d15}
958    vld1.32     {d11[0]}, [DUMMY]
959    vdup.8      d8, d11[0]
960    vdup.8      d9, d11[1]
961    vdup.8      d10, d11[2]
962    vdup.8      d11, d11[3]
963.endm
964
965.macro pixman_composite_over_n_8_0565_cleanup
966    vpop        {d8-d15}
967.endm
968
969generate_composite_function \
970    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
971    FLAG_DST_READWRITE, \
972    8, /* number of pixels, processed in a single block */ \
973    5, /* prefetch distance */ \
974    pixman_composite_over_n_8_0565_init, \
975    pixman_composite_over_n_8_0565_cleanup, \
976    pixman_composite_over_8888_8_0565_process_pixblock_head, \
977    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
978    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
979
980/******************************************************************************/
981
982.macro pixman_composite_over_8888_n_0565_init
983    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
984    vpush       {d8-d15}
985    vld1.32     {d24[0]}, [DUMMY]
986    vdup.8      d24, d24[3]
987.endm
988
989.macro pixman_composite_over_8888_n_0565_cleanup
990    vpop        {d8-d15}
991.endm
992
993generate_composite_function \
994    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
995    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
996    8, /* number of pixels, processed in a single block */ \
997    5, /* prefetch distance */ \
998    pixman_composite_over_8888_n_0565_init, \
999    pixman_composite_over_8888_n_0565_cleanup, \
1000    pixman_composite_over_8888_8_0565_process_pixblock_head, \
1001    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
1002    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
1003    28, /* dst_w_basereg */ \
1004    4,  /* dst_r_basereg */ \
1005    8,  /* src_basereg   */ \
1006    24  /* mask_basereg  */
1007
1008/******************************************************************************/
1009
1010.macro pixman_composite_src_0565_0565_process_pixblock_head
1011.endm
1012
1013.macro pixman_composite_src_0565_0565_process_pixblock_tail
1014.endm
1015
1016.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
1017    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1018    fetch_src_pixblock
1019    cache_preload 16, 16
1020.endm
1021
1022generate_composite_function \
1023    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
1024    FLAG_DST_WRITEONLY, \
1025    16, /* number of pixels, processed in a single block */ \
1026    10, /* prefetch distance */ \
1027    default_init, \
1028    default_cleanup, \
1029    pixman_composite_src_0565_0565_process_pixblock_head, \
1030    pixman_composite_src_0565_0565_process_pixblock_tail, \
1031    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
1032    0, /* dst_w_basereg */ \
1033    0, /* dst_r_basereg */ \
1034    0, /* src_basereg   */ \
1035    0  /* mask_basereg  */
1036
1037/******************************************************************************/
1038
1039.macro pixman_composite_src_n_8_process_pixblock_head
1040.endm
1041
1042.macro pixman_composite_src_n_8_process_pixblock_tail
1043.endm
1044
1045.macro pixman_composite_src_n_8_process_pixblock_tail_head
1046    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
1047.endm
1048
1049.macro pixman_composite_src_n_8_init
1050    add         DUMMY, sp, #ARGS_STACK_OFFSET
1051    vld1.32     {d0[0]}, [DUMMY]
1052    vsli.u64    d0, d0, #8
1053    vsli.u64    d0, d0, #16
1054    vsli.u64    d0, d0, #32
1055    vorr        d1, d0, d0
1056    vorr        q1, q0, q0
1057.endm
1058
1059.macro pixman_composite_src_n_8_cleanup
1060.endm
1061
1062generate_composite_function \
1063    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1064    FLAG_DST_WRITEONLY, \
1065    32, /* number of pixels, processed in a single block */ \
1066    0,  /* prefetch distance */ \
1067    pixman_composite_src_n_8_init, \
1068    pixman_composite_src_n_8_cleanup, \
1069    pixman_composite_src_n_8_process_pixblock_head, \
1070    pixman_composite_src_n_8_process_pixblock_tail, \
1071    pixman_composite_src_n_8_process_pixblock_tail_head, \
1072    0, /* dst_w_basereg */ \
1073    0, /* dst_r_basereg */ \
1074    0, /* src_basereg   */ \
1075    0  /* mask_basereg  */
1076
1077/******************************************************************************/
1078
1079.macro pixman_composite_src_n_0565_process_pixblock_head
1080.endm
1081
1082.macro pixman_composite_src_n_0565_process_pixblock_tail
1083.endm
1084
1085.macro pixman_composite_src_n_0565_process_pixblock_tail_head
1086    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1087.endm
1088
1089.macro pixman_composite_src_n_0565_init
1090    add         DUMMY, sp, #ARGS_STACK_OFFSET
1091    vld1.32     {d0[0]}, [DUMMY]
1092    vsli.u64    d0, d0, #16
1093    vsli.u64    d0, d0, #32
1094    vorr        d1, d0, d0
1095    vorr        q1, q0, q0
1096.endm
1097
1098.macro pixman_composite_src_n_0565_cleanup
1099.endm
1100
1101generate_composite_function \
1102    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1103    FLAG_DST_WRITEONLY, \
1104    16, /* number of pixels, processed in a single block */ \
1105    0,  /* prefetch distance */ \
1106    pixman_composite_src_n_0565_init, \
1107    pixman_composite_src_n_0565_cleanup, \
1108    pixman_composite_src_n_0565_process_pixblock_head, \
1109    pixman_composite_src_n_0565_process_pixblock_tail, \
1110    pixman_composite_src_n_0565_process_pixblock_tail_head, \
1111    0, /* dst_w_basereg */ \
1112    0, /* dst_r_basereg */ \
1113    0, /* src_basereg   */ \
1114    0  /* mask_basereg  */
1115
1116/******************************************************************************/
1117
1118.macro pixman_composite_src_n_8888_process_pixblock_head
1119.endm
1120
1121.macro pixman_composite_src_n_8888_process_pixblock_tail
1122.endm
1123
1124.macro pixman_composite_src_n_8888_process_pixblock_tail_head
1125    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1126.endm
1127
1128.macro pixman_composite_src_n_8888_init
1129    add         DUMMY, sp, #ARGS_STACK_OFFSET
1130    vld1.32     {d0[0]}, [DUMMY]
1131    vsli.u64    d0, d0, #32
1132    vorr        d1, d0, d0
1133    vorr        q1, q0, q0
1134.endm
1135
1136.macro pixman_composite_src_n_8888_cleanup
1137.endm
1138
1139generate_composite_function \
1140    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1141    FLAG_DST_WRITEONLY, \
1142    8, /* number of pixels, processed in a single block */ \
1143    0, /* prefetch distance */ \
1144    pixman_composite_src_n_8888_init, \
1145    pixman_composite_src_n_8888_cleanup, \
1146    pixman_composite_src_n_8888_process_pixblock_head, \
1147    pixman_composite_src_n_8888_process_pixblock_tail, \
1148    pixman_composite_src_n_8888_process_pixblock_tail_head, \
1149    0, /* dst_w_basereg */ \
1150    0, /* dst_r_basereg */ \
1151    0, /* src_basereg   */ \
1152    0  /* mask_basereg  */
1153
1154/******************************************************************************/
1155
1156.macro pixman_composite_src_8888_8888_process_pixblock_head
1157.endm
1158
1159.macro pixman_composite_src_8888_8888_process_pixblock_tail
1160.endm
1161
1162.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1163    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1164    fetch_src_pixblock
1165    cache_preload 8, 8
1166.endm
1167
1168generate_composite_function \
1169    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1170    FLAG_DST_WRITEONLY, \
1171    8, /* number of pixels, processed in a single block */ \
1172    10, /* prefetch distance */ \
1173    default_init, \
1174    default_cleanup, \
1175    pixman_composite_src_8888_8888_process_pixblock_head, \
1176    pixman_composite_src_8888_8888_process_pixblock_tail, \
1177    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1178    0, /* dst_w_basereg */ \
1179    0, /* dst_r_basereg */ \
1180    0, /* src_basereg   */ \
1181    0  /* mask_basereg  */
1182
1183/******************************************************************************/
1184
1185.macro pixman_composite_src_x888_8888_process_pixblock_head
1186    vorr     q0, q0, q2
1187    vorr     q1, q1, q2
1188.endm
1189
1190.macro pixman_composite_src_x888_8888_process_pixblock_tail
1191.endm
1192
1193.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1194    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1195    fetch_src_pixblock
1196    vorr     q0, q0, q2
1197    vorr     q1, q1, q2
1198    cache_preload 8, 8
1199.endm
1200
1201.macro pixman_composite_src_x888_8888_init
1202    vmov.u8  q2, #0xFF
1203    vshl.u32 q2, q2, #24
1204.endm
1205
1206generate_composite_function \
1207    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1208    FLAG_DST_WRITEONLY, \
1209    8, /* number of pixels, processed in a single block */ \
1210    10, /* prefetch distance */ \
1211    pixman_composite_src_x888_8888_init, \
1212    default_cleanup, \
1213    pixman_composite_src_x888_8888_process_pixblock_head, \
1214    pixman_composite_src_x888_8888_process_pixblock_tail, \
1215    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1216    0, /* dst_w_basereg */ \
1217    0, /* dst_r_basereg */ \
1218    0, /* src_basereg   */ \
1219    0  /* mask_basereg  */
1220
1221/******************************************************************************/
1222
1223.macro pixman_composite_src_n_8_8888_process_pixblock_head
1224    /* expecting solid source in {d0, d1, d2, d3} */
1225    /* mask is in d24 (d25, d26, d27 are unused) */
1226
1227    /* in */
1228    vmull.u8    q8, d24, d0
1229    vmull.u8    q9, d24, d1
1230    vmull.u8    q10, d24, d2
1231    vmull.u8    q11, d24, d3
1232    vrsra.u16   q8, q8, #8
1233    vrsra.u16   q9, q9, #8
1234    vrsra.u16   q10, q10, #8
1235    vrsra.u16   q11, q11, #8
1236.endm
1237
1238.macro pixman_composite_src_n_8_8888_process_pixblock_tail
1239    vrshrn.u16  d28, q8, #8
1240    vrshrn.u16  d29, q9, #8
1241    vrshrn.u16  d30, q10, #8
1242    vrshrn.u16  d31, q11, #8
1243.endm
1244
1245.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1246    fetch_mask_pixblock
1247                                    PF add PF_X, PF_X, #8
1248        vrshrn.u16  d28, q8, #8
1249                                    PF tst PF_CTL, #0x0F
1250        vrshrn.u16  d29, q9, #8
1251                                    PF addne PF_X, PF_X, #8
1252        vrshrn.u16  d30, q10, #8
1253                                    PF subne PF_CTL, PF_CTL, #1
1254        vrshrn.u16  d31, q11, #8
1255                                    PF cmp PF_X, ORIG_W
1256    vmull.u8    q8, d24, d0
1257                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1258    vmull.u8    q9, d24, d1
1259                                    PF subge PF_X, PF_X, ORIG_W
1260    vmull.u8    q10, d24, d2
1261                                    PF subges PF_CTL, PF_CTL, #0x10
1262    vmull.u8    q11, d24, d3
1263                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1264        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1265    vrsra.u16   q8, q8, #8
1266    vrsra.u16   q9, q9, #8
1267    vrsra.u16   q10, q10, #8
1268    vrsra.u16   q11, q11, #8
1269.endm
1270
1271.macro pixman_composite_src_n_8_8888_init
1272    add         DUMMY, sp, #ARGS_STACK_OFFSET
1273    vld1.32     {d3[0]}, [DUMMY]
1274    vdup.8      d0, d3[0]
1275    vdup.8      d1, d3[1]
1276    vdup.8      d2, d3[2]
1277    vdup.8      d3, d3[3]
1278.endm
1279
1280.macro pixman_composite_src_n_8_8888_cleanup
1281.endm
1282
1283generate_composite_function \
1284    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
1285    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1286    8, /* number of pixels, processed in a single block */ \
1287    5, /* prefetch distance */ \
1288    pixman_composite_src_n_8_8888_init, \
1289    pixman_composite_src_n_8_8888_cleanup, \
1290    pixman_composite_src_n_8_8888_process_pixblock_head, \
1291    pixman_composite_src_n_8_8888_process_pixblock_tail, \
1292    pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
1293
1294/******************************************************************************/
1295
1296.macro pixman_composite_src_n_8_8_process_pixblock_head
1297    vmull.u8    q0, d24, d16
1298    vmull.u8    q1, d25, d16
1299    vmull.u8    q2, d26, d16
1300    vmull.u8    q3, d27, d16
1301    vrsra.u16   q0, q0,  #8
1302    vrsra.u16   q1, q1,  #8
1303    vrsra.u16   q2, q2,  #8
1304    vrsra.u16   q3, q3,  #8
1305.endm
1306
1307.macro pixman_composite_src_n_8_8_process_pixblock_tail
1308    vrshrn.u16  d28, q0, #8
1309    vrshrn.u16  d29, q1, #8
1310    vrshrn.u16  d30, q2, #8
1311    vrshrn.u16  d31, q3, #8
1312.endm
1313
1314.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1315    fetch_mask_pixblock
1316                                    PF add PF_X, PF_X, #8
1317        vrshrn.u16  d28, q0, #8
1318                                    PF tst PF_CTL, #0x0F
1319        vrshrn.u16  d29, q1, #8
1320                                    PF addne PF_X, PF_X, #8
1321        vrshrn.u16  d30, q2, #8
1322                                    PF subne PF_CTL, PF_CTL, #1
1323        vrshrn.u16  d31, q3, #8
1324                                    PF cmp PF_X, ORIG_W
1325    vmull.u8    q0,  d24, d16
1326                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1327    vmull.u8    q1,  d25, d16
1328                                    PF subge PF_X, PF_X, ORIG_W
1329    vmull.u8    q2,  d26, d16
1330                                    PF subges PF_CTL, PF_CTL, #0x10
1331    vmull.u8    q3,  d27, d16
1332                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1333        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1334    vrsra.u16   q0, q0,  #8
1335    vrsra.u16   q1, q1,  #8
1336    vrsra.u16   q2, q2,  #8
1337    vrsra.u16   q3, q3,  #8
1338.endm
1339
1340.macro pixman_composite_src_n_8_8_init
1341    add         DUMMY, sp, #ARGS_STACK_OFFSET
1342    vld1.32     {d16[0]}, [DUMMY]
1343    vdup.8      d16, d16[3]
1344.endm
1345
1346.macro pixman_composite_src_n_8_8_cleanup
1347.endm
1348
1349generate_composite_function \
1350    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
1351    FLAG_DST_WRITEONLY, \
1352    32, /* number of pixels, processed in a single block */ \
1353    5, /* prefetch distance */ \
1354    pixman_composite_src_n_8_8_init, \
1355    pixman_composite_src_n_8_8_cleanup, \
1356    pixman_composite_src_n_8_8_process_pixblock_head, \
1357    pixman_composite_src_n_8_8_process_pixblock_tail, \
1358    pixman_composite_src_n_8_8_process_pixblock_tail_head
1359
1360/******************************************************************************/
1361
1362.macro pixman_composite_over_n_8_8888_process_pixblock_head
1363    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1364    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1365    /* and destination data in {d4, d5, d6, d7} */
1366    /* mask is in d24 (d25, d26, d27 are unused) */
1367
1368    /* in */
1369    vmull.u8    q6, d24, d8
1370    vmull.u8    q7, d24, d9
1371    vmull.u8    q8, d24, d10
1372    vmull.u8    q9, d24, d11
1373    vrshr.u16   q10, q6, #8
1374    vrshr.u16   q11, q7, #8
1375    vrshr.u16   q12, q8, #8
1376    vrshr.u16   q13, q9, #8
1377    vraddhn.u16 d0, q6, q10
1378    vraddhn.u16 d1, q7, q11
1379    vraddhn.u16 d2, q8, q12
1380    vraddhn.u16 d3, q9, q13
1381    vmvn.8      d25, d3  /* get inverted alpha */
1382    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
1383    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1384    /* now do alpha blending */
1385    vmull.u8    q8, d25, d4
1386    vmull.u8    q9, d25, d5
1387    vmull.u8    q10, d25, d6
1388    vmull.u8    q11, d25, d7
1389.endm
1390
1391.macro pixman_composite_over_n_8_8888_process_pixblock_tail
1392    vrshr.u16   q14, q8, #8
1393    vrshr.u16   q15, q9, #8
1394    vrshr.u16   q6, q10, #8
1395    vrshr.u16   q7, q11, #8
1396    vraddhn.u16 d28, q14, q8
1397    vraddhn.u16 d29, q15, q9
1398    vraddhn.u16 d30, q6, q10
1399    vraddhn.u16 d31, q7, q11
1400    vqadd.u8    q14, q0, q14
1401    vqadd.u8    q15, q1, q15
1402.endm
1403
1404.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1405        vrshr.u16   q14, q8, #8
1406    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
1407        vrshr.u16   q15, q9, #8
1408    fetch_mask_pixblock
1409        vrshr.u16   q6, q10, #8
1410                                    PF add PF_X, PF_X, #8
1411        vrshr.u16   q7, q11, #8
1412                                    PF tst PF_CTL, #0x0F
1413        vraddhn.u16 d28, q14, q8
1414                                    PF addne PF_X, PF_X, #8
1415        vraddhn.u16 d29, q15, q9
1416                                    PF subne PF_CTL, PF_CTL, #1
1417        vraddhn.u16 d30, q6, q10
1418                                    PF cmp PF_X, ORIG_W
1419        vraddhn.u16 d31, q7, q11
1420                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1421    vmull.u8    q6, d24, d8
1422                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1423    vmull.u8    q7, d24, d9
1424                                    PF subge PF_X, PF_X, ORIG_W
1425    vmull.u8    q8, d24, d10
1426                                    PF subges PF_CTL, PF_CTL, #0x10
1427    vmull.u8    q9, d24, d11
1428                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1429        vqadd.u8    q14, q0, q14
1430                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1431        vqadd.u8    q15, q1, q15
1432    vrshr.u16   q10, q6, #8
1433    vrshr.u16   q11, q7, #8
1434    vrshr.u16   q12, q8, #8
1435    vrshr.u16   q13, q9, #8
1436    vraddhn.u16 d0, q6, q10
1437    vraddhn.u16 d1, q7, q11
1438    vraddhn.u16 d2, q8, q12
1439    vraddhn.u16 d3, q9, q13
1440        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1441    vmvn.8      d25, d3
1442    vmull.u8    q8, d25, d4
1443    vmull.u8    q9, d25, d5
1444    vmull.u8    q10, d25, d6
1445    vmull.u8    q11, d25, d7
1446.endm
1447
1448.macro pixman_composite_over_n_8_8888_init
1449    add         DUMMY, sp, #ARGS_STACK_OFFSET
1450    vpush       {d8-d15}
1451    vld1.32     {d11[0]}, [DUMMY]
1452    vdup.8      d8, d11[0]
1453    vdup.8      d9, d11[1]
1454    vdup.8      d10, d11[2]
1455    vdup.8      d11, d11[3]
1456.endm
1457
1458.macro pixman_composite_over_n_8_8888_cleanup
1459    vpop        {d8-d15}
1460.endm
1461
1462generate_composite_function \
1463    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1464    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1465    8, /* number of pixels, processed in a single block */ \
1466    5, /* prefetch distance */ \
1467    pixman_composite_over_n_8_8888_init, \
1468    pixman_composite_over_n_8_8888_cleanup, \
1469    pixman_composite_over_n_8_8888_process_pixblock_head, \
1470    pixman_composite_over_n_8_8888_process_pixblock_tail, \
1471    pixman_composite_over_n_8_8888_process_pixblock_tail_head
1472
1473/******************************************************************************/
1474
1475.macro pixman_composite_over_n_8_8_process_pixblock_head
1476    vmull.u8    q0,  d24, d8
1477    vmull.u8    q1,  d25, d8
1478    vmull.u8    q6,  d26, d8
1479    vmull.u8    q7,  d27, d8
1480    vrshr.u16   q10, q0,  #8
1481    vrshr.u16   q11, q1,  #8
1482    vrshr.u16   q12, q6,  #8
1483    vrshr.u16   q13, q7,  #8
1484    vraddhn.u16 d0,  q0,  q10
1485    vraddhn.u16 d1,  q1,  q11
1486    vraddhn.u16 d2,  q6,  q12
1487    vraddhn.u16 d3,  q7,  q13
1488    vmvn.8      q12, q0
1489    vmvn.8      q13, q1
1490    vmull.u8    q8,  d24, d4
1491    vmull.u8    q9,  d25, d5
1492    vmull.u8    q10, d26, d6
1493    vmull.u8    q11, d27, d7
1494.endm
1495
1496.macro pixman_composite_over_n_8_8_process_pixblock_tail
1497    vrshr.u16   q14, q8,  #8
1498    vrshr.u16   q15, q9,  #8
1499    vrshr.u16   q12, q10, #8
1500    vrshr.u16   q13, q11, #8
1501    vraddhn.u16 d28, q14, q8
1502    vraddhn.u16 d29, q15, q9
1503    vraddhn.u16 d30, q12, q10
1504    vraddhn.u16 d31, q13, q11
1505    vqadd.u8    q14, q0,  q14
1506    vqadd.u8    q15, q1,  q15
1507.endm
1508
1509/* TODO: expand macros and do better instructions scheduling */
1510.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1511    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
1512    pixman_composite_over_n_8_8_process_pixblock_tail
1513    fetch_mask_pixblock
1514    cache_preload 32, 32
1515    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1516    pixman_composite_over_n_8_8_process_pixblock_head
1517.endm
1518
1519.macro pixman_composite_over_n_8_8_init
1520    add         DUMMY, sp, #ARGS_STACK_OFFSET
1521    vpush       {d8-d15}
1522    vld1.32     {d8[0]}, [DUMMY]
1523    vdup.8      d8, d8[3]
1524.endm
1525
1526.macro pixman_composite_over_n_8_8_cleanup
1527    vpop        {d8-d15}
1528.endm
1529
1530generate_composite_function \
1531    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1532    FLAG_DST_READWRITE, \
1533    32, /* number of pixels, processed in a single block */ \
1534    5, /* prefetch distance */ \
1535    pixman_composite_over_n_8_8_init, \
1536    pixman_composite_over_n_8_8_cleanup, \
1537    pixman_composite_over_n_8_8_process_pixblock_head, \
1538    pixman_composite_over_n_8_8_process_pixblock_tail, \
1539    pixman_composite_over_n_8_8_process_pixblock_tail_head
1540
1541/******************************************************************************/
1542
1543.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1544    /*
1545     * 'combine_mask_ca' replacement
1546     *
1547     * input:  solid src (n) in {d8,  d9,  d10, d11}
1548     *         dest in          {d4,  d5,  d6,  d7 }
1549     *         mask in          {d24, d25, d26, d27}
1550     * output: updated src in   {d0,  d1,  d2,  d3 }
1551     *         updated mask in  {d24, d25, d26, d3 }
1552     */
1553    vmull.u8    q0,  d24, d8
1554    vmull.u8    q1,  d25, d9
1555    vmull.u8    q6,  d26, d10
1556    vmull.u8    q7,  d27, d11
1557    vmull.u8    q9,  d11, d25
1558    vmull.u8    q12, d11, d24
1559    vmull.u8    q13, d11, d26
1560    vrshr.u16   q8,  q0,  #8
1561    vrshr.u16   q10, q1,  #8
1562    vrshr.u16   q11, q6,  #8
1563    vraddhn.u16 d0,  q0,  q8
1564    vraddhn.u16 d1,  q1,  q10
1565    vraddhn.u16 d2,  q6,  q11
1566    vrshr.u16   q11, q12, #8
1567    vrshr.u16   q8,  q9,  #8
1568    vrshr.u16   q6,  q13, #8
1569    vrshr.u16   q10, q7,  #8
1570    vraddhn.u16 d24, q12, q11
1571    vraddhn.u16 d25, q9,  q8
1572    vraddhn.u16 d26, q13, q6
1573    vraddhn.u16 d3,  q7,  q10
1574    /*
1575     * 'combine_over_ca' replacement
1576     *
1577     * output: updated dest in {d28, d29, d30, d31}
1578     */
1579    vmvn.8      q12, q12
1580    vmvn.8      d26, d26
1581    vmull.u8    q8,  d24, d4
1582    vmull.u8    q9,  d25, d5
1583    vmvn.8      d27, d3
1584    vmull.u8    q10, d26, d6
1585    vmull.u8    q11, d27, d7
1586.endm
1587
1588.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1589    /* ... continue 'combine_over_ca' replacement */
1590    vrshr.u16   q14, q8,  #8
1591    vrshr.u16   q15, q9,  #8
1592    vrshr.u16   q6,  q10, #8
1593    vrshr.u16   q7,  q11, #8
1594    vraddhn.u16 d28, q14, q8
1595    vraddhn.u16 d29, q15, q9
1596    vraddhn.u16 d30, q6,  q10
1597    vraddhn.u16 d31, q7,  q11
1598    vqadd.u8    q14, q0,  q14
1599    vqadd.u8    q15, q1,  q15
1600.endm
1601
1602.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1603        vrshr.u16   q14, q8, #8
1604        vrshr.u16   q15, q9, #8
1605    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
1606        vrshr.u16   q6, q10, #8
1607        vrshr.u16   q7, q11, #8
1608        vraddhn.u16 d28, q14, q8
1609        vraddhn.u16 d29, q15, q9
1610        vraddhn.u16 d30, q6, q10
1611        vraddhn.u16 d31, q7, q11
1612    fetch_mask_pixblock
1613        vqadd.u8    q14, q0, q14
1614        vqadd.u8    q15, q1, q15
1615    cache_preload 8, 8
1616    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1617    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1618.endm
1619
1620.macro pixman_composite_over_n_8888_8888_ca_init
1621    add         DUMMY, sp, #ARGS_STACK_OFFSET
1622    vpush       {d8-d15}
1623    vld1.32     {d11[0]}, [DUMMY]
1624    vdup.8      d8, d11[0]
1625    vdup.8      d9, d11[1]
1626    vdup.8      d10, d11[2]
1627    vdup.8      d11, d11[3]
1628.endm
1629
1630.macro pixman_composite_over_n_8888_8888_ca_cleanup
1631    vpop        {d8-d15}
1632.endm
1633
1634generate_composite_function \
1635    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1636    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1637    8, /* number of pixels, processed in a single block */ \
1638    5, /* prefetch distance */ \
1639    pixman_composite_over_n_8888_8888_ca_init, \
1640    pixman_composite_over_n_8888_8888_ca_cleanup, \
1641    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1642    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1643    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1644
1645/******************************************************************************/
1646
1647.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1648    /*
1649     * 'combine_mask_ca' replacement
1650     *
1651     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
1652     *         mask in          {d24, d25, d26}       [B, G, R]
1653     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
1654     *         updated mask in  {d24, d25, d26}       [B, G, R]
1655     */
1656    vmull.u8    q0,  d24, d8
1657    vmull.u8    q1,  d25, d9
1658    vmull.u8    q6,  d26, d10
1659    vmull.u8    q9,  d11, d25
1660    vmull.u8    q12, d11, d24
1661    vmull.u8    q13, d11, d26
1662    vrshr.u16   q8,  q0,  #8
1663    vrshr.u16   q10, q1,  #8
1664    vrshr.u16   q11, q6,  #8
1665    vraddhn.u16 d0,  q0,  q8
1666    vraddhn.u16 d1,  q1,  q10
1667    vraddhn.u16 d2,  q6,  q11
1668    vrshr.u16   q11, q12, #8
1669    vrshr.u16   q8,  q9,  #8
1670    vrshr.u16   q6,  q13, #8
1671    vraddhn.u16 d24, q12, q11
1672    vraddhn.u16 d25, q9,  q8
1673    /*
1674     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1675     * and put data into d16 - blue, d17 - green, d18 - red
1676     */
1677       vshrn.u16   d17, q2,  #3
1678       vshrn.u16   d18, q2,  #8
1679    vraddhn.u16 d26, q13, q6
1680       vsli.u16    q2,  q2,  #5
1681       vsri.u8     d18, d18, #5
1682       vsri.u8     d17, d17, #6
1683    /*
1684     * 'combine_over_ca' replacement
1685     *
1686     * output: updated dest in d16 - blue, d17 - green, d18 - red
1687     */
1688    vmvn.8      q12, q12
1689       vshrn.u16   d16, q2,  #2
1690    vmvn.8      d26, d26
1691    vmull.u8    q6,  d16, d24
1692    vmull.u8    q7,  d17, d25
1693    vmull.u8    q11, d18, d26
1694.endm
1695
1696.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1697    /* ... continue 'combine_over_ca' replacement */
1698    vrshr.u16   q10, q6,  #8
1699    vrshr.u16   q14, q7,  #8
1700    vrshr.u16   q15, q11, #8
1701    vraddhn.u16 d16, q10, q6
1702    vraddhn.u16 d17, q14, q7
1703    vraddhn.u16 d18, q15, q11
1704    vqadd.u8    q8,  q0,  q8
1705    vqadd.u8    d18, d2,  d18
1706    /*
1707     * convert the results in d16, d17, d18 to r5g6b5 and store
1708     * them into {d28, d29}
1709     */
1710    vshll.u8    q14, d18, #8
1711    vshll.u8    q10, d17, #8
1712    vshll.u8    q15, d16, #8
1713    vsri.u16    q14, q10, #5
1714    vsri.u16    q14, q15, #11
1715.endm
1716
1717.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1718    fetch_mask_pixblock
1719        vrshr.u16   q10, q6, #8
1720        vrshr.u16   q14, q7, #8
1721    vld1.16     {d4, d5}, [DST_R, :128]!
1722        vrshr.u16   q15, q11, #8
1723        vraddhn.u16 d16, q10, q6
1724        vraddhn.u16 d17, q14, q7
1725        vraddhn.u16 d22, q15, q11
1726            /* process_pixblock_head */
1727            /*
1728             * 'combine_mask_ca' replacement
1729             *
1730             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
1731             *         mask in          {d24, d25, d26}       [B, G, R]
1732             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
1733             *         updated mask in  {d24, d25, d26}       [B, G, R]
1734             */
1735            vmull.u8    q6,  d26, d10
1736        vqadd.u8    q8,  q0, q8
1737            vmull.u8    q0,  d24, d8
1738        vqadd.u8    d22, d2, d22
1739            vmull.u8    q1,  d25, d9
1740        /*
1741         * convert the result in d16, d17, d22 to r5g6b5 and store
1742         * it into {d28, d29}
1743         */
1744        vshll.u8    q14, d22, #8
1745        vshll.u8    q10, d17, #8
1746        vshll.u8    q15, d16, #8
1747            vmull.u8    q9,  d11, d25
1748        vsri.u16    q14, q10, #5
1749            vmull.u8    q12, d11, d24
1750            vmull.u8    q13, d11, d26
1751        vsri.u16    q14, q15, #11
1752    cache_preload 8, 8
1753            vrshr.u16   q8,  q0,  #8
1754            vrshr.u16   q10, q1,  #8
1755            vrshr.u16   q11, q6,  #8
1756            vraddhn.u16 d0,  q0,  q8
1757            vraddhn.u16 d1,  q1,  q10
1758            vraddhn.u16 d2,  q6,  q11
1759            vrshr.u16   q11, q12, #8
1760            vrshr.u16   q8,  q9,  #8
1761            vrshr.u16   q6,  q13, #8
1762            vraddhn.u16 d24, q12, q11
1763            vraddhn.u16 d25, q9,  q8
1764                /*
1765                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
1766	         * 8-bit format and put data into d16 - blue, d17 - green,
1767	         * d18 - red
1768                 */
1769                vshrn.u16   d17, q2,  #3
1770                vshrn.u16   d18, q2,  #8
1771            vraddhn.u16 d26, q13, q6
1772                vsli.u16    q2,  q2,  #5
1773                vsri.u8     d17, d17, #6
1774                vsri.u8     d18, d18, #5
1775            /*
1776             * 'combine_over_ca' replacement
1777             *
1778             * output: updated dest in d16 - blue, d17 - green, d18 - red
1779             */
1780            vmvn.8      q12, q12
1781                vshrn.u16   d16, q2,  #2
1782            vmvn.8      d26, d26
1783            vmull.u8    q7,  d17, d25
1784            vmull.u8    q6,  d16, d24
1785            vmull.u8    q11, d18, d26
1786    vst1.16     {d28, d29}, [DST_W, :128]!
1787.endm
1788
1789.macro pixman_composite_over_n_8888_0565_ca_init
1790    add         DUMMY, sp, #ARGS_STACK_OFFSET
1791    vpush       {d8-d15}
1792    vld1.32     {d11[0]}, [DUMMY]
1793    vdup.8      d8, d11[0]
1794    vdup.8      d9, d11[1]
1795    vdup.8      d10, d11[2]
1796    vdup.8      d11, d11[3]
1797.endm
1798
1799.macro pixman_composite_over_n_8888_0565_ca_cleanup
1800    vpop        {d8-d15}
1801.endm
1802
1803generate_composite_function \
1804    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1805    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1806    8, /* number of pixels, processed in a single block */ \
1807    5, /* prefetch distance */ \
1808    pixman_composite_over_n_8888_0565_ca_init, \
1809    pixman_composite_over_n_8888_0565_ca_cleanup, \
1810    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
1811    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
1812    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1813
1814/******************************************************************************/
1815
1816.macro pixman_composite_in_n_8_process_pixblock_head
1817    /* expecting source data in {d0, d1, d2, d3} */
1818    /* and destination data in {d4, d5, d6, d7} */
1819    vmull.u8    q8,  d4,  d3
1820    vmull.u8    q9,  d5,  d3
1821    vmull.u8    q10, d6,  d3
1822    vmull.u8    q11, d7,  d3
1823.endm
1824
1825.macro pixman_composite_in_n_8_process_pixblock_tail
1826    vrshr.u16   q14, q8,  #8
1827    vrshr.u16   q15, q9,  #8
1828    vrshr.u16   q12, q10, #8
1829    vrshr.u16   q13, q11, #8
1830    vraddhn.u16 d28, q8,  q14
1831    vraddhn.u16 d29, q9,  q15
1832    vraddhn.u16 d30, q10, q12
1833    vraddhn.u16 d31, q11, q13
1834.endm
1835
1836.macro pixman_composite_in_n_8_process_pixblock_tail_head
1837    pixman_composite_in_n_8_process_pixblock_tail
1838    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
1839    cache_preload 32, 32
1840    pixman_composite_in_n_8_process_pixblock_head
1841    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1842.endm
1843
1844.macro pixman_composite_in_n_8_init
1845    add         DUMMY, sp, #ARGS_STACK_OFFSET
1846    vld1.32     {d3[0]}, [DUMMY]
1847    vdup.8      d3, d3[3]
1848.endm
1849
1850.macro pixman_composite_in_n_8_cleanup
1851.endm
1852
1853generate_composite_function \
1854    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1855    FLAG_DST_READWRITE, \
1856    32, /* number of pixels, processed in a single block */ \
1857    5, /* prefetch distance */ \
1858    pixman_composite_in_n_8_init, \
1859    pixman_composite_in_n_8_cleanup, \
1860    pixman_composite_in_n_8_process_pixblock_head, \
1861    pixman_composite_in_n_8_process_pixblock_tail, \
1862    pixman_composite_in_n_8_process_pixblock_tail_head, \
1863    28, /* dst_w_basereg */ \
1864    4,  /* dst_r_basereg */ \
1865    0,  /* src_basereg   */ \
1866    24  /* mask_basereg  */
1867
1868.macro pixman_composite_add_n_8_8_process_pixblock_head
1869    /* expecting source data in {d8, d9, d10, d11} */
1870    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1871    /* and destination data in {d4, d5, d6, d7} */
1872    /* mask is in d24, d25, d26, d27 */
1873    vmull.u8    q0, d24, d11
1874    vmull.u8    q1, d25, d11
1875    vmull.u8    q6, d26, d11
1876    vmull.u8    q7, d27, d11
1877    vrshr.u16   q10, q0, #8
1878    vrshr.u16   q11, q1, #8
1879    vrshr.u16   q12, q6, #8
1880    vrshr.u16   q13, q7, #8
1881    vraddhn.u16 d0, q0, q10
1882    vraddhn.u16 d1, q1, q11
1883    vraddhn.u16 d2, q6, q12
1884    vraddhn.u16 d3, q7, q13
1885    vqadd.u8    q14, q0, q2
1886    vqadd.u8    q15, q1, q3
1887.endm
1888
1889.macro pixman_composite_add_n_8_8_process_pixblock_tail
1890.endm
1891
1892/* TODO: expand macros and do better instructions scheduling */
1893.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1894    pixman_composite_add_n_8_8_process_pixblock_tail
1895    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1896    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
1897    fetch_mask_pixblock
1898    cache_preload 32, 32
1899    pixman_composite_add_n_8_8_process_pixblock_head
1900.endm
1901
1902.macro pixman_composite_add_n_8_8_init
1903    add         DUMMY, sp, #ARGS_STACK_OFFSET
1904    vpush       {d8-d15}
1905    vld1.32     {d11[0]}, [DUMMY]
1906    vdup.8      d11, d11[3]
1907.endm
1908
1909.macro pixman_composite_add_n_8_8_cleanup
1910    vpop        {d8-d15}
1911.endm
1912
1913generate_composite_function \
1914    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1915    FLAG_DST_READWRITE, \
1916    32, /* number of pixels, processed in a single block */ \
1917    5, /* prefetch distance */ \
1918    pixman_composite_add_n_8_8_init, \
1919    pixman_composite_add_n_8_8_cleanup, \
1920    pixman_composite_add_n_8_8_process_pixblock_head, \
1921    pixman_composite_add_n_8_8_process_pixblock_tail, \
1922    pixman_composite_add_n_8_8_process_pixblock_tail_head
1923
1924/******************************************************************************/
1925
1926.macro pixman_composite_add_8_8_8_process_pixblock_head
1927    /* expecting source data in {d0, d1, d2, d3} */
1928    /* destination data in {d4, d5, d6, d7} */
1929    /* mask in {d24, d25, d26, d27} */
1930    vmull.u8    q8, d24, d0
1931    vmull.u8    q9, d25, d1
1932    vmull.u8    q10, d26, d2
1933    vmull.u8    q11, d27, d3
1934    vrshr.u16   q0, q8, #8
1935    vrshr.u16   q1, q9, #8
1936    vrshr.u16   q12, q10, #8
1937    vrshr.u16   q13, q11, #8
1938    vraddhn.u16 d0, q0, q8
1939    vraddhn.u16 d1, q1, q9
1940    vraddhn.u16 d2, q12, q10
1941    vraddhn.u16 d3, q13, q11
1942    vqadd.u8    q14, q0, q2
1943    vqadd.u8    q15, q1, q3
1944.endm
1945
1946.macro pixman_composite_add_8_8_8_process_pixblock_tail
1947.endm
1948
1949/* TODO: expand macros and do better instructions scheduling */
1950.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1951    pixman_composite_add_8_8_8_process_pixblock_tail
1952    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1953    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
1954    fetch_mask_pixblock
1955    fetch_src_pixblock
1956    cache_preload 32, 32
1957    pixman_composite_add_8_8_8_process_pixblock_head
1958.endm
1959
1960.macro pixman_composite_add_8_8_8_init
1961.endm
1962
1963.macro pixman_composite_add_8_8_8_cleanup
1964.endm
1965
1966generate_composite_function \
1967    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1968    FLAG_DST_READWRITE, \
1969    32, /* number of pixels, processed in a single block */ \
1970    5, /* prefetch distance */ \
1971    pixman_composite_add_8_8_8_init, \
1972    pixman_composite_add_8_8_8_cleanup, \
1973    pixman_composite_add_8_8_8_process_pixblock_head, \
1974    pixman_composite_add_8_8_8_process_pixblock_tail, \
1975    pixman_composite_add_8_8_8_process_pixblock_tail_head
1976
1977/******************************************************************************/
1978
1979.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1980    /* expecting source data in {d0, d1, d2, d3} */
1981    /* destination data in {d4, d5, d6, d7} */
1982    /* mask in {d24, d25, d26, d27} */
1983    vmull.u8    q8,  d27, d0
1984    vmull.u8    q9,  d27, d1
1985    vmull.u8    q10, d27, d2
1986    vmull.u8    q11, d27, d3
1987    /* 1 cycle bubble */
1988    vrsra.u16   q8,  q8,  #8
1989    vrsra.u16   q9,  q9,  #8
1990    vrsra.u16   q10, q10, #8
1991    vrsra.u16   q11, q11, #8
1992.endm
1993
1994.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1995    /* 2 cycle bubble */
1996    vrshrn.u16  d28, q8,  #8
1997    vrshrn.u16  d29, q9,  #8
1998    vrshrn.u16  d30, q10, #8
1999    vrshrn.u16  d31, q11, #8
2000    vqadd.u8    q14, q2,  q14
2001    /* 1 cycle bubble */
2002    vqadd.u8    q15, q3,  q15
2003.endm
2004
2005.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2006    fetch_src_pixblock
2007        vrshrn.u16  d28, q8,  #8
2008    fetch_mask_pixblock
2009        vrshrn.u16  d29, q9,  #8
2010    vmull.u8    q8,  d27, d0
2011        vrshrn.u16  d30, q10, #8
2012    vmull.u8    q9,  d27, d1
2013        vrshrn.u16  d31, q11, #8
2014    vmull.u8    q10, d27, d2
2015        vqadd.u8    q14, q2,  q14
2016    vmull.u8    q11, d27, d3
2017        vqadd.u8    q15, q3,  q15
2018    vrsra.u16   q8,  q8,  #8
2019    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
2020    vrsra.u16   q9,  q9,  #8
2021        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
2022    vrsra.u16   q10, q10, #8
2023
2024    cache_preload 8, 8
2025
2026    vrsra.u16   q11, q11, #8
2027.endm
2028
2029generate_composite_function \
2030    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
2031    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2032    8, /* number of pixels, processed in a single block */ \
2033    10, /* prefetch distance */ \
2034    default_init, \
2035    default_cleanup, \
2036    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2037    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2038    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2039
2040generate_composite_function_single_scanline \
2041    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
2042    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2043    8, /* number of pixels, processed in a single block */ \
2044    default_init, \
2045    default_cleanup, \
2046    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2047    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2048    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2049
2050/******************************************************************************/
2051
2052generate_composite_function \
2053    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
2054    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2055    8, /* number of pixels, processed in a single block */ \
2056    5, /* prefetch distance */ \
2057    default_init, \
2058    default_cleanup, \
2059    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2060    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2061    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2062    28, /* dst_w_basereg */ \
2063    4,  /* dst_r_basereg */ \
2064    0,  /* src_basereg   */ \
2065    27  /* mask_basereg  */
2066
2067/******************************************************************************/
2068
2069.macro pixman_composite_add_n_8_8888_init
2070    add         DUMMY, sp, #ARGS_STACK_OFFSET
2071    vld1.32     {d3[0]}, [DUMMY]
2072    vdup.8      d0, d3[0]
2073    vdup.8      d1, d3[1]
2074    vdup.8      d2, d3[2]
2075    vdup.8      d3, d3[3]
2076.endm
2077
2078.macro pixman_composite_add_n_8_8888_cleanup
2079.endm
2080
2081generate_composite_function \
2082    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
2083    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2084    8, /* number of pixels, processed in a single block */ \
2085    5, /* prefetch distance */ \
2086    pixman_composite_add_n_8_8888_init, \
2087    pixman_composite_add_n_8_8888_cleanup, \
2088    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2089    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2090    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2091    28, /* dst_w_basereg */ \
2092    4,  /* dst_r_basereg */ \
2093    0,  /* src_basereg   */ \
2094    27  /* mask_basereg  */
2095
2096/******************************************************************************/
2097
2098.macro pixman_composite_add_8888_n_8888_init
2099    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2100    vld1.32     {d27[0]}, [DUMMY]
2101    vdup.8      d27, d27[3]
2102.endm
2103
2104.macro pixman_composite_add_8888_n_8888_cleanup
2105.endm
2106
2107generate_composite_function \
2108    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
2109    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2110    8, /* number of pixels, processed in a single block */ \
2111    5, /* prefetch distance */ \
2112    pixman_composite_add_8888_n_8888_init, \
2113    pixman_composite_add_8888_n_8888_cleanup, \
2114    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2115    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2116    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2117    28, /* dst_w_basereg */ \
2118    4,  /* dst_r_basereg */ \
2119    0,  /* src_basereg   */ \
2120    27  /* mask_basereg  */
2121
2122/******************************************************************************/
2123
2124.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2125    /* expecting source data in {d0, d1, d2, d3} */
2126    /* destination data in {d4, d5, d6, d7} */
2127    /* solid mask is in d15 */
2128
2129    /* 'in' */
2130    vmull.u8    q8, d15, d3
2131    vmull.u8    q6, d15, d2
2132    vmull.u8    q5, d15, d1
2133    vmull.u8    q4, d15, d0
2134    vrshr.u16   q13, q8, #8
2135    vrshr.u16   q12, q6, #8
2136    vrshr.u16   q11, q5, #8
2137    vrshr.u16   q10, q4, #8
2138    vraddhn.u16 d3, q8, q13
2139    vraddhn.u16 d2, q6, q12
2140    vraddhn.u16 d1, q5, q11
2141    vraddhn.u16 d0, q4, q10
2142    vmvn.8      d24, d3  /* get inverted alpha */
2143    /* now do alpha blending */
2144    vmull.u8    q8, d24, d4
2145    vmull.u8    q9, d24, d5
2146    vmull.u8    q10, d24, d6
2147    vmull.u8    q11, d24, d7
2148.endm
2149
2150.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2151    vrshr.u16   q14, q8, #8
2152    vrshr.u16   q15, q9, #8
2153    vrshr.u16   q12, q10, #8
2154    vrshr.u16   q13, q11, #8
2155    vraddhn.u16 d28, q14, q8
2156    vraddhn.u16 d29, q15, q9
2157    vraddhn.u16 d30, q12, q10
2158    vraddhn.u16 d31, q13, q11
2159.endm
2160
2161/* TODO: expand macros and do better instructions scheduling */
2162.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
2163    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
2164    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2165    fetch_src_pixblock
2166    cache_preload 8, 8
2167    fetch_mask_pixblock
2168    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2169    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
2170.endm
2171
2172generate_composite_function_single_scanline \
2173    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
2174    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2175    8, /* number of pixels, processed in a single block */ \
2176    default_init_need_all_regs, \
2177    default_cleanup_need_all_regs, \
2178    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
2179    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
2180    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
2181    28, /* dst_w_basereg */ \
2182    4,  /* dst_r_basereg */ \
2183    0,  /* src_basereg   */ \
2184    12  /* mask_basereg  */
2185
2186/******************************************************************************/
2187
2188.macro pixman_composite_over_8888_n_8888_process_pixblock_head
2189    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2190.endm
2191
2192.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
2193    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2194    vqadd.u8    q14, q0, q14
2195    vqadd.u8    q15, q1, q15
2196.endm
2197
2198/* TODO: expand macros and do better instructions scheduling */
2199.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2200    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
2201    pixman_composite_over_8888_n_8888_process_pixblock_tail
2202    fetch_src_pixblock
2203    cache_preload 8, 8
2204    pixman_composite_over_8888_n_8888_process_pixblock_head
2205    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
2206.endm
2207
2208.macro pixman_composite_over_8888_n_8888_init
2209    add         DUMMY, sp, #48
2210    vpush       {d8-d15}
2211    vld1.32     {d15[0]}, [DUMMY]
2212    vdup.8      d15, d15[3]
2213.endm
2214
2215.macro pixman_composite_over_8888_n_8888_cleanup
2216    vpop        {d8-d15}
2217.endm
2218
2219generate_composite_function \
2220    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2221    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2222    8, /* number of pixels, processed in a single block */ \
2223    5, /* prefetch distance */ \
2224    pixman_composite_over_8888_n_8888_init, \
2225    pixman_composite_over_8888_n_8888_cleanup, \
2226    pixman_composite_over_8888_n_8888_process_pixblock_head, \
2227    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2228    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2229
2230/******************************************************************************/
2231
2232/* TODO: expand macros and do better instructions scheduling */
2233.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2234    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
2235    pixman_composite_over_8888_n_8888_process_pixblock_tail
2236    fetch_src_pixblock
2237    cache_preload 8, 8
2238    fetch_mask_pixblock
2239    pixman_composite_over_8888_n_8888_process_pixblock_head
2240    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
2241.endm
2242
2243generate_composite_function \
2244    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2245    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2246    8, /* number of pixels, processed in a single block */ \
2247    5, /* prefetch distance */ \
2248    default_init_need_all_regs, \
2249    default_cleanup_need_all_regs, \
2250    pixman_composite_over_8888_n_8888_process_pixblock_head, \
2251    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2252    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2253    28, /* dst_w_basereg */ \
2254    4,  /* dst_r_basereg */ \
2255    0,  /* src_basereg   */ \
2256    12  /* mask_basereg  */
2257
2258generate_composite_function_single_scanline \
2259    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
2260    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2261    8, /* number of pixels, processed in a single block */ \
2262    default_init_need_all_regs, \
2263    default_cleanup_need_all_regs, \
2264    pixman_composite_over_8888_n_8888_process_pixblock_head, \
2265    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2266    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2267    28, /* dst_w_basereg */ \
2268    4,  /* dst_r_basereg */ \
2269    0,  /* src_basereg   */ \
2270    12  /* mask_basereg  */
2271
2272/******************************************************************************/
2273
2274/* TODO: expand macros and do better instructions scheduling */
2275.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2276    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
2277    pixman_composite_over_8888_n_8888_process_pixblock_tail
2278    fetch_src_pixblock
2279    cache_preload 8, 8
2280    fetch_mask_pixblock
2281    pixman_composite_over_8888_n_8888_process_pixblock_head
2282    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
2283.endm
2284
2285generate_composite_function \
2286    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2287    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2288    8, /* number of pixels, processed in a single block */ \
2289    5, /* prefetch distance */ \
2290    default_init_need_all_regs, \
2291    default_cleanup_need_all_regs, \
2292    pixman_composite_over_8888_n_8888_process_pixblock_head, \
2293    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2294    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
2295    28, /* dst_w_basereg */ \
2296    4,  /* dst_r_basereg */ \
2297    0,  /* src_basereg   */ \
2298    15  /* mask_basereg  */
2299
2300/******************************************************************************/
2301
2302.macro pixman_composite_src_0888_0888_process_pixblock_head
2303.endm
2304
2305.macro pixman_composite_src_0888_0888_process_pixblock_tail
2306.endm
2307
2308.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2309    vst3.8 {d0, d1, d2}, [DST_W]!
2310    fetch_src_pixblock
2311    cache_preload 8, 8
2312.endm
2313
2314generate_composite_function \
2315    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2316    FLAG_DST_WRITEONLY, \
2317    8, /* number of pixels, processed in a single block */ \
2318    10, /* prefetch distance */ \
2319    default_init, \
2320    default_cleanup, \
2321    pixman_composite_src_0888_0888_process_pixblock_head, \
2322    pixman_composite_src_0888_0888_process_pixblock_tail, \
2323    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2324    0, /* dst_w_basereg */ \
2325    0, /* dst_r_basereg */ \
2326    0, /* src_basereg   */ \
2327    0  /* mask_basereg  */
2328
2329/******************************************************************************/
2330
2331.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2332    vswp   d0, d2
2333.endm
2334
2335.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2336.endm
2337
2338.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2339    vst4.8 {d0, d1, d2, d3}, [DST_W]!
2340    fetch_src_pixblock
2341    vswp   d0, d2
2342    cache_preload 8, 8
2343.endm
2344
2345.macro pixman_composite_src_0888_8888_rev_init
2346    veor   d3, d3, d3
2347.endm
2348
2349generate_composite_function \
2350    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2351    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2352    8, /* number of pixels, processed in a single block */ \
2353    10, /* prefetch distance */ \
2354    pixman_composite_src_0888_8888_rev_init, \
2355    default_cleanup, \
2356    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2357    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2358    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2359    0, /* dst_w_basereg */ \
2360    0, /* dst_r_basereg */ \
2361    0, /* src_basereg   */ \
2362    0  /* mask_basereg  */
2363
2364/******************************************************************************/
2365
2366.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2367    vshll.u8    q8, d1, #8
2368    vshll.u8    q9, d2, #8
2369.endm
2370
2371.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2372    vshll.u8    q14, d0, #8
2373    vsri.u16    q14, q8, #5
2374    vsri.u16    q14, q9, #11
2375.endm
2376
2377.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2378        vshll.u8    q14, d0, #8
2379    fetch_src_pixblock
2380        vsri.u16    q14, q8, #5
2381        vsri.u16    q14, q9, #11
2382    vshll.u8    q8, d1, #8
2383        vst1.16 {d28, d29}, [DST_W, :128]!
2384    vshll.u8    q9, d2, #8
2385.endm
2386
2387generate_composite_function \
2388    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2389    FLAG_DST_WRITEONLY, \
2390    8, /* number of pixels, processed in a single block */ \
2391    10, /* prefetch distance */ \
2392    default_init, \
2393    default_cleanup, \
2394    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2395    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2396    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2397    28, /* dst_w_basereg */ \
2398    0, /* dst_r_basereg */ \
2399    0, /* src_basereg   */ \
2400    0  /* mask_basereg  */
2401
2402/******************************************************************************/
2403
2404.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2405    vmull.u8    q8, d3, d0
2406    vmull.u8    q9, d3, d1
2407    vmull.u8    q10, d3, d2
2408.endm
2409
2410.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2411    vrshr.u16   q11, q8, #8
2412    vswp        d3, d31
2413    vrshr.u16   q12, q9, #8
2414    vrshr.u16   q13, q10, #8
2415    vraddhn.u16 d30, q11, q8
2416    vraddhn.u16 d29, q12, q9
2417    vraddhn.u16 d28, q13, q10
2418.endm
2419
2420.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2421        vrshr.u16   q11, q8, #8
2422        vswp        d3, d31
2423        vrshr.u16   q12, q9, #8
2424        vrshr.u16   q13, q10, #8
2425    fetch_src_pixblock
2426        vraddhn.u16 d30, q11, q8
2427                                    PF add PF_X, PF_X, #8
2428                                    PF tst PF_CTL, #0xF
2429                                    PF addne PF_X, PF_X, #8
2430                                    PF subne PF_CTL, PF_CTL, #1
2431        vraddhn.u16 d29, q12, q9
2432        vraddhn.u16 d28, q13, q10
2433    vmull.u8    q8, d3, d0
2434    vmull.u8    q9, d3, d1
2435    vmull.u8    q10, d3, d2
2436        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2437                                    PF cmp PF_X, ORIG_W
2438                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2439                                    PF subge PF_X, PF_X, ORIG_W
2440                                    PF subges PF_CTL, PF_CTL, #0x10
2441                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2442.endm
2443
2444generate_composite_function \
2445    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2446    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2447    8, /* number of pixels, processed in a single block */ \
2448    10, /* prefetch distance */ \
2449    default_init, \
2450    default_cleanup, \
2451    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2452    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2453    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2454    28, /* dst_w_basereg */ \
2455    0, /* dst_r_basereg */ \
2456    0, /* src_basereg   */ \
2457    0  /* mask_basereg  */
2458
2459/******************************************************************************/
2460
2461.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2462    vmull.u8    q8, d3, d0
2463    vmull.u8    q9, d3, d1
2464    vmull.u8    q10, d3, d2
2465.endm
2466
2467.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2468    vrshr.u16   q11, q8, #8
2469    vswp        d3, d31
2470    vrshr.u16   q12, q9, #8
2471    vrshr.u16   q13, q10, #8
2472    vraddhn.u16 d28, q11, q8
2473    vraddhn.u16 d29, q12, q9
2474    vraddhn.u16 d30, q13, q10
2475.endm
2476
2477.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2478        vrshr.u16   q11, q8, #8
2479        vswp        d3, d31
2480        vrshr.u16   q12, q9, #8
2481        vrshr.u16   q13, q10, #8
2482    fetch_src_pixblock
2483        vraddhn.u16 d28, q11, q8
2484                                    PF add PF_X, PF_X, #8
2485                                    PF tst PF_CTL, #0xF
2486                                    PF addne PF_X, PF_X, #8
2487                                    PF subne PF_CTL, PF_CTL, #1
2488        vraddhn.u16 d29, q12, q9
2489        vraddhn.u16 d30, q13, q10
2490    vmull.u8    q8, d3, d0
2491    vmull.u8    q9, d3, d1
2492    vmull.u8    q10, d3, d2
2493        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2494                                    PF cmp PF_X, ORIG_W
2495                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2496                                    PF subge PF_X, PF_X, ORIG_W
2497                                    PF subges PF_CTL, PF_CTL, #0x10
2498                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2499.endm
2500
2501generate_composite_function \
2502    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2503    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2504    8, /* number of pixels, processed in a single block */ \
2505    10, /* prefetch distance */ \
2506    default_init, \
2507    default_cleanup, \
2508    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2509    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2510    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2511    28, /* dst_w_basereg */ \
2512    0, /* dst_r_basereg */ \
2513    0, /* src_basereg   */ \
2514    0  /* mask_basereg  */
2515
2516/******************************************************************************/
2517
2518.macro pixman_composite_over_0565_8_0565_process_pixblock_head
2519    /* mask is in d15 */
2520    convert_0565_to_x888 q4, d2, d1, d0
2521    convert_0565_to_x888 q5, d6, d5, d4
2522    /* source pixel data is in      {d0, d1, d2, XX} */
2523    /* destination pixel data is in {d4, d5, d6, XX} */
2524    vmvn.8      d7,  d15
2525    vmull.u8    q6,  d15, d2
2526    vmull.u8    q5,  d15, d1
2527    vmull.u8    q4,  d15, d0
2528    vmull.u8    q8,  d7,  d4
2529    vmull.u8    q9,  d7,  d5
2530    vmull.u8    q13, d7,  d6
2531    vrshr.u16   q12, q6,  #8
2532    vrshr.u16   q11, q5,  #8
2533    vrshr.u16   q10, q4,  #8
2534    vraddhn.u16 d2,  q6,  q12
2535    vraddhn.u16 d1,  q5,  q11
2536    vraddhn.u16 d0,  q4,  q10
2537.endm
2538
2539.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2540    vrshr.u16   q14, q8,  #8
2541    vrshr.u16   q15, q9,  #8
2542    vrshr.u16   q12, q13, #8
2543    vraddhn.u16 d28, q14, q8
2544    vraddhn.u16 d29, q15, q9
2545    vraddhn.u16 d30, q12, q13
2546    vqadd.u8    q0,  q0,  q14
2547    vqadd.u8    q1,  q1,  q15
2548    /* 32bpp result is in {d0, d1, d2, XX} */
2549    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2550.endm
2551
2552/* TODO: expand macros and do better instructions scheduling */
2553.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2554    fetch_mask_pixblock
2555    pixman_composite_over_0565_8_0565_process_pixblock_tail
2556    fetch_src_pixblock
2557    vld1.16    {d10, d11}, [DST_R, :128]!
2558    cache_preload 8, 8
2559    pixman_composite_over_0565_8_0565_process_pixblock_head
2560    vst1.16    {d28, d29}, [DST_W, :128]!
2561.endm
2562
2563generate_composite_function \
2564    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2565    FLAG_DST_READWRITE, \
2566    8, /* number of pixels, processed in a single block */ \
2567    5, /* prefetch distance */ \
2568    default_init_need_all_regs, \
2569    default_cleanup_need_all_regs, \
2570    pixman_composite_over_0565_8_0565_process_pixblock_head, \
2571    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2572    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2573    28, /* dst_w_basereg */ \
2574    10,  /* dst_r_basereg */ \
2575    8,  /* src_basereg   */ \
2576    15  /* mask_basereg  */
2577
2578/******************************************************************************/
2579
2580.macro pixman_composite_over_0565_n_0565_init
2581    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2582    vpush       {d8-d15}
2583    vld1.32     {d15[0]}, [DUMMY]
2584    vdup.8      d15, d15[3]
2585.endm
2586
2587.macro pixman_composite_over_0565_n_0565_cleanup
2588    vpop        {d8-d15}
2589.endm
2590
2591generate_composite_function \
2592    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2593    FLAG_DST_READWRITE, \
2594    8, /* number of pixels, processed in a single block */ \
2595    5, /* prefetch distance */ \
2596    pixman_composite_over_0565_n_0565_init, \
2597    pixman_composite_over_0565_n_0565_cleanup, \
2598    pixman_composite_over_0565_8_0565_process_pixblock_head, \
2599    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2600    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2601    28, /* dst_w_basereg */ \
2602    10, /* dst_r_basereg */ \
2603    8,  /* src_basereg   */ \
2604    15  /* mask_basereg  */
2605
2606/******************************************************************************/
2607
2608.macro pixman_composite_add_0565_8_0565_process_pixblock_head
2609    /* mask is in d15 */
2610    convert_0565_to_x888 q4, d2, d1, d0
2611    convert_0565_to_x888 q5, d6, d5, d4
2612    /* source pixel data is in      {d0, d1, d2, XX} */
2613    /* destination pixel data is in {d4, d5, d6, XX} */
2614    vmull.u8    q6,  d15, d2
2615    vmull.u8    q5,  d15, d1
2616    vmull.u8    q4,  d15, d0
2617    vrshr.u16   q12, q6,  #8
2618    vrshr.u16   q11, q5,  #8
2619    vrshr.u16   q10, q4,  #8
2620    vraddhn.u16 d2,  q6,  q12
2621    vraddhn.u16 d1,  q5,  q11
2622    vraddhn.u16 d0,  q4,  q10
2623.endm
2624
2625.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2626    vqadd.u8    q0,  q0,  q2
2627    vqadd.u8    q1,  q1,  q3
2628    /* 32bpp result is in {d0, d1, d2, XX} */
2629    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2630.endm
2631
2632/* TODO: expand macros and do better instructions scheduling */
2633.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2634    fetch_mask_pixblock
2635    pixman_composite_add_0565_8_0565_process_pixblock_tail
2636    fetch_src_pixblock
2637    vld1.16    {d10, d11}, [DST_R, :128]!
2638    cache_preload 8, 8
2639    pixman_composite_add_0565_8_0565_process_pixblock_head
2640    vst1.16    {d28, d29}, [DST_W, :128]!
2641.endm
2642
2643generate_composite_function \
2644    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2645    FLAG_DST_READWRITE, \
2646    8, /* number of pixels, processed in a single block */ \
2647    5, /* prefetch distance */ \
2648    default_init_need_all_regs, \
2649    default_cleanup_need_all_regs, \
2650    pixman_composite_add_0565_8_0565_process_pixblock_head, \
2651    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2652    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2653    28, /* dst_w_basereg */ \
2654    10, /* dst_r_basereg */ \
2655    8,  /* src_basereg   */ \
2656    15  /* mask_basereg  */
2657
2658/******************************************************************************/
2659
2660.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2661    /* mask is in d15 */
2662    convert_0565_to_x888 q5, d6, d5, d4
2663    /* destination pixel data is in {d4, d5, d6, xx} */
2664    vmvn.8      d24, d15 /* get inverted alpha */
2665    /* now do alpha blending */
2666    vmull.u8    q8, d24, d4
2667    vmull.u8    q9, d24, d5
2668    vmull.u8    q10, d24, d6
2669.endm
2670
2671.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2672    vrshr.u16   q14, q8, #8
2673    vrshr.u16   q15, q9, #8
2674    vrshr.u16   q12, q10, #8
2675    vraddhn.u16 d0, q14, q8
2676    vraddhn.u16 d1, q15, q9
2677    vraddhn.u16 d2, q12, q10
2678    /* 32bpp result is in {d0, d1, d2, XX} */
2679    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2680.endm
2681
2682/* TODO: expand macros and do better instructions scheduling */
2683.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2684    fetch_src_pixblock
2685    pixman_composite_out_reverse_8_0565_process_pixblock_tail
2686    vld1.16    {d10, d11}, [DST_R, :128]!
2687    cache_preload 8, 8
2688    pixman_composite_out_reverse_8_0565_process_pixblock_head
2689    vst1.16    {d28, d29}, [DST_W, :128]!
2690.endm
2691
2692generate_composite_function \
2693    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2694    FLAG_DST_READWRITE, \
2695    8, /* number of pixels, processed in a single block */ \
2696    5, /* prefetch distance */ \
2697    default_init_need_all_regs, \
2698    default_cleanup_need_all_regs, \
2699    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2700    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2701    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2702    28, /* dst_w_basereg */ \
2703    10, /* dst_r_basereg */ \
2704    15, /* src_basereg   */ \
2705    0   /* mask_basereg  */
2706
2707/******************************************************************************/
2708
2709.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2710    /* src is in d0 */
2711    /* destination pixel data is in {d4, d5, d6, d7} */
2712    vmvn.8      d1, d0 /* get inverted alpha */
2713    /* now do alpha blending */
2714    vmull.u8    q8, d1, d4
2715    vmull.u8    q9, d1, d5
2716    vmull.u8    q10, d1, d6
2717    vmull.u8    q11, d1, d7
2718.endm
2719
2720.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
2721    vrshr.u16   q14, q8, #8
2722    vrshr.u16   q15, q9, #8
2723    vrshr.u16   q12, q10, #8
2724    vrshr.u16   q13, q11, #8
2725    vraddhn.u16 d28, q14, q8
2726    vraddhn.u16 d29, q15, q9
2727    vraddhn.u16 d30, q12, q10
2728    vraddhn.u16 d31, q13, q11
2729    /* 32bpp result is in {d28, d29, d30, d31} */
2730.endm
2731
2732/* TODO: expand macros and do better instructions scheduling */
2733.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2734    fetch_src_pixblock
2735    pixman_composite_out_reverse_8_8888_process_pixblock_tail
2736    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
2737    cache_preload 8, 8
2738    pixman_composite_out_reverse_8_8888_process_pixblock_head
2739    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
2740.endm
2741
2742generate_composite_function \
2743    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2744    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2745    8, /* number of pixels, processed in a single block */ \
2746    5, /* prefetch distance */ \
2747    default_init, \
2748    default_cleanup, \
2749    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
2750    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
2751    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2752    28, /* dst_w_basereg */ \
2753    4, /* dst_r_basereg */ \
2754    0, /* src_basereg   */ \
2755    0   /* mask_basereg  */
2756
2757/******************************************************************************/
2758
2759generate_composite_function_nearest_scanline \
2760    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2761    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2762    8, /* number of pixels, processed in a single block */ \
2763    default_init, \
2764    default_cleanup, \
2765    pixman_composite_over_8888_8888_process_pixblock_head, \
2766    pixman_composite_over_8888_8888_process_pixblock_tail, \
2767    pixman_composite_over_8888_8888_process_pixblock_tail_head
2768
2769generate_composite_function_nearest_scanline \
2770    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2771    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2772    8, /* number of pixels, processed in a single block */ \
2773    default_init, \
2774    default_cleanup, \
2775    pixman_composite_over_8888_0565_process_pixblock_head, \
2776    pixman_composite_over_8888_0565_process_pixblock_tail, \
2777    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2778    28, /* dst_w_basereg */ \
2779    4,  /* dst_r_basereg */ \
2780    0,  /* src_basereg   */ \
2781    24  /* mask_basereg  */
2782
2783generate_composite_function_nearest_scanline \
2784    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2785    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2786    8, /* number of pixels, processed in a single block */ \
2787    default_init, \
2788    default_cleanup, \
2789    pixman_composite_src_8888_0565_process_pixblock_head, \
2790    pixman_composite_src_8888_0565_process_pixblock_tail, \
2791    pixman_composite_src_8888_0565_process_pixblock_tail_head
2792
2793generate_composite_function_nearest_scanline \
2794    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2795    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2796    8, /* number of pixels, processed in a single block */ \
2797    default_init, \
2798    default_cleanup, \
2799    pixman_composite_src_0565_8888_process_pixblock_head, \
2800    pixman_composite_src_0565_8888_process_pixblock_tail, \
2801    pixman_composite_src_0565_8888_process_pixblock_tail_head
2802
2803generate_composite_function_nearest_scanline \
2804    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2805    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2806    8, /* number of pixels, processed in a single block */ \
2807    default_init_need_all_regs, \
2808    default_cleanup_need_all_regs, \
2809    pixman_composite_over_8888_8_0565_process_pixblock_head, \
2810    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2811    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2812    28, /* dst_w_basereg */ \
2813    4,  /* dst_r_basereg */ \
2814    8,  /* src_basereg   */ \
2815    24  /* mask_basereg  */
2816
2817generate_composite_function_nearest_scanline \
2818    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2819    FLAG_DST_READWRITE, \
2820    8, /* number of pixels, processed in a single block */ \
2821    default_init_need_all_regs, \
2822    default_cleanup_need_all_regs, \
2823    pixman_composite_over_0565_8_0565_process_pixblock_head, \
2824    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2825    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2826    28, /* dst_w_basereg */ \
2827    10,  /* dst_r_basereg */ \
2828    8,  /* src_basereg   */ \
2829    15  /* mask_basereg  */
2830
2831/******************************************************************************/
2832
2833/* Supplementary macro for setting function attributes */
2834.macro pixman_asm_function fname
2835    .func fname
2836    .global fname
2837#ifdef __ELF__
2838    .hidden fname
2839    .type fname, %function
2840#endif
2841fname:
2842.endm
2843
2844/*
2845 * Bilinear scaling support code which tries to provide pixel fetching, color
2846 * format conversion, and interpolation as separate macros which can be used
2847 * as the basic building blocks for constructing bilinear scanline functions.
2848 */
2849
2850.macro bilinear_load_8888 reg1, reg2, tmp
2851    mov       TMP1, X, asr #16
2852    add       X, X, UX
2853    add       TMP1, TOP, TMP1, asl #2
2854    vld1.32   {reg1}, [TMP1], STRIDE
2855    vld1.32   {reg2}, [TMP1]
2856.endm
2857
2858.macro bilinear_load_0565 reg1, reg2, tmp
2859    mov       TMP1, X, asr #16
2860    add       X, X, UX
2861    add       TMP1, TOP, TMP1, asl #1
2862    vld1.32   {reg2[0]}, [TMP1], STRIDE
2863    vld1.32   {reg2[1]}, [TMP1]
2864    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
2865.endm
2866
2867.macro bilinear_load_and_vertical_interpolate_two_8888 \
2868                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2869
2870    bilinear_load_8888 reg1, reg2, tmp1
2871    vmull.u8  acc1, reg1, d28
2872    vmlal.u8  acc1, reg2, d29
2873    bilinear_load_8888 reg3, reg4, tmp2
2874    vmull.u8  acc2, reg3, d28
2875    vmlal.u8  acc2, reg4, d29
2876.endm
2877
2878.macro bilinear_load_and_vertical_interpolate_four_8888 \
2879                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2880                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2881
2882    bilinear_load_and_vertical_interpolate_two_8888 \
2883                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2884    bilinear_load_and_vertical_interpolate_two_8888 \
2885                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2886.endm
2887
2888.macro bilinear_load_and_vertical_interpolate_two_0565 \
2889                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2890
2891    mov       TMP1, X, asr #16
2892    add       X, X, UX
2893    add       TMP1, TOP, TMP1, asl #1
2894    mov       TMP2, X, asr #16
2895    add       X, X, UX
2896    add       TMP2, TOP, TMP2, asl #1
2897    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
2898    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
2899    vld1.32   {acc2lo[1]}, [TMP1]
2900    vld1.32   {acc2hi[1]}, [TMP2]
2901    convert_0565_to_x888 acc2, reg3, reg2, reg1
2902    vzip.u8   reg1, reg3
2903    vzip.u8   reg2, reg4
2904    vzip.u8   reg3, reg4
2905    vzip.u8   reg1, reg2
2906    vmull.u8  acc1, reg1, d28
2907    vmlal.u8  acc1, reg2, d29
2908    vmull.u8  acc2, reg3, d28
2909    vmlal.u8  acc2, reg4, d29
2910.endm
2911
2912.macro bilinear_load_and_vertical_interpolate_four_0565 \
2913                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2914                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2915
2916    mov       TMP1, X, asr #16
2917    add       X, X, UX
2918    add       TMP1, TOP, TMP1, asl #1
2919    mov       TMP2, X, asr #16
2920    add       X, X, UX
2921    add       TMP2, TOP, TMP2, asl #1
2922    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
2923    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
2924    vld1.32   {xacc2lo[1]}, [TMP1]
2925    vld1.32   {xacc2hi[1]}, [TMP2]
2926    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2927    mov       TMP1, X, asr #16
2928    add       X, X, UX
2929    add       TMP1, TOP, TMP1, asl #1
2930    mov       TMP2, X, asr #16
2931    add       X, X, UX
2932    add       TMP2, TOP, TMP2, asl #1
2933    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
2934    vzip.u8   xreg1, xreg3
2935    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
2936    vzip.u8   xreg2, xreg4
2937    vld1.32   {yacc2lo[1]}, [TMP1]
2938    vzip.u8   xreg3, xreg4
2939    vld1.32   {yacc2hi[1]}, [TMP2]
2940    vzip.u8   xreg1, xreg2
2941    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2942    vmull.u8  xacc1, xreg1, d28
2943    vzip.u8   yreg1, yreg3
2944    vmlal.u8  xacc1, xreg2, d29
2945    vzip.u8   yreg2, yreg4
2946    vmull.u8  xacc2, xreg3, d28
2947    vzip.u8   yreg3, yreg4
2948    vmlal.u8  xacc2, xreg4, d29
2949    vzip.u8   yreg1, yreg2
2950    vmull.u8  yacc1, yreg1, d28
2951    vmlal.u8  yacc1, yreg2, d29
2952    vmull.u8  yacc2, yreg3, d28
2953    vmlal.u8  yacc2, yreg4, d29
2954.endm
2955
2956.macro bilinear_store_8888 numpix, tmp1, tmp2
2957.if numpix == 4
2958    vst1.32   {d0, d1}, [OUT, :128]!
2959.elseif numpix == 2
2960    vst1.32   {d0}, [OUT, :64]!
2961.elseif numpix == 1
2962    vst1.32   {d0[0]}, [OUT, :32]!
2963.else
2964    .error bilinear_store_8888 numpix is unsupported
2965.endif
2966.endm
2967
2968.macro bilinear_store_0565 numpix, tmp1, tmp2
2969    vuzp.u8 d0, d1
2970    vuzp.u8 d2, d3
2971    vuzp.u8 d1, d3
2972    vuzp.u8 d0, d2
2973    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2974.if numpix == 4
2975    vst1.16   {d2}, [OUT, :64]!
2976.elseif numpix == 2
2977    vst1.32   {d2[0]}, [OUT, :32]!
2978.elseif numpix == 1
2979    vst1.16   {d2[0]}, [OUT, :16]!
2980.else
2981    .error bilinear_store_0565 numpix is unsupported
2982.endif
2983.endm
2984
2985.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2986    bilinear_load_&src_fmt d0, d1, d2
2987    vmull.u8  q1, d0, d28
2988    vmlal.u8  q1, d1, d29
2989    /* 5 cycles bubble */
2990    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
2991    vmlsl.u16 q0, d2, d30
2992    vmlal.u16 q0, d3, d30
2993    /* 5 cycles bubble */
2994    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
2995    /* 3 cycles bubble */
2996    vmovn.u16 d0, q0
2997    /* 1 cycle bubble */
2998    bilinear_store_&dst_fmt 1, q2, q3
2999.endm
3000
3001.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
3002    bilinear_load_and_vertical_interpolate_two_&src_fmt \
3003                q1, q11, d0, d1, d20, d21, d22, d23
3004    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3005    vmlsl.u16 q0, d2, d30
3006    vmlal.u16 q0, d3, d30
3007    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3008    vmlsl.u16 q10, d22, d31
3009    vmlal.u16 q10, d23, d31
3010    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3011    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3012    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3013    vadd.u16  q12, q12, q13
3014    vmovn.u16 d0, q0
3015    bilinear_store_&dst_fmt 2, q2, q3
3016.endm
3017
3018.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
3019    bilinear_load_and_vertical_interpolate_four_&src_fmt \
3020                q1, q11, d0, d1, d20, d21, d22, d23 \
3021                q3, q9,  d4, d5, d16, d17, d18, d19
3022    pld       [TMP1, PF_OFFS]
3023    sub       TMP1, TMP1, STRIDE
3024    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3025    vmlsl.u16 q0, d2, d30
3026    vmlal.u16 q0, d3, d30
3027    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3028    vmlsl.u16 q10, d22, d31
3029    vmlal.u16 q10, d23, d31
3030    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3031    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
3032    vmlsl.u16 q2, d6, d30
3033    vmlal.u16 q2, d7, d30
3034    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
3035    pld       [TMP2, PF_OFFS]
3036    vmlsl.u16 q8, d18, d31
3037    vmlal.u16 q8, d19, d31
3038    vadd.u16  q12, q12, q13
3039    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3040    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3041    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3042    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
3043    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3044    vmovn.u16 d0, q0
3045    vmovn.u16 d1, q2
3046    vadd.u16  q12, q12, q13
3047    bilinear_store_&dst_fmt 4, q2, q3
3048.endm
3049
3050.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3051.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3052    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
3053.else
3054    bilinear_interpolate_four_pixels src_fmt, dst_fmt
3055.endif
3056.endm
3057
3058.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3059.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3060    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
3061.endif
3062.endm
3063
3064.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3065.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3066    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
3067.else
3068    bilinear_interpolate_four_pixels src_fmt, dst_fmt
3069.endif
3070.endm
3071
3072.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3073.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3074    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
3075.else
3076    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3077    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3078.endif
3079.endm
3080
3081.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3082.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3083    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
3084.else
3085    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3086.endif
3087.endm
3088
3089.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3090.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3091    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
3092.else
3093    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3094    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3095.endif
3096.endm
3097
3098.set BILINEAR_FLAG_UNROLL_4,          0
3099.set BILINEAR_FLAG_UNROLL_8,          1
3100.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
3101
3102/*
3103 * Main template macro for generating NEON optimized bilinear scanline
3104 * functions.
3105 *
3106 * Bilinear scanline scaler macro template uses the following arguments:
3107 *  fname             - name of the function to generate
3108 *  src_fmt           - source color format (8888 or 0565)
3109 *  dst_fmt           - destination color format (8888 or 0565)
3110 *  bpp_shift         - (1 << bpp_shift) is the size of source pixel in bytes
3111 *  prefetch_distance - prefetch in the source image by that many
3112 *                      pixels ahead
3113 */
3114
3115.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
3116                                       src_bpp_shift, dst_bpp_shift, \
3117                                       prefetch_distance, flags
3118
3119pixman_asm_function fname
3120    OUT       .req      r0
3121    TOP       .req      r1
3122    BOTTOM    .req      r2
3123    WT        .req      r3
3124    WB        .req      r4
3125    X         .req      r5
3126    UX        .req      r6
3127    WIDTH     .req      ip
3128    TMP1      .req      r3
3129    TMP2      .req      r4
3130    PF_OFFS   .req      r7
3131    TMP3      .req      r8
3132    TMP4      .req      r9
3133    STRIDE    .req      r2
3134
3135    mov       ip, sp
3136    push      {r4, r5, r6, r7, r8, r9}
3137    mov       PF_OFFS, #prefetch_distance
3138    ldmia     ip, {WB, X, UX, WIDTH}
3139    mul       PF_OFFS, PF_OFFS, UX
3140
3141.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3142    vpush     {d8-d15}
3143.endif
3144
3145    sub       STRIDE, BOTTOM, TOP
3146    .unreq    BOTTOM
3147
3148    cmp       WIDTH, #0
3149    ble       3f
3150
3151    vdup.u16  q12, X
3152    vdup.u16  q13, UX
3153    vdup.u8   d28, WT
3154    vdup.u8   d29, WB
3155    vadd.u16  d25, d25, d26
3156
3157    /* ensure good destination alignment  */
3158    cmp       WIDTH, #1
3159    blt       0f
3160    tst       OUT, #(1 << dst_bpp_shift)
3161    beq       0f
3162    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3163    vadd.u16  q12, q12, q13
3164    bilinear_interpolate_last_pixel src_fmt, dst_fmt
3165    sub       WIDTH, WIDTH, #1
31660:
3167    vadd.u16  q13, q13, q13
3168    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3169    vadd.u16  q12, q12, q13
3170
3171    cmp       WIDTH, #2
3172    blt       0f
3173    tst       OUT, #(1 << (dst_bpp_shift + 1))
3174    beq       0f
3175    bilinear_interpolate_two_pixels src_fmt, dst_fmt
3176    sub       WIDTH, WIDTH, #2
31770:
3178.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
3179/*********** 8 pixels per iteration *****************/
3180    cmp       WIDTH, #4
3181    blt       0f
3182    tst       OUT, #(1 << (dst_bpp_shift + 2))
3183    beq       0f
3184    bilinear_interpolate_four_pixels src_fmt, dst_fmt
3185    sub       WIDTH, WIDTH, #4
31860:
3187    subs      WIDTH, WIDTH, #8
3188    blt       1f
3189    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3190    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3191    subs      WIDTH, WIDTH, #8
3192    blt       5f
31930:
3194    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3195    subs      WIDTH, WIDTH, #8
3196    bge       0b
31975:
3198    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
31991:
3200    tst       WIDTH, #4
3201    beq       2f
3202    bilinear_interpolate_four_pixels src_fmt, dst_fmt
32032:
3204.else
3205/*********** 4 pixels per iteration *****************/
3206    subs      WIDTH, WIDTH, #4
3207    blt       1f
3208    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3209    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3210    subs      WIDTH, WIDTH, #4
3211    blt       5f
32120:
3213    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3214    subs      WIDTH, WIDTH, #4
3215    bge       0b
32165:
3217    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
32181:
3219/****************************************************/
3220.endif
3221    /* handle the remaining trailing pixels */
3222    tst       WIDTH, #2
3223    beq       2f
3224    bilinear_interpolate_two_pixels src_fmt, dst_fmt
32252:
3226    tst       WIDTH, #1
3227    beq       3f
3228    bilinear_interpolate_last_pixel src_fmt, dst_fmt
32293:
3230.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3231    vpop      {d8-d15}
3232.endif
3233    pop       {r4, r5, r6, r7, r8, r9}
3234    bx        lr
3235
3236    .unreq    OUT
3237    .unreq    TOP
3238    .unreq    WT
3239    .unreq    WB
3240    .unreq    X
3241    .unreq    UX
3242    .unreq    WIDTH
3243    .unreq    TMP1
3244    .unreq    TMP2
3245    .unreq    PF_OFFS
3246    .unreq    TMP3
3247    .unreq    TMP4
3248    .unreq    STRIDE
3249.endfunc
3250
3251.endm
3252
3253/*****************************************************************************/
3254
3255.set have_bilinear_interpolate_four_pixels_8888_8888, 1
3256
3257.macro bilinear_interpolate_four_pixels_8888_8888_head
3258    mov       TMP1, X, asr #16
3259    add       X, X, UX
3260    add       TMP1, TOP, TMP1, asl #2
3261    mov       TMP2, X, asr #16
3262    add       X, X, UX
3263    add       TMP2, TOP, TMP2, asl #2
3264
3265    vld1.32   {d22}, [TMP1], STRIDE
3266    vld1.32   {d23}, [TMP1]
3267    mov       TMP3, X, asr #16
3268    add       X, X, UX
3269    add       TMP3, TOP, TMP3, asl #2
3270    vmull.u8  q8, d22, d28
3271    vmlal.u8  q8, d23, d29
3272
3273    vld1.32   {d22}, [TMP2], STRIDE
3274    vld1.32   {d23}, [TMP2]
3275    mov       TMP4, X, asr #16
3276    add       X, X, UX
3277    add       TMP4, TOP, TMP4, asl #2
3278    vmull.u8  q9, d22, d28
3279    vmlal.u8  q9, d23, d29
3280
3281    vld1.32   {d22}, [TMP3], STRIDE
3282    vld1.32   {d23}, [TMP3]
3283    vmull.u8  q10, d22, d28
3284    vmlal.u8  q10, d23, d29
3285
3286    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3287    vmlsl.u16 q0, d16, d30
3288    vmlal.u16 q0, d17, d30
3289
3290    pld       [TMP4, PF_OFFS]
3291    vld1.32   {d16}, [TMP4], STRIDE
3292    vld1.32   {d17}, [TMP4]
3293    pld       [TMP4, PF_OFFS]
3294    vmull.u8  q11, d16, d28
3295    vmlal.u8  q11, d17, d29
3296
3297    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3298    vmlsl.u16 q1, d18, d31
3299.endm
3300
3301.macro bilinear_interpolate_four_pixels_8888_8888_tail
3302    vmlal.u16 q1, d19, d31
3303    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3304    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3305    vmlsl.u16 q2, d20, d30
3306    vmlal.u16 q2, d21, d30
3307    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3308    vmlsl.u16 q3, d22, d31
3309    vmlal.u16 q3, d23, d31
3310    vadd.u16  q12, q12, q13
3311    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3312    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3313    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3314    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3315    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3316    vmovn.u16 d6, q0
3317    vmovn.u16 d7, q2
3318    vadd.u16  q12, q12, q13
3319    vst1.32   {d6, d7}, [OUT, :128]!
3320.endm
3321
3322.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
3323    mov       TMP1, X, asr #16
3324    add       X, X, UX
3325    add       TMP1, TOP, TMP1, asl #2
3326    mov       TMP2, X, asr #16
3327    add       X, X, UX
3328    add       TMP2, TOP, TMP2, asl #2
3329        vmlal.u16 q1, d19, d31
3330        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3331        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3332        vmlsl.u16 q2, d20, d30
3333        vmlal.u16 q2, d21, d30
3334        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3335    vld1.32   {d20}, [TMP1], STRIDE
3336        vmlsl.u16 q3, d22, d31
3337        vmlal.u16 q3, d23, d31
3338    vld1.32   {d21}, [TMP1]
3339    vmull.u8  q8, d20, d28
3340    vmlal.u8  q8, d21, d29
3341        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3342        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3343        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3344    vld1.32   {d22}, [TMP2], STRIDE
3345        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3346        vadd.u16  q12, q12, q13
3347    vld1.32   {d23}, [TMP2]
3348    vmull.u8  q9, d22, d28
3349    mov       TMP3, X, asr #16
3350    add       X, X, UX
3351    add       TMP3, TOP, TMP3, asl #2
3352    mov       TMP4, X, asr #16
3353    add       X, X, UX
3354    add       TMP4, TOP, TMP4, asl #2
3355    vmlal.u8  q9, d23, d29
3356    vld1.32   {d22}, [TMP3], STRIDE
3357        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3358    vld1.32   {d23}, [TMP3]
3359    vmull.u8  q10, d22, d28
3360    vmlal.u8  q10, d23, d29
3361        vmovn.u16 d6, q0
3362    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3363        vmovn.u16 d7, q2
3364    vmlsl.u16 q0, d16, d30
3365    vmlal.u16 q0, d17, d30
3366    pld       [TMP4, PF_OFFS]
3367    vld1.32   {d16}, [TMP4], STRIDE
3368        vadd.u16  q12, q12, q13
3369    vld1.32   {d17}, [TMP4]
3370    pld       [TMP4, PF_OFFS]
3371    vmull.u8  q11, d16, d28
3372    vmlal.u8  q11, d17, d29
3373        vst1.32   {d6, d7}, [OUT, :128]!
3374    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3375    vmlsl.u16 q1, d18, d31
3376.endm
3377
3378/*****************************************************************************/
3379
3380.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
3381
3382.macro bilinear_interpolate_eight_pixels_8888_0565_head
3383    mov       TMP1, X, asr #16
3384    add       X, X, UX
3385    add       TMP1, TOP, TMP1, asl #2
3386    mov       TMP2, X, asr #16
3387    add       X, X, UX
3388    add       TMP2, TOP, TMP2, asl #2
3389    vld1.32   {d20}, [TMP1], STRIDE
3390    vld1.32   {d21}, [TMP1]
3391    vmull.u8  q8, d20, d28
3392    vmlal.u8  q8, d21, d29
3393    vld1.32   {d22}, [TMP2], STRIDE
3394    vld1.32   {d23}, [TMP2]
3395    vmull.u8  q9, d22, d28
3396    mov       TMP3, X, asr #16
3397    add       X, X, UX
3398    add       TMP3, TOP, TMP3, asl #2
3399    mov       TMP4, X, asr #16
3400    add       X, X, UX
3401    add       TMP4, TOP, TMP4, asl #2
3402    vmlal.u8  q9, d23, d29
3403    vld1.32   {d22}, [TMP3], STRIDE
3404    vld1.32   {d23}, [TMP3]
3405    vmull.u8  q10, d22, d28
3406    vmlal.u8  q10, d23, d29
3407    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3408    vmlsl.u16 q0, d16, d30
3409    vmlal.u16 q0, d17, d30
3410    pld       [TMP4, PF_OFFS]
3411    vld1.32   {d16}, [TMP4], STRIDE
3412    vld1.32   {d17}, [TMP4]
3413    pld       [TMP4, PF_OFFS]
3414    vmull.u8  q11, d16, d28
3415    vmlal.u8  q11, d17, d29
3416    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3417    vmlsl.u16 q1, d18, d31
3418
3419    mov       TMP1, X, asr #16
3420    add       X, X, UX
3421    add       TMP1, TOP, TMP1, asl #2
3422    mov       TMP2, X, asr #16
3423    add       X, X, UX
3424    add       TMP2, TOP, TMP2, asl #2
3425        vmlal.u16 q1, d19, d31
3426        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3427        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3428        vmlsl.u16 q2, d20, d30
3429        vmlal.u16 q2, d21, d30
3430        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3431    vld1.32   {d20}, [TMP1], STRIDE
3432        vmlsl.u16 q3, d22, d31
3433        vmlal.u16 q3, d23, d31
3434    vld1.32   {d21}, [TMP1]
3435    vmull.u8  q8, d20, d28
3436    vmlal.u8  q8, d21, d29
3437        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3438        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3439        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3440    vld1.32   {d22}, [TMP2], STRIDE
3441        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3442        vadd.u16  q12, q12, q13
3443    vld1.32   {d23}, [TMP2]
3444    vmull.u8  q9, d22, d28
3445    mov       TMP3, X, asr #16
3446    add       X, X, UX
3447    add       TMP3, TOP, TMP3, asl #2
3448    mov       TMP4, X, asr #16
3449    add       X, X, UX
3450    add       TMP4, TOP, TMP4, asl #2
3451    vmlal.u8  q9, d23, d29
3452    vld1.32   {d22}, [TMP3], STRIDE
3453        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3454    vld1.32   {d23}, [TMP3]
3455    vmull.u8  q10, d22, d28
3456    vmlal.u8  q10, d23, d29
3457        vmovn.u16 d8, q0
3458    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3459        vmovn.u16 d9, q2
3460    vmlsl.u16 q0, d16, d30
3461    vmlal.u16 q0, d17, d30
3462    pld       [TMP4, PF_OFFS]
3463    vld1.32   {d16}, [TMP4], STRIDE
3464        vadd.u16  q12, q12, q13
3465    vld1.32   {d17}, [TMP4]
3466    pld       [TMP4, PF_OFFS]
3467    vmull.u8  q11, d16, d28
3468    vmlal.u8  q11, d17, d29
3469    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3470    vmlsl.u16 q1, d18, d31
3471.endm
3472
3473.macro bilinear_interpolate_eight_pixels_8888_0565_tail
3474    vmlal.u16 q1, d19, d31
3475    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3476    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3477    vmlsl.u16 q2, d20, d30
3478    vmlal.u16 q2, d21, d30
3479    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3480    vmlsl.u16 q3, d22, d31
3481    vmlal.u16 q3, d23, d31
3482    vadd.u16  q12, q12, q13
3483    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3484    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3485    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3486    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3487    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3488    vmovn.u16 d10, q0
3489    vmovn.u16 d11, q2
3490    vadd.u16  q12, q12, q13
3491
3492    vuzp.u8   d8, d9
3493    vuzp.u8   d10, d11
3494    vuzp.u8   d9, d11
3495    vuzp.u8   d8, d10
3496    vshll.u8  q6, d9, #8
3497    vshll.u8  q5, d10, #8
3498    vshll.u8  q7, d8, #8
3499    vsri.u16  q5, q6, #5
3500    vsri.u16  q5, q7, #11
3501    vst1.32   {d10, d11}, [OUT, :128]!
3502.endm
3503
3504.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
3505    mov       TMP1, X, asr #16
3506    add       X, X, UX
3507    add       TMP1, TOP, TMP1, asl #2
3508    mov       TMP2, X, asr #16
3509    add       X, X, UX
3510    add       TMP2, TOP, TMP2, asl #2
3511        vmlal.u16 q1, d19, d31
3512        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3513            vuzp.u8 d8, d9
3514        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3515        vmlsl.u16 q2, d20, d30
3516        vmlal.u16 q2, d21, d30
3517        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3518    vld1.32   {d20}, [TMP1], STRIDE
3519        vmlsl.u16 q3, d22, d31
3520        vmlal.u16 q3, d23, d31
3521    vld1.32   {d21}, [TMP1]
3522    vmull.u8  q8, d20, d28
3523    vmlal.u8  q8, d21, d29
3524        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3525        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3526        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3527    vld1.32   {d22}, [TMP2], STRIDE
3528        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3529        vadd.u16  q12, q12, q13
3530    vld1.32   {d23}, [TMP2]
3531    vmull.u8  q9, d22, d28
3532    mov       TMP3, X, asr #16
3533    add       X, X, UX
3534    add       TMP3, TOP, TMP3, asl #2
3535    mov       TMP4, X, asr #16
3536    add       X, X, UX
3537    add       TMP4, TOP, TMP4, asl #2
3538    vmlal.u8  q9, d23, d29
3539    vld1.32   {d22}, [TMP3], STRIDE
3540        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3541    vld1.32   {d23}, [TMP3]
3542    vmull.u8  q10, d22, d28
3543    vmlal.u8  q10, d23, d29
3544        vmovn.u16 d10, q0
3545    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3546        vmovn.u16 d11, q2
3547    vmlsl.u16 q0, d16, d30
3548    vmlal.u16 q0, d17, d30
3549    pld       [TMP4, PF_OFFS]
3550    vld1.32   {d16}, [TMP4], STRIDE
3551        vadd.u16  q12, q12, q13
3552    vld1.32   {d17}, [TMP4]
3553    pld       [TMP4, PF_OFFS]
3554    vmull.u8  q11, d16, d28
3555    vmlal.u8  q11, d17, d29
3556            vuzp.u8 d10, d11
3557    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3558    vmlsl.u16 q1, d18, d31
3559
3560    mov       TMP1, X, asr #16
3561    add       X, X, UX
3562    add       TMP1, TOP, TMP1, asl #2
3563    mov       TMP2, X, asr #16
3564    add       X, X, UX
3565    add       TMP2, TOP, TMP2, asl #2
3566        vmlal.u16 q1, d19, d31
3567            vuzp.u8 d9, d11
3568        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3569        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3570            vuzp.u8 d8, d10
3571        vmlsl.u16 q2, d20, d30
3572        vmlal.u16 q2, d21, d30
3573        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3574    vld1.32   {d20}, [TMP1], STRIDE
3575        vmlsl.u16 q3, d22, d31
3576        vmlal.u16 q3, d23, d31
3577    vld1.32   {d21}, [TMP1]
3578    vmull.u8  q8, d20, d28
3579    vmlal.u8  q8, d21, d29
3580            vshll.u8  q6, d9, #8
3581            vshll.u8  q5, d10, #8
3582            vshll.u8  q7, d8, #8
3583        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3584            vsri.u16  q5, q6, #5
3585        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3586            vsri.u16  q5, q7, #11
3587        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3588    vld1.32   {d22}, [TMP2], STRIDE
3589        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3590        vadd.u16  q12, q12, q13
3591    vld1.32   {d23}, [TMP2]
3592    vmull.u8  q9, d22, d28
3593    mov       TMP3, X, asr #16
3594    add       X, X, UX
3595    add       TMP3, TOP, TMP3, asl #2
3596    mov       TMP4, X, asr #16
3597    add       X, X, UX
3598    add       TMP4, TOP, TMP4, asl #2
3599    vmlal.u8  q9, d23, d29
3600    vld1.32   {d22}, [TMP3], STRIDE
3601        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3602    vld1.32   {d23}, [TMP3]
3603    vmull.u8  q10, d22, d28
3604    vmlal.u8  q10, d23, d29
3605        vmovn.u16 d8, q0
3606    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3607        vmovn.u16 d9, q2
3608    vmlsl.u16 q0, d16, d30
3609    vmlal.u16 q0, d17, d30
3610    pld       [TMP4, PF_OFFS]
3611    vld1.32   {d16}, [TMP4], STRIDE
3612        vadd.u16  q12, q12, q13
3613    vld1.32   {d17}, [TMP4]
3614    pld       [TMP4, PF_OFFS]
3615    vmull.u8  q11, d16, d28
3616    vmlal.u8  q11, d17, d29
3617    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3618            vst1.32   {d10, d11}, [OUT, :128]!
3619    vmlsl.u16 q1, d18, d31
3620.endm
3621/*****************************************************************************/
3622
3623generate_bilinear_scanline_func \
3624    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
3625    2, 2, 28, BILINEAR_FLAG_UNROLL_4
3626
3627generate_bilinear_scanline_func \
3628    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
3629    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
3630
3631generate_bilinear_scanline_func \
3632    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
3633    1, 2, 28, BILINEAR_FLAG_UNROLL_4
3634
3635generate_bilinear_scanline_func \
3636    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
3637    1, 1, 28, BILINEAR_FLAG_UNROLL_4
3638