/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions based on a common template.
 * Any combination of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp or 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros which implement the basic code chunks responsible for pixel
 * processing. See the 'pixman-arm-neon-asm.S' file for usage examples
 * (and the sketch below for the general shape of an invocation).
 *
 * TODO:
 *  - try the overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */
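
/*
 * For illustration only, a typical invocation of this macro looks
 * roughly like the following sketch (modeled on pixman-arm-neon-asm.S;
 * the *_head, *_tail and *_tail_head macro names stand for the pixel
 * processing chunks which the user must define):
 *
 *  generate_composite_function \
 *      pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
 *      FLAG_DST_WRITEONLY, \
 *      8,  (pixblock size) \
 *      10, (prefetch distance) \
 *      default_init, \
 *      default_cleanup, \
 *      src_8888_8888_process_pixblock_head, \
 *      src_8888_8888_process_pixblock_tail, \
 *      src_8888_8888_process_pixblock_tail_head
 */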

/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,       0
.set FLAG_DST_READWRITE,       1
.set FLAG_DEINTERLEAVE_32BPP,  2

/*
 * Offset in the stack where the mask and source pointers/strides can be
 * accessed from the 'init' macro. This is useful for doing special
 * handling for a solid mask.
 */
.set ARGS_STACK_OFFSET,        40
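
/*
 * As an illustrative sketch (a hypothetical 'init' macro, not part of
 * this template): a generated function with a solid source receives the
 * color value in place of the source pointer, so its 'init' macro can
 * fetch it from the stack and broadcast it into NEON registers:
 *
 *  .macro init_solid_src
 *      add         DUMMY, sp, #ARGS_STACK_OFFSET
 *      vld1.32     {d3[0]}, [DUMMY]
 *      vdup.8      d0, d3[0]    (broadcast one color component)
 *  .endm
 */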

/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm
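
/*
 * As an example, 'pixldst1 vld1, 16, 0, SRC, 64' expands to
 * 'vld1.16 {d0}, [SRC, :64]!', i.e. a post-incremented load of one
 * 64-bit D register with a 64-bit alignment hint.
 */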

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm

.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                              %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm
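
/*
 * Note: RESPECT_STRICT_ALIGNMENT is expected to be set by the file which
 * includes this template (pixman-arm-neon-asm.S sets it to 1). When it is
 * set to 0, a 4 or 2 byte partial access may be done with a single wider
 * element load/store even if the element size suggests otherwise.
 */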

.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm
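
/*
 * For example, 'pixld_a 8, 16, 0, DST_R' loads eight 16bpp pixels:
 * 8 * 16 = 128 bits, which is <= 128, so the full transfer size is used
 * as the alignment hint and the macro expands to
 * 'vld1.16 {d2, d3}, [DST_R, :128]!'.
 */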

/*
 * Pixel fetcher for nearest scaling (needs the TMP1, TMP2, VX, UNIT_X and
 * SRC_WIDTH_FIXED register aliases to be defined).
 */
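
/*
 * VX holds the current source x coordinate and UNIT_X the per-pixel step,
 * both in 16.16 fixed point format, so 'asr #16' extracts the integer
 * pixel index. After each step, the subpls/bpl pair reduces VX by
 * SRC_WIDTH_FIXED (the source width in the same 16.16 format) to keep a
 * repeating source within one scanline. These aliases are set up by
 * 'generate_composite_function_scanline' below when nearest scaling is
 * enabled.
 */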
.macro pixld1_s elem_size, reg1, mem_operand
.if elem_size == 16
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[0]}, [TMP1, :16]
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[1]}, [TMP2, :16]
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[2]}, [TMP1, :16]
    vld1.16 {d&reg1&[3]}, [TMP2, :16]
.elseif elem_size == 32
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #2
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    vld1.32 {d&reg1&[1]}, [TMP2, :32]
.else
    .error "unsupported"
.endif
.endm

.macro pixld2_s elem_size, reg1, reg2, mem_operand
.if 0 /* elem_size == 32 */
    mov     TMP1, VX, asr #16
    add     VX, VX, UNIT_X, asl #1
    add     TMP1, mem_operand, TMP1, asl #2
    mov     TMP2, VX, asr #16
    sub     VX, VX, UNIT_X
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    mov     TMP1, VX, asr #16
    add     VX, VX, UNIT_X, asl #1
    add     TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg2&[0]}, [TMP2, :32]
    mov     TMP2, VX, asr #16
    add     VX, VX, UNIT_X
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[1]}, [TMP1, :32]
    vld1.32 {d&reg2&[1]}, [TMP2, :32]
.else
    pixld1_s elem_size, reg1, mem_operand
    pixld1_s elem_size, reg2, mem_operand
.endif
.endm

.macro pixld0_s elem_size, reg1, idx, mem_operand
.if elem_size == 16
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
.elseif elem_size == 32
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
.endif
.endm

.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
.if numbytes == 32
    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
    pixdeinterleave elem_size, %(basereg+4)
.elseif numbytes == 16
    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
.elseif numbytes == 8
    pixld1_s elem_size, %(basereg+1), mem_operand
.elseif numbytes == 4
    .if elem_size == 32
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .elseif elem_size == 16
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 4, mem_operand
        pixld0_s elem_size, %(basereg+0), 5, mem_operand
        pixld0_s elem_size, %(basereg+0), 6, mem_operand
        pixld0_s elem_size, %(basereg+0), 7, mem_operand
    .endif
.elseif numbytes == 2
    .if elem_size == 16
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .endif
.elseif numbytes == 1
    pixld0_s elem_size, %(basereg+0), 1, mem_operand
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld_s numpix, bpp, basereg, mem_operand
.if bpp > 0
    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
.endif
.endm

.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm
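
/*
 * As an example, with basereg = 0 and eight interleaved 32bpp pixels in
 * d0-d3 (byte order b, g, r, a in memory), 'pixdeinterleave 32, 0'
 * leaves the b, g, r and a planes in d0, d1, d2 and d3 respectively -
 * the same layout that a vld4.8 load produces.
 */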

/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent of the rest of the pixel
 * processing code. It starts at the top left pixel and moves forward
 * across pixels, and can jump across scanlines. The prefetch distance is
 * handled in an 'incremental' way: it starts from 0 and advances to the
 * optimal distance over time. After reaching the optimal prefetch
 * distance, it is kept constant. There are some checks which prevent
 * prefetching unneeded pixel lines below the image (but it still can
 * prefetch a bit more data on the right side of the image - not a big
 * issue, and it may actually be helpful when rendering text glyphs). An
 * additional trick is the use of an LDR instruction for prefetch instead
 * of PLD when moving to the next line; the point is that we have a high
 * chance of getting a TLB miss in this case, and PLD would be useless.
 *
 * This sounds like it may introduce a noticeable overhead (when working
 * with fully cached data). But in reality, because the NEON unit in the
 * ARM Cortex-A8 has a separate pipeline and instruction queue, normal ARM
 * code can execute simultaneously with NEON code and be completely
 * shadowed by it. Thus we get no performance overhead at all (*). This
 * looks like a very nice feature of the Cortex-A8, if used wisely. We
 * don't have a hardware prefetcher, but we can still implement some
 * rather advanced prefetch logic in software for almost zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some trivial
 * pixel processing like a simple copy. In any case, having prefetch is a
 * must when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm
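
/*
 * For example, 'PF add PF_X, PF_X, #pixblock_size' assembles to a plain
 * 'add' instruction when PREFETCH_TYPE_ADVANCED is selected and to
 * nothing at all otherwise, so the advanced prefetch code can be freely
 * interleaved with the pixel processing code at no cost when disabled.
 */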

.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm

.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm
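
/*
 * Note: PREFETCH_DISTANCE_SIMPLE (measured in pixels) is not defined in
 * this file; the including file (e.g. pixman-arm-neon-asm.S) is expected
 * to provide it before generating functions which use simple prefetch.
 */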

.macro fetch_mask_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
.endm

/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (at a 16 byte boundary). When the
 * destination buffer uses the 24bpp format, this is unnecessary, or
 * even pointless (16 byte alignment cannot be maintained across 3-byte
 * pixels), so the alignment code is skipped in that case.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst         DST_R, #0xF
    beq         2f

.irp lowbit, 1, 2, 4, 8, 16
local skip1
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_R, #lowbit
    beq         1f
.endif
    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add         DST_R, DST_R, #lowbit
.endif
    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub         W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_W, #lowbit
    beq         1f
.endif
    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm

/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As the SIMD processing performs an operation on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots naturally stay unused), then perform the
 * compositing operation as usual. In the end, the data is taken from
 * these 'slots' and saved to memory.
 *
 * cache_preload_flag - if set to 0, suppresses prefetch
 * dst_aligned_flag   - selects whether the destination buffer
 *                      is aligned
 */
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst         W, #(pixblock_size - 1)
    beq         2f
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
    pixld_src   chunk_size, src_bpp, src_basereg, SRC
    pixld       chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add      PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
.if dst_aligned_flag != 0
    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm

/*
 * Macro which performs all the needed operations to switch to the next
 * scanline and start the next loop iteration, unless all the scanlines
 * are already processed.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    ldrd        W, [sp] /* load W and H (width and height) from stack */
.else
    mov         W, ORIG_W
.endif
    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub         SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub         MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs        H, H, #1
    mov         DST_R, DST_W
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.endif
    bge         start_of_loop_label
.endm

/*
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3     - reserved for loading source pixel data
 * d4, d5, d6, d7     - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_   = 0, \
                                   mask_basereg_  = 24

    .func fname
    .global fname
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    push        {r4-r12, lr}        /* save all registers */

/*
 * Select prefetch type for this function. If prefetch distance is
 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
 * has to be used instead of ADVANCED.
 */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

    .macro pixld_src x:vararg
        pixld x
    .endm
    .macro fetch_src_pixblock
        pixld_src   pixblock_size, src_bpp, \
                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm
/*
 * Assign symbolic names to registers
 */
    W           .req        r0      /* width (is updated during processing) */
    H           .req        r1      /* height (is updated during processing) */
    DST_W       .req        r2      /* destination buffer pointer for writes */
    DST_STRIDE  .req        r3      /* destination image stride */
    SRC         .req        r4      /* source buffer pointer */
    SRC_STRIDE  .req        r5      /* source image stride */
    DST_R       .req        r6      /* destination buffer pointer for reads */

    MASK        .req        r7      /* mask pointer */
    MASK_STRIDE .req        r8      /* mask stride */

    PF_CTL      .req        r9      /* combined lines counter and prefetch */
                                    /* distance increment counter */
    PF_X        .req        r10     /* pixel index in a scanline for current */
                                    /* prefetch position */
    PF_SRC      .req        r11     /* pointer to source scanline start */
                                    /* for prefetch purposes */
    PF_DST      .req        r12     /* pointer to destination scanline start */
                                    /* for prefetch purposes */
    PF_MASK     .req        r14     /* pointer to mask scanline start */
                                    /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, the original width and height are
 * kept on top of the stack (and the 'regs_shortage' variable is set to
 * indicate this for the rest of the code). Even if there are enough
 * registers, the allocation scheme may be a bit different depending on
 * whether the source or the mask is unused.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req        r10     /* saved original width */
    DUMMY       .req        r12     /* temporary register */
    .set        regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req        r7      /* saved original width */
    DUMMY       .req        r8      /* temporary register */
    .set        regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req        r4      /* saved original width */
    DUMMY       .req        r5      /* temporary register */
    .set        regs_shortage, 0
.else
    ORIG_W      .req        r1      /* saved original width */
    DUMMY       .req        r1      /* temporary register */
    .set        regs_shortage, 1
.endif

    .set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

.if src_bpp > 0
    ldr         SRC, [sp, #40]
.endif
.if mask_bpp > 0
    ldr         MASK, [sp, #48]
.endif
    PF mov      PF_X, #0
.if src_bpp > 0
    ldr         SRC_STRIDE, [sp, #44]
.endif
.if mask_bpp > 0
    ldr         MASK_STRIDE, [sp, #52]
.endif
    mov         DST_R, DST_W

.if src_bpp == 24
    sub         SRC_STRIDE, SRC_STRIDE, W
    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub         MASK_STRIDE, MASK_STRIDE, W
    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub         DST_STRIDE, DST_STRIDE, W
    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Setup advanced prefetcher initial state
 */
    PF mov      PF_SRC, SRC
    PF mov      PF_DST, DST_R
    PF mov      PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov      PF_CTL, H, lsl #4
    PF add      PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    push        {r0, r1}
.endif
    subs        H, H, #1
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.else
    mov         ORIG_W, W
.endif
    blt         9f
    cmp         W, #(pixblock_size * 2)
    blt         8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add      PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs        W, W, #(pixblock_size * 2)
    blt         2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */
/*
 * This is the start of the loop, designed to process images with small
 * width (less than pixblock_size * 2 pixels). In this case neither
 * pipelining nor prefetch is used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst         W, #pixblock_size
    beq         1f
    pixld       pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst       pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */

    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      ORIG_W
    .unreq      W
    .unreq      H
    .unreq      SRC_STRIDE
    .unreq      DST_STRIDE
    .unreq      MASK_STRIDE
    .unreq      PF_CTL
    .unreq      PF_X
    .unreq      PF_SRC
    .unreq      PF_DST
    .unreq      PF_MASK
    .unreq      DUMMY
    .endfunc
.endm

/*
 * A simplified variant of the function generation template, for
 * processing a single scanline (used for implementing the pixman
 * combine functions)
 */
.macro generate_composite_function_scanline        use_nearest_scaling, \
                                                   fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags, \
                                                   pixblock_size_, \
                                                   init, \
                                                   cleanup, \
                                                   process_pixblock_head, \
                                                   process_pixblock_tail, \
                                                   process_pixblock_tail_head, \
                                                   dst_w_basereg_ = 28, \
                                                   dst_r_basereg_ = 4, \
                                                   src_basereg_   = 0, \
                                                   mask_basereg_  = 24

    .func fname
    .global fname
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

.if use_nearest_scaling != 0
    /*
     * Assign symbolic names to registers for nearest scaling
     */
    W           .req        r0
    DST_W       .req        r1
    SRC         .req        r2
    VX          .req        r3
    UNIT_X      .req        ip
    MASK        .req        lr
    TMP1        .req        r4
    TMP2        .req        r5
    DST_R       .req        r6
    SRC_WIDTH_FIXED .req        r7

    .macro pixld_src x:vararg
        pixld_s x
    .endm

    ldr         UNIT_X, [sp]
    push        {r4-r8, lr}
    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
    .if mask_bpp != 0
    ldr         MASK, [sp, #(24 + 8)]
    .endif
.else
    /*
     * Assign symbolic names to registers
     */
    W           .req        r0      /* width (is updated during processing) */
    DST_W       .req        r1      /* destination buffer pointer for writes */
    SRC         .req        r2      /* source buffer pointer */
    DST_R       .req        ip      /* destination buffer pointer for reads */
    MASK        .req        r3      /* mask pointer */

    .macro pixld_src x:vararg
        pixld x
    .endm
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

    .macro fetch_src_pixblock
        pixld_src   pixblock_size, src_bpp, \
                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm

    init
    mov         DST_R, DST_W

    cmp         W, #pixblock_size
    blt         8f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs        W, W, #pixblock_size
    blt         7f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs        W, W, #pixblock_size
    blt         2f
1:
    process_pixblock_tail_head
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
.if use_nearest_scaling != 0
    pop         {r4-r8, pc}  /* exit */
.else
    bx          lr  /* exit */
.endif
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup

.if use_nearest_scaling != 0
    pop         {r4-r8, pc}  /* exit */

    .unreq      DST_R
    .unreq      SRC
    .unreq      W
    .unreq      VX
    .unreq      UNIT_X
    .unreq      TMP1
    .unreq      TMP2
    .unreq      DST_W
    .unreq      MASK
    .unreq      SRC_WIDTH_FIXED

.else
    bx          lr  /* exit */

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      W
.endif

    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .endfunc
.endm

.macro generate_composite_function_single_scanline x:vararg
    generate_composite_function_scanline 0, x
.endm

.macro generate_composite_function_nearest_scanline x:vararg
    generate_composite_function_scanline 1, x
.endm
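
/*
 * For illustration only, a typical single scanline invocation looks
 * roughly like the following sketch (modeled on pixman-arm-neon-asm.S;
 * the *_head, *_tail and *_tail_head macro names stand for the pixel
 * processing chunks which the user must define):
 *
 *  generate_composite_function_single_scanline \
 *      pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
 *      FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *      8,  (pixblock size) \
 *      default_init, \
 *      default_cleanup, \
 *      over_8888_8888_process_pixblock_head, \
 *      over_8888_8888_process_pixblock_tail, \
 *      over_8888_8888_process_pixblock_tail_head
 */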

/* Default prologue/epilogue, nothing special needs to be done */

.macro default_init
.endm

.macro default_cleanup
.endm

/*
 * Prologue/epilogue variant which additionally saves/restores the d8-d15
 * registers (they need to be saved/restored by the callee according to
 * the ABI). This is required if the code needs to use all the NEON
 * registers.
 */

.macro default_init_need_all_regs
    vpush       {d8-d15}
.endm

.macro default_cleanup_need_all_regs
    vpop        {d8-d15}
.endm

/******************************************************************************/

/*
 * Conversion of eight r5g6b5 pixels packed in a 128-bit register (in)
 * into planar a8r8g8b8 format (with the a, r, g, b color components
 * stored in the 64-bit registers out_a, out_r, out_g, out_b respectively).
 *
 * Warning: the conversion is destructive and the original
 *          value (in) is lost.
 */
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
    vshrn.u16   out_r, in,    #8
    vshrn.u16   out_g, in,    #3
    vsli.u16    in,    in,    #5
    vmov.u8     out_a, #255
    vsri.u8     out_r, out_r, #5
    vsri.u8     out_g, out_g, #6
    vshrn.u16   out_b, in,    #2
.endm
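
/*
 * Usage example (sketch): with eight r5g6b5 pixels in q0 (d0/d1),
 * 'convert_0565_to_8888 q0, d31, d30, d29, d28' leaves the a, r, g, b
 * planes in d31, d30, d29 and d28. The low bits of each widened
 * component are filled with its top bits (via the vsri/vsli tricks),
 * so that e.g. the maximal 5-bit value 0x1f expands to 0xff rather
 * than 0xf8.
 */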

.macro convert_0565_to_x888 in, out_r, out_g, out_b
    vshrn.u16   out_r, in,    #8
    vshrn.u16   out_g, in,    #3
    vsli.u16    in,    in,    #5
    vsri.u8     out_r, out_r, #5
    vsri.u8     out_g, out_g, #6
    vshrn.u16   out_b, in,    #2
.endm

/*
 * Conversion from planar x8r8g8b8 format (with the r, g, b color
 * components in the 64-bit registers in_r, in_g, in_b respectively; any
 * alpha component is simply ignored) into eight r5g6b5 pixels packed in
 * a 128-bit register (out). Requires two temporary 128-bit registers
 * (tmp1, tmp2).
 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
    vshll.u8    tmp1, in_g, #8
    vshll.u8    out, in_r, #8
    vshll.u8    tmp2, in_b, #8
    vsri.u16    out, tmp1, #5
    vsri.u16    out, tmp2, #11
.endm
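
/*
 * Usage example (sketch): 'convert_8888_to_0565 d2, d1, d0, q14, q15, q13'
 * packs the r (d2), g (d1) and b (d0) planes of eight pixels into eight
 * r5g6b5 pixels in q14; each component simply keeps its top 5 or 6 bits.
 */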

/*
 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
 * returned in the (out0, out1) register pair. Requires one temporary
 * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
 * value of 'in' is lost.
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
    vshl.u16    out0, in,   #5  /* G top 6 bits */
    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
    vsri.u16    in,   in,   #5  /* R is ready in top bits */
    vsri.u16    out0, out0, #6  /* G is ready in top bits */
    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
    vshr.u16    out1, in,   #8  /* R is in place */
    vsri.u16    out0, tmp,  #8  /* G & B is in place */
    vzip.u16    out0, out1      /* everything is in place */
.endm
1197