/* pixman-android-neon.S revision 8feda42074db4786f308efdca56a54e1366398f1 */
/*
 * Copyright © 2013 The Android Open Source Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * Assembler setup.
 *
 * On Linux/ELF an empty .note.GNU-stack section is emitted first; this is
 * the GNU toolchain convention for declaring that the object does not need
 * an executable stack.
 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

/*
 * Code is assembled as ARM (not Thumb) for ARMv7-A with NEON enabled, while
 * the object file itself is tagged as armv4 and the FP/SIMD build attributes
 * are cleared. The macros in this file rely on .altmacro syntax (bare
 * parameter names and the & concatenation operator).
 */
    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-private.h"
#include "pixman-arm-neon-asm.h"

/*
 * Configuration symbols. They are not referenced by the code visible in this
 * file; presumably they are consumed by macros from pixman-arm-neon-asm.h
 * (TODO confirm against that header).
 */
.set RESPECT_STRICT_ALIGNMENT, 1
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Values for the flags argument of generate_bilinear_scanline_func.
 * The generator tests them with a bitwise AND, so BILINEAR_FLAG_UNROLL_4
 * (zero) simply means that neither of the other two bits is set.
 */
.set BILINEAR_FLAG_UNROLL_4,          0
.set BILINEAR_FLAG_UNROLL_8,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*
 * Supplementary macro for setting function attributes.
 *
 * Opens a .func scope for fname, exports it as a global symbol and, when
 * building for ELF, additionally marks it hidden and of type %function.
 * The entry label itself is emitted last. The user of this macro is
 * responsible for the matching .endfunc.
 *
 *   fname - name of the function symbol to define
 */
.macro pixman_asm_function fname
    .func fname
    .global fname
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
.endm

/*
 * Fetch the source data needed for one output pixel of a 32 bpp source.
 *
 *   reg1 - receives the data loaded at TOP + (X >> 16) * 4
 *   reg2 - receives the data loaded STRIDE bytes further on
 *   tmp  - unused here; kept so all loaders share one argument list
 *
 * The callers in this file pass D registers, so each vld1.32 fetches two
 * adjacent 32-bit pixels. Uses the register aliases X, UX, TOP, STRIDE and
 * TMP1 set up by generate_bilinear_scanline_func.
 *
 * Side effects: X is advanced by UX, and TMP1 is left holding the address
 * used for the second load.
 */
.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16              /* integer part of the x position */
    add       X, X, UX                      /* step to the next output pixel  */
    add       TMP1, TOP, TMP1, asl #2       /* byte address, 4 bytes per pixel */
    vld1.32   {reg1}, [TMP1], STRIDE        /* load, then TMP1 += STRIDE       */
    vld1.32   {reg2}, [TMP1]
.endm

/*
 * Load the source data for two consecutive output pixels and blend the two
 * scanlines of each one.
 *
 *   acc1, acc2 - Q registers receiving reg1 * d28 + reg2 * d29 and
 *                reg3 * d28 + reg4 * d29 as widened 16-bit lanes
 *   reg1..reg4 - D registers used as load destinations (first/second row of
 *                the first pixel, then of the second pixel)
 *   tmp1, tmp2 - forwarded to bilinear_load_8888, which ignores them
 *
 * d28 and d29 are expected to hold the per-row weights; the generated
 * function fills them from WT and WB.
 */
.macro bilinear_load_and_vertical_interpolate_two_8888 \
                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

/*
 * Store finished 32 bpp pixels to OUT and advance OUT past them.
 *
 *   numpix     - number of pixels to store: 4 (d0 and d1), 2 (d0) or
 *                1 (lane 0 of d0); any other value aborts assembly
 *   tmp1, tmp2 - unused here; kept so all store macros share one
 *                argument list
 *
 * Each store carries an alignment qualifier matching its size (:128, :64,
 * :32), so OUT must already be aligned accordingly when the macro is used.
 */
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
    vst1.32   {d0, d1}, [OUT, :128]!
.elseif numpix == 2
    vst1.32   {d0}, [OUT, :64]!
.elseif numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    /* GNU as requires the .error message to be a quoted string */
    .error "bilinear_store_8888 numpix is unsupported"
.endif
.endm

/*
 * Produce a single output pixel.
 *
 *   src_fmt - source format suffix, selects bilinear_load_<src_fmt>
 *   dst_fmt - destination format suffix, selects bilinear_store_<dst_fmt>
 *
 * Expects d28/d29 to hold the row weights and d30 the horizontal weight of
 * this pixel. Clobbers q0 and q1.
 *
 * Steps:
 *   1. q1 = row1 * d28 + row2 * d29; d2 then holds the left and d3 the right
 *      neighbour as 16-bit channels.
 *   2. q0 = (d2 << BITS) - d2 * d30 + d3 * d30, i.e. the left neighbour
 *      weighted by ((1 << BITS) - d30) plus the right one weighted by d30,
 *      with BITS = BILINEAR_INTERPOLATION_BITS.
 *   3. The sum is narrowed back to 8 bits per channel (dropping 2 * BITS
 *      fractional bits) and stored.
 */
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
    bilinear_load_&src_fmt d0, d1, d2
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_store_&dst_fmt 1, q2, q3
.endm

/*
 * Produce two output pixels.
 *
 *   src_fmt - source format suffix, selects
 *             bilinear_load_and_vertical_interpolate_two_<src_fmt>
 *   dst_fmt - destination format suffix, selects bilinear_store_<dst_fmt>
 *
 * Expects d28/d29 to hold the row weights and d30/d31 the horizontal
 * weights of the first and second pixel. Clobbers q0, q1, q10 and q11.
 *
 * Besides storing the pixels, the macro prepares the state for whatever
 * follows: q15 is recomputed from q12 and q12 is advanced by q13.
 */
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    /* horizontal blend of the first pixel (weight d30) */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* horizontal blend of the second pixel (weight d31) */
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* weights for the next pair, then advance the x fractions */
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_store_&dst_fmt 2, q2, q3
.endm

/*
 * Produce four output pixels (generic, non-pipelined version).
 *
 *   src_fmt - source format suffix, selects
 *             bilinear_load_and_vertical_interpolate_four_<src_fmt>
 *   dst_fmt - destination format suffix, selects bilinear_store_<dst_fmt>
 *
 * Expects d28/d29 to hold the row weights and d30/d31 the horizontal
 * weights of the first pixel pair. The weights of the second pair are
 * derived in the middle of the macro (q15 recomputed from q12), and q15/q12
 * are updated once more at the end for the code that follows.
 * Clobbers q0-q3 and q8-q11, and issues two prefetch hints at PF_OFFS.
 *
 * NOTE(review): bilinear_load_8888 only writes TMP1, so for the 8888 source
 * format the second pld uses whatever TMP2 (r4) currently holds, and the
 * result of "sub TMP1, TMP1, STRIDE" is not consumed. This looks like it was
 * meant for loaders that also set TMP2 - confirm before changing.
 */
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9,  d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    /* pixels 0 and 1: horizontal blend with weights d30 / d31 */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    /* weights for pixels 2 and 3 */
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    pld       [TMP2, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    /* narrow all four results back to 8 bits per channel */
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* weights for the next group, then advance the x fractions */
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_store_&dst_fmt 4, q2, q3
.endm

/*
 * First stage of the four-pixel main loop.
 *
 * If the symbol have_bilinear_interpolate_four_pixels_<src_fmt>_<dst_fmt>
 * is defined, the format-specific ..._head macro is used. Otherwise the
 * generic bilinear_interpolate_four_pixels is expanded, which processes a
 * complete group of four pixels.
 *
 *   src_fmt, dst_fmt - source and destination format suffixes
 */
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
.else
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
.endif
.endm

/*
 * Final stage of the four-pixel main loop.
 *
 * If the symbol have_bilinear_interpolate_four_pixels_<src_fmt>_<dst_fmt>
 * is defined, the format-specific ..._tail macro is used. Otherwise this
 * expands to nothing, because the generic head / tail_head stages have
 * already completed their pixels.
 *
 *   src_fmt, dst_fmt - source and destination format suffixes
 */
.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
.endif
.endm

/*
 * Steady-state stage of the four-pixel main loop.
 *
 * If the symbol have_bilinear_interpolate_four_pixels_<src_fmt>_<dst_fmt>
 * is defined, the format-specific ..._tail_head macro is used. Otherwise
 * the generic bilinear_interpolate_four_pixels is expanded, which processes
 * a complete group of four pixels.
 *
 *   src_fmt, dst_fmt - source and destination format suffixes
 */
.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
.else
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
.endif
.endm

/*
 * Load and vertically blend the source data for four consecutive output
 * pixels. This is simply two back-to-back expansions of
 * bilinear_load_and_vertical_interpolate_two_8888: the x* arguments are
 * passed to the first expansion (pixels 0-1) and the y* arguments to the
 * second one (pixels 2-3), in the same order as that macro expects them.
 */
.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.endm

/*
 * Emit a complete bilinear scanline scaling function.
 *
 * Macro arguments:
 *   fname             - symbol name of the generated function
 *   src_fmt, dst_fmt  - format suffixes used to pick the load/store and
 *                       interpolation macros
 *   src_bpp_shift     - log2 of the source pixel size in bytes
 *   dst_bpp_shift     - log2 of the destination pixel size in bytes
 *   prefetch_distance - multiplied by UX to form the prefetch offset
 *   flags             - combination of BILINEAR_FLAG_* values
 *
 * Arguments of the generated function:
 *   r0 = OUT     destination pointer
 *   r1 = TOP     first source scanline
 *   r2 = BOTTOM  second source scanline (turned into STRIDE = BOTTOM - TOP)
 *   r3 = WT      weight of the first scanline
 *   four words on the stack at entry, in this order:
 *        WB      weight of the second scanline
 *        X       start x position, 16.16 fixed point
 *        UX      x increment per output pixel, 16.16 fixed point
 *        WIDTH   number of pixels to produce
 *
 * NEON register roles:
 *   d28, d29 - WT and WB replicated to every byte lane
 *   q12      - low 16 bits (fraction) of the x position for upcoming pixels
 *   q13      - increment applied to q12
 *   q15      - q12 >> (16 - BILINEAR_INTERPOLATION_BITS): horizontal weights
 *
 * r4-r9 are saved and restored; d8-d15 as well when
 * BILINEAR_FLAG_USE_ALL_NEON_REGS is set. r3 and r4 double as TMP1/TMP2
 * once WT and WB have been copied into d28/d29.
 */
.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
                                       src_bpp_shift, dst_bpp_shift, \
                                       prefetch_distance, flags

pixman_asm_function fname
    OUT       .req      r0
    TOP       .req      r1
    BOTTOM    .req      r2
    WT        .req      r3
    WB        .req      r4
    X         .req      r5
    UX        .req      r6
    WIDTH     .req      ip
    TMP1      .req      r3
    TMP2      .req      r4
    PF_OFFS   .req      r7
    TMP3      .req      r8
    TMP4      .req      r9
    STRIDE    .req      r2

    /* keep the entry sp in ip so the stacked arguments stay reachable */
    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    /* WIDTH aliases ip, so this load also replaces the saved entry sp */
    ldmia     ip, {WB, X, UX, WIDTH}
    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpush     {d8-d15}
.endif

    /* r2 now holds the byte distance between the two scanlines */
    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    /* nothing to do for a non-positive width */
    cmp       WIDTH, #0
    ble       3f

    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    /* d24 = fraction of pixel 0, d25 = fraction of pixel 1 */
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment  */
    /* one pixel if OUT is not aligned to two pixels */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #1
0:
    /* from here on pixels are handled in pairs: step q12 by 2 * UX */
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    /* two pixels if OUT is not aligned to four pixels */
    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    beq       0f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #2
0:
.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
    /* four pixels if OUT is not aligned to eight pixels */
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    beq       0f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #4
0:
    subs      WIDTH, WIDTH, #8
    blt       1f
    /* convert the 16.16 prefetch distance into a byte offset */
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    blt       5f
0:
    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    bge       0b
5:
    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1:
    /*
     * WIDTH is negative here, but its low three bits still equal the
     * number of pixels left, so they can be tested individually.
     */
    tst       WIDTH, #4
    beq       2f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
    subs      WIDTH, WIDTH, #4
    blt       1f
    /* convert the 16.16 prefetch distance into a byte offset */
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    blt       5f
0:
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    bge       0b
5:
    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1:
/****************************************************/
.endif
    /* handle the remaining trailing pixels */
    /*
     * WIDTH is negative here, but its low two bits still equal the number
     * of pixels left, so they can be tested individually.
     */
    tst       WIDTH, #2
    beq       2f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif
    pop       {r4, r5, r6, r7, r8, r9}
    bx        lr

    /* BOTTOM was already released above, right after STRIDE was computed */
    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.endfunc

.endm

/*
 * Instantiate the scanline function for a 32 bpp source and a 32 bpp
 * destination: both bpp shifts are 2 (4 bytes per pixel), the prefetch
 * distance is 28, and the main loop processes 4 pixels per iteration
 * without using d8-d15.
 */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4
