/* pixman-arm-simd-asm-scaled.S revision 1176bdada62cabc6ec4b0308a930e83b679d5d36 */
1/*
2 * Copyright © 2008 Mozilla Corporation
3 * Copyright © 2010 Nokia Corporation
4 *
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Mozilla Corporation not be used in
10 * advertising or publicity pertaining to distribution of the software without
11 * specific, written prior permission.  Mozilla Corporation makes no
12 * representations about the suitability of this software for any purpose.  It
13 * is provided "as is" without express or implied warranty.
14 *
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22 * SOFTWARE.
23 *
24 * Author:  Jeff Muizelaar (jeff@infidigm.net)
25 *
26 */
27
28/* Prevent the stack from becoming executable */
/* (an empty .note.GNU-stack section marks this object as not needing an
 * executable stack; it is only emitted for Linux ELF targets) */
29#if defined(__linux__) && defined(__ELF__)
30.section .note.GNU-stack,"",%progbits
31#endif
32
33	.text
34	.arch armv6		/* architecture whose instructions the assembler accepts here */
35	.object_arch armv4	/* architecture recorded in the object file attributes instead */
36	.arm			/* assemble 32-bit ARM (not Thumb) instructions */
37	.altmacro		/* alternate macro mode: the macros below name their arguments without a backslash */
38	.p2align 2		/* align to 2^2 = 4 bytes */
39
40/* Supplementary macro for setting function attributes */
/*
 * pixman_asm_function fname
 *
 * Opens the definition of an assembly function called fname: it emits the
 * symbol directives below followed by the "fname:" label, so the
 * instructions written after the macro invocation become the function body.
 *
 * The macro issues .func but no matching .endfunc; the code that invokes
 * the macro has to close the function with .endfunc itself.
 */
41.macro pixman_asm_function fname
42	.func fname		/* start of function, for debug information */
43	.global fname		/* make the symbol visible to the linker */
44#ifdef __ELF__
45	.hidden fname		/* ELF only: give the symbol hidden visibility */
46	.type fname, %function	/* ELF only: mark the symbol as a function */
47#endif
48fname:
49.endm
50
51/*
52 * Note: This code uses only armv5te instructions (not even armv6),
53 *       but is scheduled for the ARM Cortex-A8 pipeline. So it might need to
54 *       be split into a few variants, tuned for each microarchitecture.
55 *
56 * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
57 * have efficient write combining), it needs to be changed to use 16-byte
58 * aligned writes using the STM instruction.
59 *
60 * The nearest scanline scaler macro template uses the following arguments:
61 *  fname                     - name of the function to generate
62 *  bpp_shift                 - (1 << bpp_shift) is the size of a pixel in bytes
63 *  t                         - type suffix appended to the LDR/STR mnemonics
 *                              (the instantiations in this file pass "h" or
 *                              leave it empty)
64 *  prefetch_distance         - prefetch that many pixels ahead in the
65 *                              source image
66 *  prefetch_braking_distance - stop prefetching when that many pixels are
67 *                              remaining before the end of scanline
 *
 * Arguments of each generated function, as read by the code below:
 *  r0       (W)               - number of destination pixels to write
 *  r1       (DST)             - destination pointer, advanced by one pixel
 *                               per store
 *  r2       (SRC)             - source base pointer; pixels are loaded from
 *                               SRC + byte offset derived from VX
 *  r3       (VX)              - source x position; the pixel index is
 *                               VX >> 16, i.e. the low 16 bits are a fraction
 *  [sp]     (UNIT_X)          - step added to VX for every destination pixel
 *  [sp, #4] (SRC_WIDTH_FIXED) - subtracted from VX, repeatedly, for as long
 *                               as VX is non-negative after a step
 *
 * Because of that wrap rule VX is negative after every step, so every load
 * uses a negative offset from SRC.
 * NOTE(review): presumably the caller biases SRC and VX accordingly - confirm
 * against the C caller. If SRC_WIDTH_FIXED is zero and VX becomes
 * non-negative, the wrap loop never terminates.
68 */
69
70.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
71                                      prefetch_distance,        \
72                                      prefetch_braking_distance
73
74pixman_asm_function fname
75	W		.req	r0	/* pixel counter (biased while the loops run) */
76	DST		.req	r1	/* destination write pointer */
77	SRC		.req	r2	/* source base pointer */
78	VX		.req	r3	/* current source x, 16 fractional bits */
79	UNIT_X		.req	ip	/* per-pixel increment of VX */
80	TMP1		.req	r4	/* byte offset of the next pixel / loaded pixel */
81	TMP2		.req	r5	/* same role as TMP1, used alternately */
82	VXMASK		.req	r6	/* clears the sub-pixel bits of a byte offset */
83	PF_OFFS		.req	r7	/* x position used for prefetching */
84	SRC_WIDTH_FIXED	.req	r8	/* amount subtracted from VX when wrapping */
85
	/* UNIT_X is the word at sp on entry; read it before the push moves sp. */
86	ldr	UNIT_X, [sp]
	/* Save r4-r8, which the aliases above use. r10 is not referenced
	 * anywhere in this macro. NOTE(review): presumably it is pushed only to
	 * make the saved area 24 bytes (a multiple of 8) - confirm. */
87	push	{r4, r5, r6, r7, r8, r10}
	/* VXMASK = ~(pixel size in bytes - 1) */
88	mvn	VXMASK, #((1 << bpp_shift) - 1)
	/* 24 bytes were just pushed, so #28 is the word at [sp, #4] on entry. */
89	ldr	SRC_WIDTH_FIXED, [sp, #28]
90
91	/* define helper macro */
	/*
	 * scale_2_pixels copies two pixels from SRC to DST.
	 *
	 * On entry TMP1 holds the byte offset (relative to SRC) of the next
	 * source pixel and VX has already been stepped past that pixel; the
	 * same holds again on exit. Each half:
	 *   - loads the pixel at the pending offset,
	 *   - computes the byte offset of the following pixel into the other
	 *     TMP register: (VX >> 16) << bpp_shift, done as one shift by
	 *     (16 - bpp_shift) followed by masking off the low bits,
	 *   - adds UNIT_X to VX, setting the flags,
	 *   - stores the pixel and post-increments DST by one pixel (the store
	 *     leaves the flags from the preceding adds untouched),
	 *   - subtracts SRC_WIDTH_FIXED from VX while VX is non-negative.
	 */
92	.macro	scale_2_pixels
93		ldr&t	TMP1, [SRC, TMP1]
94		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift)
95		adds	VX, VX, UNIT_X
96		str&t	TMP1, [DST], #(1 << bpp_shift)
979:		subpls	VX, VX, SRC_WIDTH_FIXED
98		bpl	9b
99
100		ldr&t	TMP2, [SRC, TMP2]
101		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
102		adds	VX, VX, UNIT_X
103		str&t	TMP2, [DST], #(1 << bpp_shift)
1049:		subpls	VX, VX, SRC_WIDTH_FIXED
105		bpl	9b
106	.endm
107
108	/* now do the scaling */
	/* Prime the pipeline: TMP1 = byte offset of the first pixel, then step
	 * and wrap VX once so that scale_2_pixels finds its entry state. */
109	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
110	adds	VX, VX, UNIT_X
1119:	subpls	VX, VX, SRC_WIDTH_FIXED
112	bpl	9b
	/* Use the prefetching loop only if at least
	 * 8 + prefetch_braking_distance pixels are to be written; W stays
	 * biased by that amount while the loop runs. */
113	subs	W, W, #(8 + prefetch_braking_distance)
114	blt	2f
115	/* calculate prefetch offset */
	/* PF_OFFS = UNIT_X * prefetch_distance + VX */
116	mov	PF_OFFS, #prefetch_distance
117	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
1181:	/* main loop, process 8 pixels per iteration with prefetch */
119	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)]
	/* advance the prefetch position by 8 pixels (UNIT_X << 3); unlike VX,
	 * PF_OFFS is not wrapped by SRC_WIDTH_FIXED */
120	add	PF_OFFS, UNIT_X, lsl #3
121	scale_2_pixels
122	scale_2_pixels
123	scale_2_pixels
124	scale_2_pixels
125	subs	W, W, #8
126	bge	1b
1272:
	/* The immediate is negative, so this adds 4 + prefetch_braking_distance
	 * back: W becomes (pixels remaining - 4). */
128	subs	W, W, #(4 - 8 - prefetch_braking_distance)
129	blt	2f
1301:	/* process the remaining pixels */
	/* 4 pixels per iteration, without prefetch */
131	scale_2_pixels
132	scale_2_pixels
133	subs	W, W, #4
134	bge	1b
1352:
	/* W now differs from the number of pixels still to write by a multiple
	 * of 4, so its two low bits tell whether 2 and/or 1 pixels are left. */
136	tst	W, #2
137	beq	2f
138	scale_2_pixels
1392:
	/* last odd pixel: TMP1 already holds its byte offset; DST is not
	 * advanced any further */
140	tst	W, #1
141	ldrne&t	TMP1, [SRC, TMP1]
142	strne&t	TMP1, [DST]
143	/* cleanup helper macro */
	/* (scale_2_pixels and the register aliases are removed again at the end
	 * of every expansion of this template) */
144	.purgem	scale_2_pixels
145	.unreq	DST
146	.unreq	SRC
147	.unreq	W
148	.unreq	VX
149	.unreq	UNIT_X
150	.unreq	TMP1
151	.unreq	TMP2
152	.unreq	VXMASK
153	.unreq	PF_OFFS
154	.unreq  SRC_WIDTH_FIXED
155	/* return */
	/* restore the registers saved on entry, then return to the caller */
156	pop	{r4, r5, r6, r7, r8, r10}
157	bx	lr
158.endfunc
159.endm
160
/* 2-byte pixels (bpp_shift = 1) moved with ldrh/strh (t = h);
 * prefetch_distance = 80 pixels, prefetch_braking_distance = 32 pixels. */
161generate_nearest_scanline_func \
162    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
163
/* 4-byte pixels (bpp_shift = 2) moved with plain ldr/str (t left empty);
 * prefetch_distance = 48 pixels, prefetch_braking_distance = 32 pixels. */
164generate_nearest_scanline_func \
165    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32
166