/* memcpy.S revision 4e468ed2eb86a2406e14f1eca82072ee501d05fd */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

        .text

        .global memcpy
        .type memcpy, %function
        .align 4

        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

memcpy:
        /* The stack must always be 64-bit aligned to comply with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .fnstart
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Make room for r5-r11, which will be spilled later. */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
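        /* e.g. if src & 3 == 1, the offset is 3 and three single-byte copies
         * below bring the source up to the next word boundary.
         */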
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
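        /* The movs below puts bit 0 of the offset into N and bit 1 into C,
         * so the "mi" load/store pair copies one byte and the two "cs" pairs
         * copy two more, covering offsets of 1, 2 or 3 bytes.
         */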
        movs        r12, r3, lsl #31
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12,[r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
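        /* The shift below moves the 16-byte bit of r3 into C and the 8-byte
         * bit into N, so the "cs" ldm/stm pair copies 16 bytes, the "mi" pair
         * copies 8, and the tst below handles the final 4.
         */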
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10,[r1], #4            /*  4 bytes */
        strne       r10,[r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here the destination is aligned to a cache line (32 bytes).
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64
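        // r12 now points to the start of the cache line after the one r1 is
        // in, i.e. 33 to 64 bytes ahead of the current read position.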

1:      ldmia       r1!, { r4-r11 }
        PLD         (r12, #64)
        subs        r2, r2, #32

        // NOTE: if r12 gets more than 64 bytes ahead of r1, the ldrhi below
        // (used as a cheap ARM9 preload) is no longer safely guarded by the
        // preceding subs. When it is safely guarded, the only way to get a
        // SIGSEGV here is for the caller to overstate the length.
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
        bhs         1b

        add         r2, r2, #32




less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (when it does run, the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
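        /* Same flag trick as above: lsl #28 puts the 16-byte bit in C and the
         * 8-byte bit in N; the second movs (lsl #30) does the same for the
         * 4-byte and 2-byte bits, and the final tst handles the last byte.
         */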
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /*  last byte  */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3     /* r12 = right */
        rsb         lr, r12, #32        /* lr = left  */
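        /* For example, if the destination is 1 byte past a word boundary,
         * r5 = 3, r12 = 24 and lr = 8: after the 3-byte partial store below,
         * one byte stays queued in r3, and each output word combines that
         * byte with the next source word shifted left by lr bits.
         */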

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that the destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for
         * alignment)
         */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could cost up to 50% in performance.
         */
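        /* One unrolled loop per possible alignment: r12 is 8, 16 or 24, so
         * we dispatch to loop8, loop16 or loop24 and use those shift amounts
         * as immediates inside the loop.
         */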

        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8

loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #16
        mov         r4, r4,         lsr #16
        orr         r4, r4, r5,     lsl #16
        mov         r5, r5,         lsr #16
        orr         r5, r5, r6,     lsl #16
        mov         r6, r6,         lsr #16
        orr         r6, r6, r7,     lsl #16
        mov         r7, r7,         lsr #16
        orr         r7, r7, r8,     lsl #16
        mov         r8, r8,         lsr #16
        orr         r8, r8, r9,     lsl #16
        mov         r9, r9,         lsr #16
        orr         r9, r9, r10,    lsl #16
        mov         r10, r10,       lsr #16
        orr         r10, r10, r11,  lsl #16
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #24
        mov         r4, r4,         lsr #8
        orr         r4, r4, r5,     lsl #24
        mov         r5, r5,         lsr #8
        orr         r5, r5, r6,     lsl #24
        mov         r6, r6,         lsr #8
        orr         r6, r6, r7,     lsl #24
        mov         r7, r7,         lsr #8
        orr         r7, r7, r8,     lsl #24
        mov         r8, r8,         lsr #8
        orr         r8, r8, r9,     lsl #24
        mov         r9, r9,         lsr #8
        orr         r9, r9, r10,    lsl #24
        mov         r10, r10,       lsr #8
        orr         r10, r10, r11,  lsl #24
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #8
        mov         r4, r4,         lsr #24
        orr         r4, r4, r5,     lsl #8
        mov         r5, r5,         lsr #24
        orr         r5, r5, r6,     lsl #8
        mov         r6, r6,         lsr #24
        orr         r6, r6, r7,     lsl #8
        mov         r7, r7,         lsr #24
        orr         r7, r7, r8,     lsl #8
        mov         r8, r8,         lsr #24
        orr         r8, r8, r9,     lsl #8
        mov         r9, r9,         lsr #24
        orr         r9, r9, r10,    lsl #8
        mov         r10, r10,       lsr #24
        orr         r10, r10, r11,  lsl #8
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #24
        bhs         1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it  */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs        r5, lr, lsl #(31-3)
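        /* lr/8 is the number of bytes still queued in r3, so the shift above
         * puts that count's bit 0 into N and bit 1 into C, and the stores
         * below write out the 1, 2 or 3 remaining bytes accordingly.
         */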
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        movs        r2, r2, lsl #31     /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12,[r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp,  sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
        .fnend