/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>

#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)

        .text
        .fpu    neon

#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE     32
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*2)
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
#endif

ENTRY(memcpy)
        .save       {r0, lr}
        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        stmfd       sp!, {r0, lr}
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

/* If NEON supports unaligned access, the alignment code below is compiled
 * out, unless a size limit has been specified.
 */
#ifndef NEON_UNALIGNED_ACCESS
        /* do we have at least 16 bytes to copy? (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* check if buffers are aligned. If so, run arm-only version */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         11f

        /* align destination to cache-line for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         2f

        /* copy up to 15 bytes (count in r3) */
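        /* The low bits of the count are tested via the flags: "movs ip, r3,
         * lsl #31" puts bit 0 of r3 into N and bit 1 into C, so the
         * mi-conditional pair copies one byte and the cs-conditional pairs
         * copy two. The later "movs ip, r3, lsl #29" does the same for
         * bit 2 (N, 4-byte copy) and bit 3 (C, 8-byte copy). Roughly, in C:
         *     if (r3 & 1) copy 1 byte;
         *     if (r3 & 2) copy 2 bytes;
         *     if (r3 & 4) copy 4 bytes;   // vld4.8/vst4.8 lane copy
         *     if (r3 & 8) copy 8 bytes;   // vld1.8/vst1.8 of d0
         */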
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* immediately preload the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

#ifdef HAVE_32_BYTE_CACHE_LINE
        /* make sure we have at least 32 bytes to copy */
        subs        r2, r2, #32
        blo         4f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 32 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0, :128]!
        bhs         1b
#else
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need. */
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 3)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
#else
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
#endif
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0, :128]!
        vst1.8      {d4 - d7}, [r0, :128]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0, :128]!
        bhs         3b
#endif
4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!
5:      /* copy up to 15 bytes (count in r2) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}
        bx          lr

#else   /* NEON_UNALIGNED_ACCESS */

        // Check that we have at least 16 bytes to copy, needed for the alignment code.
        cmp         r2, #16
        blo         5f

#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
        /* Check the upper size limit for Neon unaligned memory access in memcpy */
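        /* Copies smaller than NEON_MEMCPY_ALIGNMENT_DIVIDER skip the alignment
         * code below and use unaligned NEON accesses directly; larger copies
         * are aligned first (or handed to the arm-only version when source
         * and destination are mutually aligned).
         */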
#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
        cmp         r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
        blo         3f
#endif
        /* check if buffers are aligned. If so, run arm-only version */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         11f

        /* align destination to 16 bytes for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         3f

        /* copy up to 15 bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* immediately preload the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]
3:
#endif
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need */
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 3)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
#else
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
#endif
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* copy up to 15 bytes (count in r2) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}
        bx          lr
#endif  /* NEON_UNALIGNED_ACCESS */
11:
        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4, r5, r6, r7, r8}
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Check alignment */
        rsb         r3, r1, #0
        ands        r3, #3
        beq         2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r5, [r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r5, [r0], #1
2:
        subs        r2, #32
        blt         5f
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
3:      /* Main copy loop, copying 32 bytes at a time */
        pld         [r1, #(CACHE_LINE_SIZE * 4)]
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #32
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        bge         3b
5:      /* Handle any remaining bytes */
        adds        r2, #32
        beq         6f

        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        ldmmiia     r1!, {r7, r8}           /*  8 bytes */
        stmcsia     r0!, {r3, r4, r5, r6}
        stmmiia     r0!, {r7, r8}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /*  last byte  */
        strneb      r3, [r0]
6:
        ldmfd       sp!, {r4, r5, r6, r7, r8}
        ldmfd       sp!, {r0, pc}
END(memcpy)


#else   /* !__ARM_NEON__ || ARCH_ARM_USE_NON_NEON_MEMCPY */


        /*
         * Optimized memcpy() for ARM.
         *
         * Note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

ENTRY(memcpy)
        /* The stack must always be 64-bit aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Make room for r5-r11, which will be spilled later */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        pld         [r0, #0]
        pld         [r1, #0]
        pld         [r1, #32]

        /* it simplifies things to take care of len<4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
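        /* e.g. src & 3 == 1 gives offset 3, src & 3 == 3 gives offset 1, and
         * an already-aligned source (src & 3 == 0) gives offset 0, since
         * (-src) & 3 == (4 - (src & 3)) & 3.
         */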
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12,[r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10,[r1], #4            /*  4 bytes */
        strne       r10,[r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here the destination is aligned to a 32-byte cache line
         * (the source is word-aligned, but not necessarily cache-line aligned).
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, { r4-r11 }
        pld         [r12, #64]
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 bytes ahead of r1, the following ldrhi
        // (the ARM9 preload) will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only way to get a SIGSEGV here is if
        // the caller overstates the length.
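        // The explicit load (rather than another pld) forces an actual
        // linefill on cores where pld does little or nothing (e.g. the
        // ARM926); the loaded value itself is discarded.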
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
        bhs         1b

        add         r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (when it does run, the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /*  last byte  */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
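        /* Rough sketch of the technique, in C (little-endian), assuming the
         * destination still needs n bytes (1..3) to become word aligned:
         *
         *     uint32_t carry = *src++;        // first aligned source word
         *     // write the n low bytes of carry; dst is now word aligned
         *     carry >>= 8 * n;
         *     while (enough bytes remain) {
         *         uint32_t w = *src++;
         *         *dst++ = carry | (w << (32 - 8 * n));
         *         carry = w >> (8 * n);
         *     }
         *     // then flush the remaining bytes held in carry
         */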
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3     /* r12 = right */
        rsb         lr, r12, #32        /* lr = left  */
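        /* e.g. if the destination needs one more byte to be word aligned,
         * r5 = 1, r12 = 8 and lr = 24: each output word then takes its low
         * three bytes from the previous source word and its top byte from
         * the new one (see the orr/mov pairs in the loops below).
         */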

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to a 50%
         * performance hit.
         */
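        /* Since r12 can only be 8, 16 or 24 here, the loop is simply
         * replicated once per shift amount (loop8, loop16, loop24), each
         * using immediate shifts.
         */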

        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8

loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #16
        mov         r4, r4,         lsr #16
        orr         r4, r4, r5,     lsl #16
        mov         r5, r5,         lsr #16
        orr         r5, r5, r6,     lsl #16
        mov         r6, r6,         lsr #16
        orr         r6, r6, r7,     lsl #16
        mov         r7, r7,         lsr #16
        orr         r7, r7, r8,     lsl #16
        mov         r8, r8,         lsr #16
        orr         r8, r8, r9,     lsl #16
        mov         r9, r9,         lsr #16
        orr         r9, r9, r10,    lsl #16
        mov         r10, r10,       lsr #16
        orr         r10, r10, r11,  lsl #16
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #24
        mov         r4, r4,         lsr #8
        orr         r4, r4, r5,     lsl #24
        mov         r5, r5,         lsr #8
        orr         r5, r5, r6,     lsl #24
        mov         r6, r6,         lsr #8
        orr         r6, r6, r7,     lsl #24
        mov         r7, r7,         lsr #8
        orr         r7, r7, r8,     lsl #24
        mov         r8, r8,         lsr #8
        orr         r8, r8, r9,     lsl #24
        mov         r9, r9,         lsr #8
        orr         r9, r9, r10,    lsl #24
        mov         r10, r10,       lsr #8
        orr         r10, r10, r11,  lsl #24
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4,     lsl #8
        mov         r4, r4,         lsr #24
        orr         r4, r4, r5,     lsl #8
        mov         r5, r5,         lsr #24
        orr         r5, r5, r6,     lsl #8
        mov         r6, r6,         lsr #24
        orr         r6, r6, r7,     lsl #8
        mov         r7, r7,         lsr #24
        orr         r7, r7, r8,     lsl #8
        mov         r8, r8,         lsr #24
        orr         r8, r8, r9,     lsl #8
        mov         r9, r9,         lsr #24
        orr         r9, r9, r10,    lsl #8
        mov         r10, r10,       lsr #24
        orr         r10, r10, r11,  lsl #8
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #24
        bhs         1b

less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it  */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs        r5, lr, lsl #(31-3)
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        movs        r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12,[r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp,  sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
END(memcpy)


#endif    /* __ARM_NEON__ && !ARCH_ARM_USE_NON_NEON_MEMCPY */