/* memcpy_base.S — revision 5f45d583b0cfb4f7bed1447e8eed003a529cc69e */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This code assumes it is running on a processor that supports all arm v7
 * instructions, that supports neon instructions, and that has a 32 byte
 * cache line.
 */

35        // Check so divider is at least 16 bytes, needed for alignment code.
36        cmp         r2, #16
37        blo         5f
38
39
40        /* check if buffers are aligned. If so, run arm-only version */
41        eor         r3, r0, r1
42        ands        r3, r3, #0x3
43        beq         11f
44
45        /* Check the upper size limit for Neon unaligned memory access in memcpy */
46        cmp         r2, #224
47        blo         3f
48
49        /* align destination to 16 bytes for the write-buffer */
50        rsb         r3, r0, #0
51        ands        r3, r3, #0xF
52        beq         3f
53
54        /* copy up to 15-bytes (count in r3) */
55        sub         r2, r2, r3
56        movs        ip, r3, lsl #31
57        itt         mi
58        ldrbmi      lr, [r1], #1
59        strbmi      lr, [r0], #1
60        itttt       cs
61        ldrbcs      ip, [r1], #1
62        ldrbcs      lr, [r1], #1
63        strbcs      ip, [r0], #1
64        strbcs      lr, [r0], #1
65        movs        ip, r3, lsl #29
66        bge         1f
67        // copies 4 bytes, destination 32-bits aligned
68        vld1.32     {d0[0]}, [r1]!
69        vst1.32     {d0[0]}, [r0, :32]!
701:      bcc         2f
71        // copies 8 bytes, destination 64-bits aligned
72        vld1.8      {d0}, [r1]!
73        vst1.8      {d0}, [r0, :64]!
742:
75        /* preload immediately the next cache line, which we may need */
76        pld         [r1, #0]
77        pld         [r1, #(32 * 2)]
3:
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need */
        pld         [r1, #(32 * 4)]
        pld         [r1, #(32 * 6)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(32 * 6)]     // keep prefetching ahead of the loads
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* copy up to 15-bytes (count in r2) */
        movs        ip, r2, lsl #29     // shift flags trick: C = bit 3, N = bit 2 of r2
        bcc         1f
        vld1.8      {d0}, [r1]!         // 8 bytes
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld1.32     {d0[0]}, [r1]!      // 4 bytes
        vst1.32     {d0[0]}, [r0]!
2:      movs        ip, r2, lsl #31     // N = bit 0, C = bit 1 of r2
        itt         mi
        ldrbmi      r3, [r1], #1        // last 1 byte
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1        // 2 bytes
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1

        // NOTE(review): pops the {r0, lr} frame assumed pushed by the includer;
        // r0 is restored to the original dst (memcpy's return value).
        ldmfd       sp!, {r0, lr}
        bx          lr
11:
        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4, r5, r6, r7, r8}
        pld         [r1, #(32 * 4)]

        /* Check alignment */
        rsb         r3, r1, #0
        ands        r3, #3              // r3 = bytes to reach 32-bit source alignment
        beq         2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31    // N = bit 0, C = bit 1 of r3
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        itt         mi
        ldrbmi      r3, [r1], #1        // 1 byte
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      r4, [r1], #1        // 2 bytes
        ldrbcs      r5, [r1], #1
        strbcs      r4, [r0], #1
        strbcs      r5, [r0], #1

2:
        subs        r2, r2, #64
        blt         4f

3:      /* Main copy loop, copying 64 bytes at a time */
        pld         [r1, #(32 * 8)]
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #64
        bge         3b

4:      /* Check if there are > 32 bytes left */
        adds        r2, r2, #64
        subs        r2, r2, #32
        blt         5f

        /* Copy 32 bytes */
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, #32

5:      /* Handle any remaining bytes */
        adds        r2, #32
        beq         6f

        movs        r12, r2, lsl #28    // C = bit 4, N = bit 3 of r2
        itt         cs
        ldmiacs     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        stmiacs     r0!, {r3, r4, r5, r6}
        itt         mi
        ldmiami     r1!, {r7, r8}           /*  8 bytes */
        stmiami     r0!, {r7, r8}
        movs        r12, r2, lsl #30    // C = bit 2, N = bit 1 of r2
        itt         cs
        ldrcs       r3, [r1], #4            /*  4 bytes */
        strcs       r3, [r0], #4
        itt         mi
        ldrhmi      r4, [r1], #2            /*  2 bytes */
        strhmi      r4, [r0], #2
        tst         r2, #0x1
        itt         ne
        ldrbne      r3, [r1]                /*  last byte  */
        strbne      r3, [r0]
6:
        ldmfd       sp!, {r4, r5, r6, r7, r8}
        // NOTE(review): pops the includer's {r0, lr} frame straight into
        // {r0, pc}: restores dst as the return value and returns.
        ldmfd       sp!, {r0, pc}
