/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
/*
 * This code assumes it is running on a processor that supports all arm v7
 * instructions, that supports neon instructions, and that has a 32 byte
 * cache line.
 */
34
/*
 * MEMCPY_BASE: NEON memcpy body for buffers that may be mutually
 * unaligned.
 *
 * In:   r0 = dst, r1 = src, r2 = byte count
 * Out:  r0 = original dst (reloaded from the stack on exit)
 * Clobbers: r3, ip, lr, d0-d7, flags
 *
 * NOTE(review): the .save/.cfi directives describe a push of {r0, lr}
 * performed before entry (by the memcpy() wrapper that branches here);
 * the matching ldmfd at the end pops that frame — confirm against the
 * wrapper in the enclosing memcpy.S.
 */
ENTRY(MEMCPY_BASE)
        .cfi_startproc
        .save       {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Check so divider is at least 16 bytes, needed for alignment code.
        // Counts below 16 go straight to the small-copy tail at 5f.
        cmp         r2, #16
        blo         5f

        /* check if buffers are aligned. If so, run arm-only version
         * (dst and src mutually 32-bit aligned => LDM/STM variant) */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         __memcpy_base_aligned

        /* Check the upper size limit for Neon unaligned memory access in
         * memcpy: below 224 bytes, aligning the destination is not worth
         * it — jump straight to the main NEON loops. */
        cmp         r2, #224
        blo         3f

        /* align destination to 16 bytes for the write-buffer */
        rsb         r3, r0, #0          // r3 = -dst (mod 2^32)
        ands        r3, r3, #0xF        // r3 = 0..15 bytes to 16-byte alignment
        beq         3f                  // already aligned

        /* copy up to 15-bytes (count in r3).  Each flag-setting shift
         * below exposes two bits of r3: the shifted-out bit lands in C
         * and the new bit 31 sets N, selecting the 1/2/4/8-byte copies
         * via conditional execution. */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31     // N = bit 0 (1 byte), C = bit 1 (2 bytes)
        itt         mi
        ldrbmi      lr, [r1], #1
        strbmi      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1
        movs        ip, r3, lsl #29     // N = bit 2 (4 bytes), C = bit 3 (8 bytes)
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld         [r1, #0]
        pld         [r1, #(32 * 2)]
3:
        /* make sure we have at least 64 bytes to copy
         * (r2 stays biased by -64 through the main loop) */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need (32-byte lines, per the
         * assumption at the top of the file) */
        pld         [r1, #(32 * 4)]
        pld         [r1, #(32 * 6)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(32 * 6)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left (r2 still biased by -32; undo it) */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* copy up to 15-bytes (count in r2); same flag-shift trick as
         * the alignment tail above, largest chunks first */
        movs        ip, r2, lsl #29     // N = bit 2 (4 bytes), C = bit 3 (8 bytes)
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0]!
2:      movs        ip, r2, lsl #31     // N = bit 0 (1 byte), C = bit 1 (2 bytes)
        itt         mi
        ldrbmi      r3, [r1], #1
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1

        /* pop the original dst into r0 (memcpy returns dst) and return */
        ldmfd       sp!, {r0, lr}
        bx          lr

        .cfi_endproc
END(MEMCPY_BASE)
143
/*
 * MEMCPY_BASE_ALIGNED: ARM-only (LDM/STM) memcpy body used when dst and
 * src are mutually 32-bit aligned.
 *
 * In:   r0 = dst, r1 = src, r2 = byte count
 * Out:  r0 = original dst (popped from the stack on exit)
 * Clobbers: r3, r12/ip, lr, flags (r4-r8 are saved and restored here)
 *
 * NOTE(review): as with MEMCPY_BASE, the initial .save/.cfi directives
 * describe a push of {r0, lr} performed before entry — confirm against
 * the wrapper in the enclosing memcpy.S.
 */
ENTRY(MEMCPY_BASE_ALIGNED)
        .cfi_startproc

        .save       {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4-r8}        // callee-saved scratch for the ldm/stm bursts
        .save       {r4-r8}
        .cfi_adjust_cfa_offset 20
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        .cfi_rel_offset r6, 8
        .cfi_rel_offset r7, 12
        .cfi_rel_offset r8, 16
        pld         [r1, #(32 * 4)]

        /* Check alignment: r3 = 0..3 bytes needed to word-align the source
         * (dst has the same misalignment, since they are mutually aligned) */
        rsb         r3, r1, #0
        ands        r3, #3
        beq         2f                  // already word-aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31    // N = bit 0 (1 byte), C = bit 1 (2 bytes)
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        itt         mi
        ldrbmi      r3, [r1], #1
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      r4, [r1], #1
        ldrbcs      r5, [r1], #1
        strbcs      r4, [r0], #1
        strbcs      r5, [r0], #1

2:
        subs        r2, r2, #64         // bias count by -64 for the main loop
        blt         4f

3:      /* Main copy loop, copying 64 bytes at a time */
        pld         [r1, #(32 * 8)]     // prefetch well ahead of the loads
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #64
        bge         3b

4:      /* Check if there are >= 32 bytes left */
        adds        r2, r2, #64         // undo the -64 bias
        subs        r2, r2, #32
        blt         5f

        /* Copy 32 bytes */
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, #32

5:      /* Handle any remaining bytes (0..31; r2 still biased by -32) */
        adds        r2, #32             // undo the -32 bias
        beq         6f                  // nothing left

        /* Flag-shift trick: each flag-setting shift puts one bit of r2
         * into C (shifted out) and one into N (new bit 31) to select the
         * remaining chunk copies via conditional execution. */
        movs        r12, r2, lsl #28    // C = bit 4 (16 bytes), N = bit 3 (8 bytes)
        itt         cs
        ldmiacs     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        stmiacs     r0!, {r3, r4, r5, r6}
        itt         mi
        ldmiami     r1!, {r7, r8}           /*  8 bytes */
        stmiami     r0!, {r7, r8}
        movs        r12, r2, lsl #30    // C = bit 2 (4 bytes), N = bit 1 (2 bytes)
        itt         cs
        ldrcs       r3, [r1], #4            /*  4 bytes */
        strcs       r3, [r0], #4
        itt         mi
        ldrhmi      r4, [r1], #2            /*  2 bytes */
        strhmi      r4, [r0], #2
        tst         r2, #0x1
        itt         ne
        ldrbne      r3, [r1]                /*  last byte  */
        strbne      r3, [r0]
6:
        ldmfd       sp!, {r4-r8}        // restore callee-saved regs
        ldmfd       sp!, {r0, pc}       // pop original dst into r0 and return

        .cfi_endproc
END(MEMCPY_BASE_ALIGNED)
234