1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *  * Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 *  * Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in
12 *    the documentation and/or other materials provided with the
13 *    distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28/*
29 * Copyright (c) 2013 ARM Ltd
30 * All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. The name of the company may not be used to endorse or promote
41 *    products derived from this software without specific prior written
42 *    permission.
43 *
44 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
45 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
46 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
48 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
49 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
50 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
51 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
52 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
53 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54 */
55
/*
 * MEMCPY_BASE: core copy loop for memcpy when the mutual alignment of
 * src and dst is unknown.
 *
 * Register contract (established by the caller, per the CFI below):
 *   r0 = dst, r1 = src, r2 = n (byte count, assumed >= 0)
 *   The caller has pushed {r0, lr} (CFA offset 8: r0 at sp+0, lr at sp+4),
 *   so the final "pop {r0, pc}" both returns and reloads the original
 *   dst pointer into r0 (memcpy returns dst).
 * Clobbers: r3, ip, lr, d0-d7, flags.
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Assumes that n >= 0, and dst, src are valid pointers.
        // For any sizes less than 832 use the neon code that doesn't
        // care about the src alignment. This avoids any checks
        // for src alignment, and offers the best improvement since
        // smaller sized copies are dominated by the overhead of
        // the pre and post main loop.
        // For larger copies, if src and dst cannot both be aligned to
        // word boundaries, use the neon code.
        // For all other copies, align dst to a double word boundary
        // and copy using LDRD/STRD instructions.

        cmp     r2, #16
        blo     .L_copy_less_than_16_unknown_align

        // TODO: The aligned copy code is extremely slow copying some large
        //       buffers so always go through the unaligned path for now.
        //cmp     r2, #832
        //bge     .L_check_alignment

.L_copy_unknown_alignment:
        // Unknown alignment of src and dst.
        // Assumes that the first few bytes have already been prefetched.

        // Align destination to 128 bits. The mainloop store instructions
        // require this alignment or they will throw an exception.
        rsb         r3, r0, #0
        ands        r3, r3, #0xF            // r3 = bytes needed to 16-byte align dst (0-15).
        beq         2f

        // Copy up to 15 bytes (count in r3).
        sub         r2, r2, r3
        // Fold the low count bits into the flags:
        //   N = bit 0 of r3 (copy 1 byte), C = bit 1 of r3 (copy 2 bytes).
        movs        ip, r3, lsl #31

        itt         mi
        ldrbmi      lr, [r1], #1
        strbmi      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1

        // N = bit 2 of r3 (copy 4 bytes), C = bit 3 of r3 (copy 8 bytes).
        movs        ip, r3, lsl #29
        bge         1f
        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!

2:      // Make sure we have at least 64 bytes to copy.
        subs        r2, r2, #64
        blo         2f

1:      // The main loop copies 64 bytes at a time.
        vld1.8      {d0  - d3},   [r1]!
        vld1.8      {d4  - d7},   [r1]!
        // Prefetch 4 iterations (256 bytes) ahead of the current src position.
        pld         [r1, #(64*4)]
        subs        r2, r2, #64
        vst1.8      {d0  - d3},   [r0, :128]!
        vst1.8      {d4  - d7},   [r0, :128]!
        bhs         1b

2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
        // (r2 is currently count-64; r2+32 >= 0 iff count >= 32.)
        adds        r2, r2, #32
        blo         3f

        // 32 bytes. These cache lines were already preloaded.
        vld1.8      {d0 - d3},  [r1]!
        sub         r2, r2, #32
        vst1.8      {d0 - d3},  [r0, :128]!
3:      // Less than 32 left.
        add         r2, r2, #32             // Restore r2 to the true remaining count (0-31).
        tst         r2, #0x10
        beq         .L_copy_less_than_16_unknown_align
        // Copies 16 bytes, destination 128 bits aligned.
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

.L_copy_less_than_16_unknown_align:
        // Copy up to 15 bytes (count in r2).
        // C = bit 3 of r2 (copy 8 bytes), N = bit 2 of r2 (copy 4 bytes).
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!

2:      // Copy 0 to 4 bytes.
        // Z clear = bit 0 of r2 set (copy 1 byte), C = bit 1 of r2 (copy 2 bytes).
        lsls        r2, r2, #31
        itt         ne
        ldrbne      lr, [r1], #1
        strbne      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1]
        strbcs      ip, [r0], #1
        strbcs      lr, [r0]

        // Reload the original dst pointer into r0 and return.
        pop         {r0, pc}

.L_check_alignment:
        // NOTE: currently unreferenced — only reached when the disabled
        // "cmp r2, #832" check above is re-enabled.
        // If src and dst cannot both be aligned to a word boundary,
        // use the unaligned copy version.
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        bne     .L_copy_unknown_alignment
        // Otherwise fall through into MEMCPY_BASE_ALIGNED (defined next).
END(MEMCPY_BASE)
172
/*
 * MEMCPY_BASE_ALIGNED: copy loop for large copies where dst and src can
 * both be word aligned; copies with LDRD/STRD after aligning dst to a
 * double word boundary.
 *
 * Register contract (established by the caller, per the CFI below):
 *   r0 = dst, r1 = src, r2 = n (byte count; the code at label 1 assumes
 *   n > 64 on entry — see the comment there)
 *   The caller has pushed {r0, lr} (CFA offset 8), so the final
 *   "pop {r0, pc}" both returns and reloads the original dst into r0.
 * Clobbers: r3, ip, lr, flags; r4-r9 are saved/restored here.
 */
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // To try and improve performance, stack layout changed,
        // i.e., not keeping the stack looking like users expect
        // (highest numbered register at highest address).
        strd    r4, r5, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        strd    r6, r7, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r6, 0
        // FIX: the strd above stores r7 at sp+4; this offset was previously
        // recorded as 0, duplicating r6's slot and corrupting unwind info.
        .cfi_rel_offset r7, 4
        strd    r8, r9, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r8, 0
        .cfi_rel_offset r9, 4

        // Optimized for already aligned dst code.
        ands    ip, r0, #3
        bne     .L_dst_not_word_aligned

.L_word_aligned:
        // Align the destination buffer to 8 bytes, to make sure double
        // loads and stores don't cross a cache line boundary,
        // as they are then more expensive even if the data is in the cache
        // (require two load/store issue cycles instead of one).
        // If only one of the buffers is not 8 bytes aligned,
        // then it's more important to align dst than src,
        // because there is more penalty for stores
        // than loads that cross a cacheline boundary.
        // This check and realignment are only done if there is >= 832
        // bytes to copy.

        // Dst is word aligned, but check if it is already double word aligned.
        ands    r3, r0, #4
        beq     1f
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        sub     r2, #4

1:      // Can only get here if > 64 bytes to copy, so don't check r2.
        sub     r2, #64

2:      // Every loop iteration copies 64 bytes.
        .irp    offset, #0, #8, #16, #24, #32
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        ldrd    r4, r5, [r1, #40]
        ldrd    r6, r7, [r1, #48]
        ldrd    r8, r9, [r1, #56]

        // Keep the pld as far from the next load as possible.
        // The amount to prefetch was determined experimentally using
        // large sizes, and verifying the prefetch size does not affect
        // the smaller copies too much.
        // WARNING: If the ldrd and strd instructions get too far away
        //          from each other, performance suffers. Three loads
        //          in a row is the best tradeoff.
        pld     [r1, #(64*16)]
        strd    r4, r5, [r0, #40]
        strd    r6, r7, [r0, #48]
        strd    r8, r9, [r0, #56]

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     2b

        // Fix-up the remaining count and make sure we have >= 32 bytes left.
        // (r2 is currently remaining-64; r2+32 >= 0 iff remaining >= 32.)
        adds    r2, r2, #32
        blo     4f

        // Copy 32 bytes. These cache lines were already preloaded.
        .irp    offset, #0, #8, #16, #24
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #32
        add     r0, r0, #32
        sub     r2, r2, #32
4:      // Less than 32 left.
        add     r2, r2, #32             // Restore r2 to the true remaining count (0-31).
        tst     r2, #0x10
        beq     5f
        // Copy 16 bytes.
        .irp    offset, #0, #8
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #16
        add     r0, r0, #16

5:      // Copy up to 15 bytes (count in r2).
        // C = bit 3 of r2 (copy 8 bytes), N = bit 2 of r2 (copy 4 bytes).
        movs    ip, r2, lsl #29
        bcc     1f
        // Copy 8 bytes.
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
1:      bge         2f
        // Copy 4 bytes.
        ldr     r4, [r1], #4
        str     r4, [r0], #4
2:      // Copy 0 to 4 bytes.
        // Z clear = bit 0 of r2 set (copy 1 byte), C = bit 1 of r2 (copy 2 bytes).
        lsls    r2, r2, #31
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        // Restore registers: optimized pop {r0, pc}
        // (pop in the reverse order of the strd-based saves above).
        ldrd    r8, r9, [sp], #8
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}

.L_dst_not_word_aligned:
        // Align dst to word; ip holds dst & 3 (1-3), so copy 4-ip bytes.
        rsb     ip, ip, #4
        cmp     ip, #2

        itt     gt
        ldrbgt  lr, [r1], #1            // 3rd byte only when ip == 3.
        strbgt  lr, [r0], #1

        itt     ge
        ldrbge  lr, [r1], #1            // 2nd byte when ip >= 2.
        strbge  lr, [r0], #1

        ldrb    lr, [r1], #1            // Always copy at least 1 byte.
        strb    lr, [r0], #1

        sub     r2, r2, ip

        // Src is guaranteed to be at least word aligned by this point.
        b       .L_word_aligned
END(MEMCPY_BASE_ALIGNED)
319