/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014, NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* Cache line size, in bytes, used to scale the pld offsets in this file. */
#define CACHE_LINE_SIZE         (64)
/* Byte offset ahead of the source pointer that the 128-byte copy loops
 * prefetch with pld: six cache lines. */
#define PREFETCH_DISTANCE       (CACHE_LINE_SIZE*6)

/*
 * MEMCPY_BASE - NEON copy core.
 *
 * Register use, as read from the code below:
 *   r0     destination pointer, advanced as bytes are stored
 *   r1     source pointer, advanced as bytes are loaded
 *   r2     byte count still to copy
 *   r3     bytes needed to bring r0 up to a 64-byte boundary
 *   ip     scratch for sub-8-byte copies and for flag-setting shifts
 *   q0-q1  NEON data registers (d0-d3)
 *
 * Stack contract: the CFI directives describe r0 at [sp, #0] and lr at
 * [sp, #4], and every exit is "pop {r0, pc}".  The matching push is not in
 * this file.  NOTE(review): presumably the code that enters here has done
 * "push {r0, lr}" with r0 still holding the original destination; confirm
 * in the including file.  ENTRY_PRIVATE, END and MEMCPY_BASE are likewise
 * defined outside this file.
 *
 * Bit-test idiom used throughout: "movs ip, rX, lsl #n" puts bit (31 - n)
 * of rX in N and bit (32 - n) in C, so one shift selects two chunk sizes.
 * The code relies on the non-flag-setting loads/stores, vld1/vst1 and pld
 * leaving the flags untouched, which is why a branch may consume flags that
 * were set several instructions earlier.
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* nothing to copy for a zero length or when dst == src */
        cmp         r2, #0
        beq         .L_memcpy_done
        cmp         r0, r1
        beq         .L_memcpy_done

        /* preload next cache line */
        pld         [r1, #CACHE_LINE_SIZE*1]

        /* Deal with very small blocks (< 32 bytes) asap */
        cmp         r2, #32
        blo         .L_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_memcpy_lt_128bytes

        /* large copy, align dest to 64 byte boundary */
        pld         [r1, #CACHE_LINE_SIZE*2]
        /* r3 = (-dst) & 63 = bytes up to the next 64-byte boundary */
        rsb         r3, r0, #0
        ands        r3, r3, #0x3F
        pld         [r1, #CACHE_LINE_SIZE*3]
        /* Z comes from the ands above: dest is already aligned */
        beq         .L_memcpy_dispatch
        sub         r2, r2, r3
        /* copy 1 byte (N = bit 0 of r3, C = bit 1 of r3) */
        movs        ip, r3, lsl #31
        itt         mi
        ldrbmi      ip, [r1], #1
        strbmi      ip, [r0], #1
        /* copy 2 bytes */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 4 bytes (N = bit 2 of r3, C = bit 3 of r3) */
        movs        ip, r3, lsl #29
        itt         mi
        ldrmi       ip, [r1], #4
        strmi       ip, [r0], #4
        /* copy 8 bytes */
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 16 bytes (N = bit 4 of r3, C = bit 5 of r3) */
        movs        ip, r3, lsl #27
        bpl         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 32 bytes */
        bcc         .L_memcpy_dispatch
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

.L_memcpy_dispatch:
        // pre-decrement by 128 to detect nearly-done condition easily, but
        // also need to check if we have less than 128 bytes left at this
        // point due to alignment code above
        subs        r2, r2, #128
        blo         .L_memcpy_lt_128presub

        // Denver does better if both source and dest are aligned so
        // we'll special-case that even though the code is virtually identical
        tst         r1, #0xF
        bne         .L_memcpy_neon_unalign_src_pld

        // DRAM memcpy should be throttled slightly to get full bandwidth
        //
        // r2 is already pre-decremented by 128 here; counts above the
        // threshold take the loop that carries no source alignment hint.
        cmp         r2, #32768
        bhi         .L_memcpy_neon_unalign_src_pld
        .align      4
1:
        /* copy 128 bytes in each loop */
        /* C from this subs is consumed by the bhs at the loop bottom;
         * nothing in between modifies the flags */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         1b
        /* undo the pre-decrement: r2 = bytes left (0..127) */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

        .align      4
.L_memcpy_neon_unalign_src_pld:
1:
        /* copy 128 bytes in each loop */
        /* same loop as above, but the loads carry no source alignment hint */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         1b
        /* undo the pre-decrement: r2 = bytes left (0..127) */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

.L_memcpy_lt_128presub:
        /* fewer than 128 bytes were left at dispatch: undo the pre-decrement */
        add         r2, r2, #128
.L_memcpy_lt_128bytes_align:
        /* Tail copy for paths that aligned the destination first; the
         * stores keep their alignment hints. */
        /* copy 64 bytes (C = bit 6 of r2, N = bit 5 of r2) */
        movs        ip, r2, lsl #26
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 32 bytes */
        bpl         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 16 bytes (C = bit 4 of r2, N = bit 3 of r2) */
        movs        ip, r2, lsl #28
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes (C = bit 1 of r2, N = bit 0 of r2) */
        movs        ip, r2, lsl #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte (last access, so no pointer write-back) */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

        pop         {r0, pc}

.L_memcpy_lt_128bytes:
        /* Tail copy for lengths below 128 that skipped the alignment step;
         * the stores carry no alignment hints. */
        /* copy 64 bytes (C = bit 6 of r2, N = bit 5 of r2) */
        movs        ip, r2, lsl #26
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
1:      /* copy 32 bytes */
        bpl         .L_memcpy_lt_32bytes
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
.L_memcpy_lt_32bytes:
        /* copy 16 bytes (C = bit 4 of r2, N = bit 3 of r2) */
        movs        ip, r2, lsl #28
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes (C = bit 1 of r2, N = bit 0 of r2) */
        movs        ip, r2, lsl #31
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte (last access, so no pointer write-back) */
        itt         mi
        ldrbmi      ip, [r1]
        strbmi      ip, [r0]

.L_memcpy_done:
        pop         {r0, pc}
END(MEMCPY_BASE)
235