/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>
#include <private/libc_events.h>

        .text
        .syntax unified
        .fpu    neon

/*
 * Tuning constants for the backward-copy loops in memmove below.
 *
 * CACHE_LINE_SIZE          Unit, in bytes, used for the pld offsets and for
 *                          the destination alignment (the code masks the
 *                          destination with 0x3f, i.e. CACHE_LINE_SIZE - 1).
 * MEMCPY_BLOCK_SIZE_SMALL  Remaining lengths below this use the "near" loop.
 * MEMCPY_BLOCK_SIZE_MID    Remaining lengths below this (and at least
 *                          MEMCPY_BLOCK_SIZE_SMALL) use the "mid" loop;
 *                          anything larger uses the "far" loop.
 * PREFETCH_DISTANCE_*      Base prefetch distance, in bytes, that each of
 *                          the three loops feeds into its pld offsets.
 *                          NEAR and MID currently share the same value.
 */
#define CACHE_LINE_SIZE         (64)
#define MEMCPY_BLOCK_SIZE_SMALL (32768)
#define MEMCPY_BLOCK_SIZE_MID   (1048576)
#define PREFETCH_DISTANCE_NEAR  (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_MID   (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_FAR   (CACHE_LINE_SIZE*16)

/*
 * void *memmove(void *dst, const void *src, size_t n)
 *
 * In:      r0 = dst, r1 = src, r2 = n
 * Out:     r0 = dst on the paths handled in this file (the early return
 *          leaves r0 untouched; the backward path saves r0 on entry and
 *          every exit is "pop {r0, pc}").  On the forward path the result
 *          is whatever memcpy returns.
 * Scratch: r1, r2, r3, ip, condition flags, q0-q3, q8-q11
 * Stack:   8 bytes (r0, lr), backward path only.
 *
 * Strategy:
 *   - n == 0 or dst == src: return at once.
 *   - dst below src, or dst at least n bytes above src: tail-call memcpy.
 *     NOTE(review): the dst < src overlapping case relies on memcpy
 *     copying in ascending address order - confirm against the memcpy
 *     this is linked with.
 *   - otherwise (src < dst < src + n): copy backward.  r0/r1 are moved to
 *     one past the end of each buffer and walk down:
 *       1. fewer than 32 / fewer than 128 bytes go straight to the tail
 *          code;
 *       2. otherwise the destination end is brought down to a 64-byte
 *          boundary;
 *       3. one of three 128-bytes-per-iteration loops (near / mid / far,
 *          chosen by remaining length) copies the bulk;
 *       4. the tail code copies the remaining 0..127 bytes.
 *
 * Flag idiom used throughout: "movs ip, rX, lsl #k" places bit (31 - k) of
 * rX in N and bit (32 - k) in C, so one shift tests two size bits.  The
 * loads, stores and pld that follow do not modify the flags.
 */
ENTRY(memmove)
        /* Nothing to do when n == 0 or dst == src; r0 already holds dst. */
        cmp         r2, #0
        cmpne       r0, r1
        bxeq        lr
        /* r3 = dst - src.  LS (unsigned <=): dst is not above src. */
        subs        r3, r0, r1
        bls         .L_jump_to_memcpy
        /* dst is above src: go backward only if the gap is smaller than n. */
        cmp         r2, r3
        bhi         .L_reversed_memcpy

.L_jump_to_memcpy:
        /* Tail call: r0-r2 and lr are still the caller's values. */
        b           memcpy

.L_reversed_memcpy:
        /* Save the return value (dst) and the return address. */
        push        {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Point both registers one past the last byte of their buffers. */
        add         r0, r0, r2
        add         r1, r1, r2

        /* preload next cache line */
        pld         [r1, #-CACHE_LINE_SIZE]
        pld         [r1, #-CACHE_LINE_SIZE*2]

.L_reversed_memcpy_align_dest:
        /* Deal with very small blocks (< 32bytes) asap */
        cmp         r2, #32
        blo         .L_reversed_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_reversed_memcpy_lt_128bytes
        /* align destination to 64 bytes (1 cache line) */
        /* r3 = bytes between the destination end and the boundary below it */
        ands        r3, r0, #0x3f
        beq         .L_reversed_memcpy_dispatch
        /* The r3 alignment bytes are copied right here, one size per bit. */
        sub         r2, r2, r3
0:      /* copy 1 byte */
        /* N = bit 0 of r3 (1 byte), C = bit 1 of r3 (2 bytes) */
        movs        ip, r3, lsl #31
        ldrbmi      ip, [r1, #-1]!
        strbmi      ip, [r0, #-1]!
1:      /* copy 2 bytes */
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
2:      /* copy 4 bytes */
        /* N = bit 2 of r3 (4 bytes), C = bit 3 of r3 (8 bytes) */
        movs        ip, r3, lsl #29
        bpl         3f
        sub         r1, r1, #4
        sub         r0, r0, #4
        /* Four single-byte lanes: a 4-byte copy with no source alignment need. */
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
3:      /* copy 8 bytes */
        bcc         4f
        sub         r1, r1, #8
        sub         r0, r0, #8
        vld1.8      {d0}, [r1]
        vst1.8      {d0}, [r0, :64]
4:      /* copy 16 bytes */
        /* N = bit 4 of r3 (16 bytes), C = bit 5 of r3 (32 bytes) */
        movs        ip, r3, lsl #27
        bpl         5f
        sub         r1, r1, #16
        sub         r0, r0, #16
        vld1.8      {q0}, [r1]
        vst1.8      {q0}, [r0, :128]
5:      /* copy 32 bytes */
        bcc         .L_reversed_memcpy_dispatch
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0, :256]

.L_reversed_memcpy_dispatch:
        /* Here r0 is 64-byte aligned and r2 is the number of bytes left. */
        /* preload more cache lines */
        pld         [r1, #-CACHE_LINE_SIZE*3]
        pld         [r1, #-CACHE_LINE_SIZE*4]

        /* Pick the bulk loop from the remaining length. */
        cmp         r2, #MEMCPY_BLOCK_SIZE_SMALL
        blo         .L_reversed_memcpy_neon_pld_near
        cmp         r2, #MEMCPY_BLOCK_SIZE_MID
        blo         .L_reversed_memcpy_neon_pld_mid
        b           .L_reversed_memcpy_neon_pld_far

.L_reversed_memcpy_neon_pld_near:
        /* less than 128 bytes? */
        /*
         * Possible here because the alignment step may have consumed up to
         * 63 bytes.  Otherwise r2 stays biased by -128 so that the "subs"
         * in the loop leaves HS set exactly while another full 128-byte
         * block remains after the current one.
         */
        subs        r2, r2, #128
        blo         1f
        /* Bias the pointers to the start of the topmost 32-byte piece. */
        sub         r1, r1, #32
        sub         r0, r0, #32
        /* Post-index step: each vld1/vst1 moves its pointer down 32 bytes. */
        mov         r3, #-32
        /* On ARM, gas .align takes a power of two: 16-byte loop alignment. */
        .align      4
0:
        /* copy 128 bytes in each loop */
        /* Flags set here are consumed by the "bhs" at the bottom. */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        bhs         0b
        /* Undo the 32-byte pointer bias. */
        add         r1, r1, #32
        add         r0, r0, #32
1:
        /* Undo the -128 bias: r2 = bytes still to copy (0..127). */
        adds        r2, r2, #128
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_neon_pld_mid:
        /*
         * Same loop shape as the near variant with a different pld offset.
         * r2 >= MEMCPY_BLOCK_SIZE_SMALL here, so the bias cannot underflow
         * and no "less than 128" check is needed.
         */
        subs        r2, r2, #128
        sub         r1, r1, #32
        sub         r0, r0, #32
        mov         r3, #-32
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        bhs         0b
        /* Undo the 32-byte pointer bias. */
        add         r1, r1, #32
        add         r0, r0, #32
1:
        /* Undo the -128 bias: r2 = bytes still to copy (0..127). */
        adds        r2, r2, #128
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_neon_pld_far:
        /*
         * r2 >= MEMCPY_BLOCK_SIZE_MID here.  r2 is biased by -128 as in the
         * other loops; r0/r1 are biased to the start of the topmost
         * 128-byte chunk because this loop walks each chunk upward.
         */
        sub         r2, r2, #128
        sub         r0, r0, #128
        sub         r1, r1, #128
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
        /* read */
        /*
         * All four loads are issued before the first store, so walking the
         * chunk in ascending order cannot overwrite source bytes that have
         * not been read yet.
         */
        vld1.8      {q0, q1}, [r1]!
        vld1.8      {q2, q3}, [r1]!
        vld1.8      {q8, q9}, [r1]!
        vld1.8      {q10, q11}, [r1]!
        /* write */
        vst1.8      {q0, q1}, [r0, :256]!
        vst1.8      {q2, q3}, [r0, :256]!
        vst1.8      {q8, q9}, [r0, :256]!
        vst1.8      {q10, q11}, [r0, :256]!

        /*
         * The writebacks advanced both pointers by 128; step back 256 to
         * reach the start of the next lower chunk.  Plain "sub" keeps the
         * flags from the "subs" above intact for the "bhs".
         */
        sub         r0, r0, #256
        sub         r1, r1, #256
        bhs         0b
        /* Undo the 128-byte pointer bias. */
        add         r0, r0, #128
        add         r1, r1, #128
1:
        /* Undo the -128 bias: r2 = bytes still to copy (0..127). */
        adds        r2, r2, #128
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_lt_128bytes:
        /*
         * Tail: at most 127 bytes are copied, one size per bit of r2 from
         * the largest down.  No alignment qualifiers are used because this
         * code is also entered without the destination having been aligned.
         */
6:      /* copy 64 bytes */
        /* C = bit 6 of r2 (64 bytes), N = bit 5 of r2 (32 bytes) */
        movs        ip, r2, lsl #26
        bcc         5f
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
5:      /* copy 32 bytes */
        bpl         4f
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
.L_reversed_memcpy_lt_32bytes:
4:      /* copy 16 bytes */
        /* C = bit 4 of r2 (16 bytes), N = bit 3 of r2 (8 bytes) */
        movs        ip, r2, lsl #28
        bcc         3f
        sub         r1, r1, #16
        sub         r0, r0, #16
        vld1.8      {q0}, [r1]
        vst1.8      {q0}, [r0]
3:      /* copy 8 bytes */
        bpl         2f
        sub         r1, r1, #8
        sub         r0, r0, #8
        vld1.8      {d0}, [r1]
        vst1.8      {d0}, [r0]
2:      /* copy 4 bytes */
        ands        ip, r2, #0x4
        beq         1f
        sub         r1, r1, #4
        sub         r0, r0, #4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]
1:      /* copy 2 bytes */
        /* C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte) */
        movs        ip, r2, lsl #31
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
0:      /* copy 1 byte */
        ldrbmi      ip, [r1, #-1]!
        strbmi      ip, [r0, #-1]!

        /* Restore dst as the return value and return. */
        pop         {r0, pc}

END(memmove)