/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>
#include <private/libc_events.h>

        .text
        .syntax unified
        .fpu    neon

/* Tuning constants: cache line size, block-size thresholds that select the
 * copy loop, and how far ahead (downwards) of the read pointer each loop
 * prefetches. */
#define CACHE_LINE_SIZE         (64)
#define MEMCPY_BLOCK_SIZE_SMALL (32768)
#define MEMCPY_BLOCK_SIZE_MID   (1048576)
#define PREFETCH_DISTANCE_NEAR  (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_MID   (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_FAR   (CACHE_LINE_SIZE*16)

/*
 * void *memmove(void *dst, const void *src, size_t n)
 *
 * In:     r0 = dst, r1 = src, r2 = n (bytes)
 * Out:    r0 = dst (the entry value of r0 is saved and restored on the
 *         backward path; the forward path leaves the result to memcpy)
 * Clobb:  backward path: r1-r3, ip, q0-q3, q8-q11, flags
 * Stack:  backward path only: 8 bytes for {r0, lr}
 *
 * Strategy: if dst is below src, or dst is at least n bytes above src,
 * tail-call memcpy. Otherwise dst overlaps the tail of src, and the copy
 * runs from the highest address downwards so that source bytes are read
 * before they are overwritten.
 *
 * NOTE(review): the dst < src case is handed to memcpy even when the
 * buffers overlap, so this assumes the memcpy linked with this file copies
 * in ascending address order - confirm against that implementation.
 */
ENTRY(memmove)
        /* Return immediately when n == 0 or dst == src. */
        cmp         r2, #0
        cmpne       r0, r1
        bxeq        lr
        /* r3 = dst - src; unsigned "lower or same" means dst is below src. */
        subs        r3, r0, r1
        bls         .L_jump_to_memcpy
        /* dst is above src: go backwards only if n > dst - src. */
        cmp         r2, r3
        bhi         .L_reversed_memcpy

.L_jump_to_memcpy:
        b           memcpy

.L_reversed_memcpy:
        /* Save dst for the return value, and lr so pop {r0, pc} returns. */
        push        {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* From here on r0/r1 point one past the last byte still to copy. */
        add         r0, r0, r2
        add         r1, r1, r2

        /* preload next cache line */
        pld         [r1, #-CACHE_LINE_SIZE]
        pld         [r1, #-CACHE_LINE_SIZE*2]

.L_reversed_memcpy_align_dest:
        /* Deal with very small blocks (< 32bytes) asap */
        cmp         r2, #32
        blo         .L_reversed_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_reversed_memcpy_lt_128bytes
        /* align destination to 64 bytes (1 cache line) */
        /* r3 = number of bytes (1..63) to peel off the top so that r0
         * becomes 64-byte aligned; they are removed from r2 up front. */
        ands        r3, r0, #0x3f
        beq         .L_reversed_memcpy_dispatch
        sub         r2, r2, r3
0:      /* copy 1 byte */
        /* lsl #31 puts bit 0 of r3 in N and bit 1 in C; the conditional
         * loads/stores below leave the flags untouched. */
        movs        ip, r3, lsl #31
        ldrbmi      ip, [r1, #-1]!
        strbmi      ip, [r0, #-1]!
1:      /* copy 2 bytes */
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
2:      /* copy 4 bytes */
        /* lsl #29: N = bit 2 of r3 (4 bytes), C = bit 3 (8 bytes). */
        movs        ip, r3, lsl #29
        bpl         3f
        sub         r1, r1, #4
        sub         r0, r0, #4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
3:      /* copy 8 bytes */
        /* C is still valid: sub (no 's') and the NEON ops do not set flags. */
        bcc         4f
        sub         r1, r1, #8
        sub         r0, r0, #8
        vld1.8      {d0}, [r1]
        vst1.8      {d0}, [r0, :64]
4:      /* copy 16 bytes */
        /* lsl #27: N = bit 4 of r3 (16 bytes), C = bit 5 (32 bytes). */
        movs        ip, r3, lsl #27
        bpl         5f
        sub         r1, r1, #16
        sub         r0, r0, #16
        vld1.8      {q0}, [r1]
        vst1.8      {q0}, [r0, :128]
5:      /* copy 32 bytes */
        bcc         .L_reversed_memcpy_dispatch
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0, :256]

.L_reversed_memcpy_dispatch:
        /* Here r0 is 64-byte aligned; pick a loop by remaining length. */
        /* preload more cache lines */
        pld         [r1, #-CACHE_LINE_SIZE*3]
        pld         [r1, #-CACHE_LINE_SIZE*4]

        cmp         r2, #MEMCPY_BLOCK_SIZE_SMALL
        blo         .L_reversed_memcpy_neon_pld_near
        cmp         r2, #MEMCPY_BLOCK_SIZE_MID
        blo         .L_reversed_memcpy_neon_pld_mid
        b           .L_reversed_memcpy_neon_pld_far

.L_reversed_memcpy_neon_pld_near:
        /* less than 128 bytes? */
        subs        r2, r2, #128
        blo         1f
        /* Bias both pointers down by 32 so they address the start of the
         * topmost 32-byte chunk; r3 = -32 is the post-index stride. */
        sub         r1, r1, #32
        sub         r0, r0, #32
        mov         r3, #-32
        .align      4       /* on ARM gas .align takes a power of two: 16 bytes */
0:
        /* copy 128 bytes in each loop */
        /* The flags set here drive the bhs at the bottom of the loop;
         * nothing in between modifies them. */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        bhs         0b
        /* Undo the 32-byte bias: back to "one past the remaining bytes". */
        add         r1, r1, #32
        add         r0, r0, #32
1:
        /* Undo the 128-byte bias on r2; zero means nothing is left. */
        adds        r2, r2, #128
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_neon_pld_mid:
        /* Same loop shape as the near variant, with its own prefetch offset.
         * No "< 128" check: dispatch only sends r2 >= MEMCPY_BLOCK_SIZE_SMALL
         * here. */
        subs        r2, r2, #128
        sub         r1, r1, #32
        sub         r0, r0, #32
        mov         r3, #-32
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3
        vld1.8      {q0, q1}, [r1], r3
        vst1.8      {q0, q1}, [r0, :256], r3

        bhs         0b
        add         r1, r1, #32
        add         r0, r0, #32
1:
        adds        r2, r2, #128
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_neon_pld_far:
        /* Bias r2 and both pointers down by one whole 128-byte block. Each
         * iteration loads the full block before storing any of it, walking
         * upwards inside the block with post-increment. */
        sub         r2, r2, #128
        sub         r0, r0, #128
        sub         r1, r1, #128
        .align      4
0:
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload to cache */
        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
        /* read */
        vld1.8      {q0, q1}, [r1]!
        vld1.8      {q2, q3}, [r1]!
        vld1.8      {q8, q9}, [r1]!
        vld1.8      {q10, q11}, [r1]!
        /* write */
        vst1.8      {q0, q1}, [r0, :256]!
        vst1.8      {q2, q3}, [r0, :256]!
        vst1.8      {q8, q9}, [r0, :256]!
        vst1.8      {q10, q11}, [r0, :256]!

        /* Step to the next lower block: -128 to undo the post-increments,
         * -128 to move down. Plain sub keeps the flags from the subs above. */
        sub         r0, r0, #256
        sub         r1, r1, #256
        bhs         0b
        add         r0, r0, #128
        add         r1, r1, #128
1:
        adds        r2, r2, #128
        bne         .L_reversed_memcpy_lt_128bytes
        pop         {r0, pc}

.L_reversed_memcpy_lt_128bytes:
        /* Tail: r2 < 128 bytes remain below r0/r1. Each bit of r2 selects
         * one power-of-two sized chunk. r0 may be unaligned here, so the
         * stores carry no alignment qualifier. */
6:      /* copy 64 bytes */
        /* lsl #26: C = bit 6 of r2 (64 bytes), N = bit 5 (32 bytes). */
        movs        ip, r2, lsl #26
        bcc         5f
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
5:      /* copy 32 bytes */
        bpl         4f
        sub         r1, r1, #32
        sub         r0, r0, #32
        vld1.8      {q0, q1}, [r1]
        vst1.8      {q0, q1}, [r0]
.L_reversed_memcpy_lt_32bytes:
4:      /* copy 16 bytes */
        /* lsl #28: C = bit 4 of r2 (16 bytes), N = bit 3 (8 bytes). */
        movs        ip, r2, lsl #28
        bcc         3f
        sub         r1, r1, #16
        sub         r0, r0, #16
        vld1.8      {q0}, [r1]
        vst1.8      {q0}, [r0]
3:      /* copy 8 bytes */
        bpl         2f
        sub         r1, r1, #8
        sub         r0, r0, #8
        vld1.8      {d0}, [r1]
        vst1.8      {d0}, [r0]
2:      /* copy 4 bytes */
        ands        ip, r2, #0x4
        beq         1f
        sub         r1, r1, #4
        sub         r0, r0, #4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]
1:      /* copy 2 bytes */
        /* lsl #31: C = bit 1 of r2 (2 bytes), N = bit 0 (1 byte). */
        movs        ip, r2, lsl #31
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
        ldrbcs      ip, [r1, #-1]!
        strbcs      ip, [r0, #-1]!
0:      /* copy 1 byte */
        ldrbmi      ip, [r1, #-1]!
        strbmi      ip, [r0, #-1]!

        /* Restore the original dst as the return value and return. */
        pop         {r0, pc}

END(memmove)