/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

/* Incoming operands: destination pointer, source pointer, byte count. */
#define dstin   x0
#define src     x1
#define count   x2

/* Scratch registers, with 32-bit views of the same registers. */
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5

/* Working destination pointer; the code advances this instead of dstin. */
#define dst     x6

/* Register pairs that stage 16-byte chunks for ldp/stp. */
#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

        /* Entry point of the copy body.
         *
         * All writes go through dst, so dstin (x0) is left untouched
         * (presumably so it can serve as the return value -- confirm
         * against the file that includes this fragment).
         *
         * Dispatch on size:
         *   count >= 64        -> .Lcpy_not_short
         *   count <= 15        -> .Ltail15tiny
         *   16 <= count <= 63  -> fall through into .Ltail63
         *
         * NOTE(review): b.ge / b.le are signed tests, so a count with the
         * top bit set would be routed to the tiny path.  Presumably such
         * sizes never occur -- confirm with callers.  */
        mov     dst, dstin
        cmp     count, #64
        b.ge    .Lcpy_not_short
        cmp     count, #15
        b.le    .Ltail15tiny

        /* Deal with small copies quickly by dropping straight into the
         * exit block.  */
.Ltail63:
        /* Copy up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
        ands    tmp1, count, #0x30      /* tmp1 = 0, 16, 32 or 48.  */
        b.eq    .Ltail15
        /* Advance both pointers past the whole 16-byte chunks first, then
         * copy those chunks using negative offsets.  */
        add     dst, dst, tmp1
        add     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f                      /* Exactly 32: two chunks.  */
        b.lt    2f                      /* 16: one chunk.  */
        ldp     A_l, A_h, [src, #-48]   /* 48: three chunks.  */
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]

.Ltail15:
        /* Copy the remaining count & 15 bytes with one 16-byte load/store
         * that ends exactly at the end of the data.  The access reaches
         * back before the current position, so this path relies on at
         * least 16 bytes being copied in total.  */
        ands    count, count, #15
        beq     1f
        add     src, src, count
        ldp     A_l, A_h, [src, #-16]
        add     dst, dst, count
        stp     A_l, A_h, [dst, #-16]
1:
        ret
.Ltail15tiny:
        /* Copy up to 15 bytes of data.  Does not assume additional data
         * being copied.  Each of bits 3..0 of count selects an 8-, 4-, 2-
         * or 1-byte transfer; the post-indexed accesses advance src and
         * dst as they go.  */
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret

.Lcpy_not_short:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that we don't cross cache line
         * boundaries on both loads and stores.  */
        neg     tmp2, src
        ands    tmp2, tmp2, #15         /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Copy more data than needed; it's faster than jumping
         * around copying sub-Quadword quantities.  We know that
         * it can't overrun.  */
        ldp     A_l, A_h, [src]
        add     src, src, tmp2
        stp     A_l, A_h, [dst]
        add     dst, dst, tmp2
        /* There may be fewer than 64 bytes to go now.  */
        cmp     count, #63
        b.le    .Ltail63
2:
        /* From here on count is biased by -128; only its sign and its
         * bottom 6 bits are examined below.  */
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /* Less than 128 bytes to copy, so handle 64 here and then jump
         * to the tail.  */
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst]
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
        tst     count, #0x3f            /* Any bytes left after these 64?  */
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lcpy_body_large:
        /* There are at least 128 bytes to copy.
         * The first 64 bytes are loaded up front; each loop iteration then
         * stores the 64 bytes already in registers while loading the next
         * 64.  dst is biased by -16 and src by +48 so that the final
         * load/store of each iteration can use writeback to step both
         * pointers by 64.  */
        ldp     A_l, A_h, [src, #0]
        sub     dst, dst, #16           /* Pre-bias.  */
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
1:
        stp     A_l, A_h, [dst, #16]
        ldp     A_l, A_h, [src, #16]
        stp     B_l, B_h, [dst, #32]
        ldp     B_l, B_h, [src, #32]
        stp     C_l, C_h, [dst, #48]
        ldp     C_l, C_h, [src, #48]
        stp     D_l, D_h, [dst, #64]!
        ldp     D_l, D_h, [src, #64]!
        subs    count, count, #64
        b.ge    1b
        /* Drain the 64 bytes still held in registers, then remove the
         * pre-bias from both pointers.  */
        stp     A_l, A_h, [dst, #16]
        stp     B_l, B_h, [dst, #32]
        stp     C_l, C_h, [dst, #48]
        stp     D_l, D_h, [dst, #64]
        add     src, src, #16
        add     dst, dst, #64 + 16
        tst     count, #0x3f            /* Any bytes left for the tail?  */
        b.ne    .Ltail63
        ret