/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses are permitted
 *
 */

/* Register roles for the copy code in this file.  Every register named
 * here lies in x0-x14, which AAPCS64 treats as caller-saved, and the code
 * in this file never touches the stack.  */

/* Destination pointer as passed in.  The code in this file never writes
 * it, so x0 still holds the original destination at every ret.  */
#define dstin	x0
/* Source cursor; advanced as data is consumed.  */
#define src	x1
/* Byte count; reduced, biased or masked as the copy proceeds.  */
#define count	x2

/* Scratch registers with their 32-bit views.  tmp2w, tmp3 and tmp3w are
 * not referenced by the code in this file; the names are kept in case
 * other code that shares these definitions relies on them.  */
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
/* Destination cursor; a working copy of dstin.  */
#define dst	x6

/* Four register pairs, each moved with a single ldp/stp (16 bytes per
 * pair, 64 bytes for A-D together).  */
#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

/* Entry: pick a copy strategy from the byte count.
 *
 *   count >= 64         -> .Lcpy_not_short
 *   count <= 15         -> .Ltail15tiny
 *   16 <= count <= 63   -> fall through into .Ltail63
 *
 * dst becomes the working destination pointer so that dstin (x0) is left
 * untouched.  Both branches use signed conditions (ge/le), so a count with
 * bit 63 set compares as negative and is routed to .Ltail15tiny.  */
	mov	dst, dstin
	cmp	count, #64
	b.ge	.Lcpy_not_short
	cmp	count, #15
	b.le	.Ltail15tiny

	/* Deal with small copies (16-63 bytes) quickly by dropping straight
	 * into the exit block.  */
.Ltail63:
	/* Copy the whole 16-byte units of the remainder: 0, 16, 32 or 48
	 * bytes, selected by bits 4-5 of count.  At this point only the
	 * bottom 6 bits of count need to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	/* Advance both pointers past the chunk first, then copy it using
	 * negative offsets, entering the ladder below at the rung that
	 * matches the chunk size.  */
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f			/* 32 bytes: two pairs.  */
	b.lt	2f			/* 16 bytes: one pair.  */
	/* 48 bytes: all three pairs.  */
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]

.Ltail15:
	/* Copy the final (count & 15) bytes with one 16-byte load/store
	 * that ends exactly at the end of the buffers.  The access starts
	 * before the current src/dst, so it re-copies bytes that were
	 * already transferred; every path into here within this file has
	 * moved at least 16 bytes beforehand, which keeps the access
	 * inside the buffers.  */
	ands	count, count, #15
	b.eq	1f			/* Nothing left.  */
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	ret

.Ltail15tiny:
	/* Copy up to 15 bytes of data.  Does not assume additional data
	 * being copied, so unlike .Ltail15 it never reads or writes
	 * outside [src, src + count) / [dst, dst + count).  Bits 3..0 of
	 * count select an 8-, 4-, 2- and 1-byte transfer in turn; the
	 * post-indexed accesses step src and dst past each piece.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	/* Last byte: the pointers are not needed afterwards, so no
	 * write-back.  */
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lcpy_not_short:
	/* Reached from the entry code with count >= 64.
	 * We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f			/* SRC is already aligned.  */
	sub	count, count, tmp2
	/* Copy a full 16 bytes even though only tmp2 (1-15) of them are
	 * accounted for; it's faster than jumping around copying
	 * sub-Quadword quantities.  With at least 64 bytes to copy it
	 * can't overrun, and the surplus bytes are simply copied again by
	 * the code that follows.  */
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be fewer than 64 bytes (as few as 49) to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	/* Bias count by -128: a non-negative result means at least 128
	 * bytes remain.  */
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	/* count is still biased by -128, but its bottom 6 bits equal the
	 * number of bytes left after this 64-byte block, which is all
	 * .Ltail63 looks at.  The adds below are plain add (not adds), so
	 * the flags from tst survive until the branch.  */
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63
	ret

/* Critical loop.  Start at a new cache line boundary.  Assuming
 * 64 bytes per line this ensures the entire loop is in one line: the
 * five set-up instructions plus the ten loop instructions occupy
 * 60 bytes.  Do not insert instructions ahead of the loop's branch.  */
	.p2align 6
.Lcpy_body_large:
	/* There are at least 128 bytes to copy; count arrives biased by
	 * -128.  Preload the first 64 bytes into A-D.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
1:
	/* Invariant: A-D hold 64 loaded-but-unstored bytes; the next store
	 * goes to dst + 16 and the next load comes from src + 16.  Each
	 * pass stores those 64 bytes while loading the following 64; the
	 * write-back forms on the D pair advance dst and src by 64.  */
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the 64 bytes still held in A-D.  */
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	/* Remove the pre-bias so src and dst point at the first byte not
	 * yet copied.  */
	add	src, src, #16
	add	dst, dst, #64 + 16
	/* count is negative here, but its bottom 6 bits are the number of
	 * bytes still to copy (0-63).  */
	tst	count, #0x3f
	b.ne	.Ltail63
	ret