14466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
24466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt/* Modified by SuperH, Inc. September 2003 */
34466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
44466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! Fast SH memcpy
54466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
64466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! by Toshiyasu Morita (tm@netcom.com)
74466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
84466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! SH5 code Copyright 2002 SuperH Ltd.
94466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
104466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! Entry: ARG0: destination pointer
114466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!        ARG1: source pointer
124466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!        ARG2: byte count
134466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
144466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! Exit:  RESULT: destination pointer
154466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!        any other registers in the range r0-r7: trashed
164466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
174466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! Notes: Usually one wants to do small reads and write a longword, but
184466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!        unfortunately it is difficult in some cases to concatenate bytes
194466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!        into a longword on the SH, so this does a longword read and small
204466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!        writes.
214466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
224466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! This implementation makes two assumptions about how it is called:
234466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
244466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! 1.: If the byte count is nonzero, the address of the last byte to be
254466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     copied is unsigned greater than the address of the first byte to
264466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     be copied.  This could be easily swapped for a signed comparison,
274466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     but the algorithm used needs some comparison.
284466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
294466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt! 2.: When there are two or three bytes in the last word of an 11-or-more
304466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     bytes memory chunk to be copied, the rest of the word can be read
314466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     without side effects.
3225985edcedea6396277003854657b5f3cb31a628Lucas De Marchi!     This could be easily changed by increasing the minimum size of
334466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
344466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     however, this would cost a few extra cycles on average.
354466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     For SHmedia, the assumption is that any quadword can be read in its
364466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!     entirety if at least one byte is included in the copy.
374466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt!
384466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
394466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	.section .text..SHmedia32,"ax"
404466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	.globl	memcpy
414466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	.type	memcpy, @function
424466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	.align	5
434466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
/*
 * memcpy entry point and size-class dispatch.
 *
 * Register usage as seen in this file:
 *   r2  - address that is stored through (destination); never written,
 *         so it still holds the destination pointer on return
 *   r3  - address that is loaded from (source)
 *   r4  - byte count
 *   r18 - address returned to (copied into a target register with ptabs)
 *
 * Counts of 25 or more go to Large.  Smaller counts are dispatched with
 * a computed branch into a table of 32-byte code slots that starts at L1.
 *
 * NOTE(review): the file header says only r0-r7 are trashed, but the code
 * in this file also writes r8, r9, r19-r25, r27 and r36 (plus tr0/tr1);
 * confirm against the ABI that these are all caller-saved.
 */
444466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundtmemcpy:
454466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
/*
 * Unaligned access helpers.  Each one pairs a "lo" access at offset O
 * with a "hi" access at the last byte of the datum (O+7 for .q, O+3
 * for .l).  For the load forms the two partial results land in D0 and D1
 * and the code below ORs them together.  STUAQ and STUAL are defined
 * here but not invoked anywhere in the code visible in this file.
 */
464466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
474466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
484466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
494466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
504466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
514466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ld.b r3,0,r63	/* read the first source byte; the value is not used (presumably an early touch of the source) */
524466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	pta/l Large,tr0
534466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	movi 25,r0
544466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	bgeu/u r4,r0,tr0	/* count >= 25 (unsigned): take the Large path */
554466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	nsb r4,r0	/* r0 = 63 for count 0, 62 for 1, 61 for 2..3, 60 for 4..7, 59 for 8..15, 58 for 16..24 */
564466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	shlli r0,5,r0	/* scale by 32: every size class owns one 32-byte code slot */
574466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	movi (L1-L0+63*32 + 1) & 0xffff,r1
584466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sub r1, r0, r0	/* r0 = (L1 - L0) + 32*(63 - nsb) + 1; the +1 presumably keeps the SHmedia mode bit set - TODO confirm */
594466b20cfcfa718ff515b9e3886749cc025e2005Paul MundtL0:	ptrel r0,tr0	/* tr0 = L0 + r0, i.e. the slot at L1 + 32*(63 - nsb) */
604466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	add r2,r4,r5	/* r5 = destination + count (one past the last destination byte) */
614466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ptabs r18,tr1	/* tr1 = return target; every small-copy slot ends with blink tr1 */
624466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	add r3,r4,r6	/* r6 = source + count (one past the last source byte) */
634466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr0,r63	/* jump to the selected slot */
644466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
654466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt/* Rearranged to make cut2 safe */
/*
 * Small-copy code (count < 25).  On entry to every slot, as set up by the
 * dispatch code at the top of memcpy:
 *   r2 = destination, r3 = source, r4 = count,
 *   r5 = destination + count, r6 = source + count, tr1 = return target.
 *
 * The dispatch computes its target as L1 + 32*k, with k = 0 for 0 bytes,
 * 1 for 1 byte, 2 for 2..3, 3 for 4..7, 4 for 8..15 and 5 for 16..24.
 * Every slot therefore has to stay exactly 32 bytes long (eight
 * instructions in the layout below); do not add or remove instructions
 * between L1 and the 16..24 byte case.  Short slots are padded by placing
 * the continuation of a longer case (L2_3, L8_15) in their unused tail.
 * The 4..7, 8..15 and 16..24 cases copy the leading and the trailing
 * bytes separately; the two stores may overlap in the middle.
 */
664466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	.balign 8
674466b20cfcfa718ff515b9e3886749cc025e2005Paul MundtL4_7:	/* 4..7 byte memcpy cntd. */
684466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.l r2, 0, r0	/* lo half of the first 4 bytes (hi half was stored before jumping here) */
694466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	or r6, r7, r6	/* merge the lo/hi loads of the last 4 source bytes */
704466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.l r5, -1, r6	/* store them so that they end at destination + count - 1 */
714466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.l r5, -4, r6
724466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr1,r63	/* return */
734466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
744466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	.balign 8
754466b20cfcfa718ff515b9e3886749cc025e2005Paul MundtL1:	/* 0 byte memcpy */
764466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	nop
774466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr1,r63	/* nothing to copy: return */
784466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	nop	/* the nops pad so that the slot (together with L2_3 below) fills 32 bytes */
794466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	nop
804466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	nop
814466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	nop
824466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
834466b20cfcfa718ff515b9e3886749cc025e2005Paul MundtL2_3:	/* 2 or 3 byte memcpy cntd. */
844466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	st.b r5,-1,r6	/* store the last byte at destination + count - 1 */
854466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr1,r63	/* return */
864466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
874466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	/* 1 byte memcpy */
	/* slot at L1 + 32 */
884466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ld.b r3,0,r0
894466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	st.b r2,0,r0
904466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr1,r63	/* return */
914466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
924466b20cfcfa718ff515b9e3886749cc025e2005Paul MundtL8_15:	/* 8..15 byte memcpy cntd. */
934466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r2, 0, r0	/* lo half of the first 8 bytes (hi half was stored before jumping here) */
944466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	or r6, r7, r6	/* merge the lo/hi loads of the last 8 source bytes */
954466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r5, -1, r6	/* store them so that they end at destination + count - 1 */
964466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r5, -8, r6
974466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr1,r63	/* return */
984466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
994466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	/* 2 or 3 byte memcpy */
	/* slot at L1 + 64: copies bytes 0 and 1, then the last byte (which is byte 1 again when count is 2) */
1004466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ld.b r3,0,r0	/* r0 = source byte 0 */
1014466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ld.b r2,0,r63	/* read destination byte 0; the value is not used */
1024466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ld.b r3,1,r1	/* r1 = source byte 1 */
1034466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	st.b r2,0,r0
1044466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	pta/l L2_3,tr0
1054466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ld.b r6,-1,r6	/* r6 = last source byte (at source + count - 1) */
1064466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	st.b r2,1,r1
1074466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr0, r63	/* continue at L2_3 */
1084466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
1094466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	/* 4 .. 7 byte memcpy */
	/* slot at L1 + 96 */
1104466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	LDUAL (r3, 0, r0, r1)	/* r0/r1 = lo/hi parts of the first 4 source bytes */
1114466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	pta L4_7, tr0
1124466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldlo.l r6, -4, r7	/* r7 = lo part of the last 4 source bytes */
1134466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	or r0, r1, r0	/* r0 = first 4 source bytes */
1144466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.l r2, 3, r0
1154466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldhi.l r6, -1, r6	/* r6 = hi part of the last 4 source bytes */
1164466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr0, r63	/* continue at L4_7 */
1174466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
1184466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	/* 8 .. 15 byte memcpy */
	/* slot at L1 + 128 */
1194466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	LDUAQ (r3, 0, r0, r1)	/* r0/r1 = lo/hi parts of the first 8 source bytes */
1204466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	pta L8_15, tr0
1214466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldlo.q r6, -8, r7	/* r7 = lo part of the last 8 source bytes */
1224466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	or r0, r1, r0	/* r0 = first 8 source bytes */
1234466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r2, 7, r0
1244466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldhi.q r6, -1, r6	/* r6 = hi part of the last 8 source bytes */
1254466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr0, r63	/* continue at L8_15 */
1264466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
1274466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	/* 16 .. 24 byte memcpy */
	/* slot at L1 + 160; last slot, so it is free to run longer than 32 bytes */
1284466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	LDUAQ (r3, 0, r0, r1)	/* source bytes 0..7 */
1294466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	LDUAQ (r3, 8, r8, r9)	/* source bytes 8..15 */
1304466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	or r0, r1, r0
1314466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r2, 7, r0
1324466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	or r8, r9, r8
1334466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r2, 15, r8
1344466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldlo.q r6, -8, r7	/* r7/r6 = lo/hi parts of the last 8 source bytes */
1354466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldhi.q r6, -1, r6
1364466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r2, 8, r8
1374466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r2, 0, r0
1384466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	or r6, r7, r6
1394466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r5, -1, r6	/* last 8 bytes, ending at destination + count - 1 */
1404466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r5, -8, r6
1414466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr1,r63	/* return */
1424466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
/*
 * Large: copy of 25 bytes or more (reached from the count >= 25 test at
 * the top of memcpy).  r2 = destination, r3 = source, r4 = count.
 *
 * Registers set up here:
 *   r22 = running destination pointer, chosen so that the matching source
 *         address (r22 + r6) is 8-byte aligned
 *   r6  = source - destination
 *   r5  = destination + count - 16
 *   r0  = source quadword already loaded for destination address r22
 * All source reads in the loops are aligned ldx.q loads; all destination
 * writes are sthi.q/stlo.q pairs.
 */
1434466b20cfcfa718ff515b9e3886749cc025e2005Paul MundtLarge:
1444466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ld.b r2, 0, r63	/* read the first destination byte; the value is not used */
1454466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	pta/l  Loop_ua, tr1
1464466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ori r3, -8, r7	/* r7 = (source & 7) - 8, a value in -8..-1 */
1474466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sub r2, r7, r22	/* r22 = destination + 8 - (source & 7) */
1484466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sub r3, r2, r6	/* r6 = source - destination */
1494466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	add r2, r4, r5
1504466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldlo.q r3, 0, r0	/* lo part only of the first source quadword (no matching ldhi.q) */
1514466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	addi r5, -16, r5	/* r5 = destination + count - 16 */
1524466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	movi 64+8, r27 // could subtract r7 from that.
	/* NOTE(review): the two stores below write r0 at the destination; since only the lo part was loaded, the bytes from r22 onward are presumably rewritten by the stores that follow - inferred from the address arithmetic, confirm */
1534466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r2, 0, r0
1544466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r2, 7, r0
1554466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldx.q r22, r6, r0	/* r0 = source quadword for destination address r22 */
1564466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	bgtu/l r27, r4, tr1	/* 72 > count: skip the 32-byte loop and go straight to Loop_ua */
1574466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
1584466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	addi r5, -48, r27	/* r27 = destination + count - 64: bound for Loop_line */
1594466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	pta/l Loop_line, tr0
1604466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	addi r6, 64, r36	/* index offsets relative to r22: source 64 bytes ahead ... */
1614466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	addi r6, -24, r19	/* ... and source for destination r22-24, r22-16, r22-8 */
1624466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	addi r6, -16, r20
1634466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	addi r6, -8, r21
1644466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
/*
 * Loop_line: 32 bytes per iteration.  r22 is advanced by 32 first, then
 * the quadword pending in r0 and three newly loaded quadwords are stored
 * at r22-32 .. r22-1, and the next pending quadword is loaded into r0.
 */
1654466b20cfcfa718ff515b9e3886749cc025e2005Paul MundtLoop_line:
1664466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldx.q r22, r36, r63	/* read the source 64 bytes ahead; value not used (presumably a prefetch) */
1674466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	alloco r22, 32	/* cache-block allocate for the address r22 + 32 - see the SH-5 manual for exact semantics */
1684466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	addi r22, 32, r22
1694466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldx.q r22, r19, r23	/* r23 = source quadword for destination r22-24 */
1704466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r22, -25, r0	/* together with the stlo.q at -32 below: store pending r0 at r22-32 */
1714466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldx.q r22, r20, r24	/* r24 = source quadword for destination r22-16 */
1724466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldx.q r22, r21, r25	/* r25 = source quadword for destination r22-8 */
1734466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r22, -32, r0
1744466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldx.q r22, r6,  r0	/* r0 = next pending quadword, for destination r22 */
1754466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r22, -17, r23
1764466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r22,  -9, r24
1774466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r22,  -1, r25
1784466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r22, -24, r23
1794466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r22, -16, r24
1804466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r22,  -8, r25
1814466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	bgeu r27, r22, tr0	/* repeat while r27 >= r22 (unsigned) */
1824466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
/*
 * Loop_ua: 8 bytes per iteration.  Stores the quadword pending in r0 at
 * the old r22 and loads the next one.  tr1 still targets Loop_ua here.
 */
1834466b20cfcfa718ff515b9e3886749cc025e2005Paul MundtLoop_ua:
1844466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	addi r22, 8, r22
1854466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r22, -1, r0
1864466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r22, -8, r0
1874466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldx.q r22, r6, r0
1884466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	bgtu/l r5, r22, tr1	/* repeat while destination + count - 16 > r22 (unsigned) */
1894466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
	/* Tail: store the pending quadword at r22, then the last 8 source bytes so that they end at destination + count - 1. */
1904466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	add r3, r4, r7	/* r7 = source + count */
1914466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldlo.q r7, -8, r1	/* r1 = lo part of the last 8 source bytes */
1924466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r22, 7, r0
1934466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ldhi.q r7, -1, r7	/* r7 = hi part of the last 8 source bytes */
1944466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	ptabs r18,tr1	/* tr1 = return target (it held Loop_ua until now) */
1954466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r22, 0, r0
1964466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	or r1, r7, r1
1974466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	sthi.q r5, 15, r1	/* r5 + 15 = destination + count - 1 */
1984466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	stlo.q r5, 8, r1	/* r5 + 8 = destination + count - 8 */
1994466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	blink tr1, r63	/* return; r2 still holds the destination pointer */
2004466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt
2014466b20cfcfa718ff515b9e3886749cc025e2005Paul Mundt	.size memcpy,.-memcpy
202