11da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds/*
21da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds * "memcpy" implementation of SuperH
31da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds *
41da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds * Copyright (C) 1999  Niibe Yutaka
51da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds * Copyright (c) 2002  STMicroelectronics Ltd
61da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds *   Modified from memcpy.S and micro-optimised for SH4
71da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds *   Stuart Menefy (stuart.menefy@st.com)
81da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds *
91da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds */
101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#include <linux/linkage.h>
111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds/*
131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds * void *memcpy(void *dst, const void *src, size_t n);
141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds *
151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds * It is assumed that there is no overlap between src and dst.
161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds * If there is an overlap, then the results are undefined.
171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds */
181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Size is 16 or greater, and may have trailing bytes
241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.balign	32
261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase1:
/*
 * Bulk copy for the case (src - dst) & 3 == 1, reached from the dispatch
 * code in memcpy.
 *
 * On entry:
 *   r0 = current destination pointer (already long word aligned); the copy
 *        runs downwards and every store pre-decrements r0
 *   r4 = start of the destination; the copy is complete when r0 == r4
 *   r5 = src - dst, so @(r0,r5) is the source byte matching destination r0
 *
 * Each destination long word is assembled from two neighbouring aligned
 * source long words: r7 (the higher one, kept from the previous iteration)
 * and r1 (the next lower one, freshly loaded).  r2 holds the loop limit and
 * r3/r6 are scratch, so the length passed in r6 is not preserved here.
 */
271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Read a long word and write a long word at once
281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! At the start of each iteration, r7 contains last long load
	/*
	 * r5 -= 1 makes r0+r5 long word aligned for the priming load into r7;
	 * the further r5 -= 4 points it at the next lower source long word.
	 * r2 = r4 + 7 is the limit tested by cmp/hi in the long word loop.
	 */
291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-1,r5		!  79 EX
301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r4,r2		!   5 MT (0 cycles latency)
311da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-4,r5		!  50 EX
341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#7,r2		!  79 EX
361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#ifdef CONFIG_CPU_LITTLE_ENDIAN
	/*
	 * Little endian: r3 = (r7 << 24) | (r1 >> 8), stored at --r0.
	 * The loop repeats while r0, as it was before the store, is above r4 + 7.
	 */
381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 6 cycles, 4 bytes per iteration
391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r7, r3		!   5 MT (latency=0)	! RQPO
411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r2,r0		!  57 MT
431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shll16	r3		! 103 EX
441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r1,r6		!   5 MT (latency=0)
461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shll8	r3		! 102 EX		! Oxxx
471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr8	r6		! 106 EX		! xNML
491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r1, r7		!   5 MT (latency=0)
501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	or	r6,r3		!  82 EX		! ONML
521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	3b		! 109 BR
531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r3,@-r0		!  30 LS
551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#else
	/*
	 * Big endian: r3 = (r7 >> 24) | (r1 << 8), stored at --r0.
	 * The loop repeats while r0, as it was before the store, is above r4 + 7.
	 */
561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r7,r3		!   5 MT (latency=0)	! OPQR
581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r2,r0		!  57 MT
601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr16	r3		! 107 EX
611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr8	r3		! 106 EX		! xxxO
631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r1,r6		!   5 MT (latency=0)
641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shll8	r6		! 102 EX		! LMNx
661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r1,r7		!   5 MT (latency=0)
671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	or	r6,r3		!  82 EX		! LMNO
691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	3b		! 109 BR
701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r3,@-r0		!  30 LS
721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#endif
731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Finally, copy a byte at once, if necessary
741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * r5 += 4 brings r5 to (src - dst) - 1, so @(r0,r5) is the source of the
	 * byte that the pre-decrement store writes.  Nothing is left when
	 * r0 == r4; otherwise bytes are copied while r0 > r4 + 1 (the new r2)
	 * before each store, i.e. until r0 has come down to r4.
	 */
751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#4,r5		!  50 EX
761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r4,r0		!  54 MT
771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-6,r2		!  50 EX
791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt	9f		! 109 BR
801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds8:	cmp/hi	r2,r0		!  57 MT
821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	8b		! 109 BR
851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.b	r1,@-r0		!  29 LS
871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Return to the caller of memcpy; r0 has been stepped down to r4. */
881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds9:	rts
891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Size is 16 or greater, and may have trailing bytes
971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.balign	32
991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase3:
/*
 * Bulk copy for the case (src - dst) & 3 == 3, reached from the dispatch
 * code in memcpy.
 *
 * On entry:
 *   r0 = current destination pointer (already long word aligned); the copy
 *        runs downwards and every store pre-decrements r0
 *   r4 = start of the destination; the copy is complete when r0 == r4
 *   r5 = src - dst, so @(r0,r5) is the source byte matching destination r0
 *
 * Each destination long word is assembled from two neighbouring aligned
 * source long words, with r7 carrying the higher one between iterations.
 * r2 holds the loop limit and r1/r3/r6 are scratch, so the length passed in
 * r6 is not preserved here.
 */
1001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Read a long word and write a long word at once
1011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! At the start of each iteration, r7 contains last long load
	/*
	 * r5 -= 3 makes r0+r5 long word aligned for the priming load into r7;
	 * the further r5 -= 4 points it at the next lower source long word.
	 * r2 = r4 + 7 is the limit tested by cmp/hi in the long word loop.
	 */
1021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-3,r5		! 79 EX
1031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r4,r2		!  5 MT (0 cycles latency)
1041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
1061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-4,r5		! 50 EX
1071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#7,r2		!  79 EX
1091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
1101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#ifdef CONFIG_CPU_LITTLE_ENDIAN
	/*
	 * Little endian: r3 = (r7 << 8) | (r1 >> 24), stored at --r0.
	 * The loop repeats while r0, as it was before the store, is above r4 + 7.
	 */
1111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 6 cycles, 4 bytes per iteration
1121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
1131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r7, r3		!   5 MT (latency=0)	! RQPO
1141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r2,r0		!  57 MT
1161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shll8	r3		! 102 EX		! QPOx
1171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r1,r6		!   5 MT (latency=0)
1191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr16	r6		! 107 EX
1201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr8	r6		! 106 EX		! xxxN
1221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r1, r7		!   5 MT (latency=0)
1231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	or	r6,r3		!  82 EX		! QPON
1251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	3b		! 109 BR
1261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r3,@-r0		!  30 LS
1281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#else
	/*
	 * Big endian: r3 = (old r7 >> 8) | (new r7 << 24), stored at --r0.
	 * The next source long word is loaded straight into r7 (r1 is not used
	 * in this variant) and r6 is the shifted copy of it.
	 */
129e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito3:	mov	r7,r3		! OPQR
1301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr8	r3		! xOPQ
131e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	mov.l	@(r0,r5),r7	! KLMN
132e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	mov	r7,r6
1331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shll16	r6
1341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shll8	r6		! Nxxx
1351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	or	r6,r3		! NOPQ
1361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r2,r0
1371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	3b
1381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r3,@-r0
1391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#endif
1401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Finally, copy a byte at once, if necessary
1421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * r5 += 6 brings r5 to (src - dst) - 1, so @(r0,r5) is the source of the
	 * byte that the pre-decrement store writes.  Nothing is left when
	 * r0 == r4; otherwise bytes are copied while r0 > r4 + 1 (the new r2)
	 * before each store, i.e. until r0 has come down to r4.
	 */
1431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#6,r5		!  50 EX
1441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r4,r0		!  54 MT
1451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-6,r2		!  50 EX
1471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt	9f		! 109 BR
1481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds8:	cmp/hi	r2,r0		!  57 MT
1501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
1511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	8b		! 109 BR
1531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.b	r1,@-r0		!  29 LS
1551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Return to the caller of memcpy; r0 has been stepped down to r4. */
1561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds9:	rts
1571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
1581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus TorvaldsENTRY(memcpy)
/*
 * Entry point.  The code below treats r4 as dst, r5 as src and r6 as the
 * byte count n.
 *
 * This prologue:
 *   - returns straight away (via 99:, with r0 = r4) when n == 0;
 *   - establishes the invariants used by all the copy fragments:
 *       r0 = r4 + r6   (one past the end of dst; the copy runs downwards)
 *       r5 = src - dst (so r0+r5 is the source address matching r0)
 *   - branches to .Lcase00 when dst, src and n are all multiples of 4;
 *   - otherwise falls through with r1 = 16, the small-copy threshold.
 */
1601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Calculate the invariants which will be used in the remainder
1621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! of the code:
1631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
1641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
1651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	         [ ...  ]                 [ ...  ]
1661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	           :                        :
1671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
1681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
1691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
1701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Short circuit the common case of src, dst and len being 32 bit aligned
1721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! and test for zero length move
1731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * r0 = n | dst | src: its low two bits are clear only when all three
	 * are long word aligned.  The alignment test sits in the delay slot of
	 * the zero-length branch; its result is consumed by the later
	 * "bt/s .Lcase00".
	 */
1741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r6, r0		!   5 MT (0 cycle latency)
1751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	or	r4, r0		!  82 EX
1761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	or	r5, r0		!  82 EX
1781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	tst	r6, r6		!  86 MT
1791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	99f		! 111 BR		(zero len)
1811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 tst	#3, r0		!  87 MT
1821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * r0 = dst + n.  The "sub r4, r5" in the delay slot below runs whether
	 * or not the branch is taken, so r5 = src - dst on both paths.
	 */
1831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r4, r0		!   5 MT (0 cycle latency)
1841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	r6, r0		!  49 EX
1851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	#16, r1		!   6 EX
1871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	.Lcase00	! 111 BR		(aligned)
1881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 sub	r4, r5		!  75 EX
1901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
1911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Arguments are not nicely long word aligned or zero len.
1921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Check for small copies, and if so do a simple byte at a time copy.
1931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
1941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Deciding on an exact value of 'small' is not easy, as the point at which
1951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! using the optimised routines becomes worthwhile varies (these are the
1961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! cycle counts for different sizes using byte-at-a-time vs. optimised):
1971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	size	byte-at-time	long	word	byte
1981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	16	42		39-40	46-50	50-55
1991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	24	58		43-44	54-58	62-67
2001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	36	82		49-50	66-70	80-85
2011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! However the penalty for getting it 'wrong' is much higher for long word
2021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! aligned data (and this is more common), so use a value of 16.
2031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
/*
 * Small copy (n < 16): copy downwards two bytes per iteration.
 *
 * On entry r0 = dst + n, r5 = src - dst, r6 = n (non-zero) and r1 = 16.
 * r5 is reduced by one on both paths, so that @(r0,r5) is the source of the
 * byte written by a pre-decrement store through r0; the "not small" path at
 * 6: relies on this as well.  r3 = r5 - 1 addresses the byte below that.
 * r6 becomes the iteration count n / 2; when n is odd one byte is copied
 * first, and n == 1 returns right after that byte (98:).
 */
2041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/gt	r6,r1		!  56 MT
2051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-1,r5		!  50 EX
2071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	6f		! 108 BR		(not small)
2081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov	r5, r3		!   5 MT (latency=0)
	/* shlr halves r6 and leaves the old bit 0 in T: T clear means n even. */
2101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr	r6		! 104 EX
2111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
2131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	4f		! 111 BR
2141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 add	#-1,r3		!  50 EX
	/* n odd: store the byte already loaded; finished if no pairs remain. */
2161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	tst	r6, r6		!  86 MT
2171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	98f		! 110 BR
2191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.b	r1,@-r0		!  29 LS
2201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 4 cycles, 2 bytes per iteration
2221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
2231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Even n enters here with r1 already loaded; dt r6 counts the pairs. */
2241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
2251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	dt	r6		!  67 EX
2261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.b	r1,@-r0		!  29 LS
2281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	3b		! 111 BR
2291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.b	r2,@-r0		!  29 LS
	/* 98: end of the small copy; r0 has been stepped back down to r4. */
2311da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds98:
2321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	rts
2331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
2341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* 99: zero-length exit taken from the prologue; returns dst (r4) in r0. */
2351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds99:	rts
2361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov	r4, r0
2371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Size is not small, so it's worthwhile looking for optimisations.
2391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! First align destination to a long word boundary.
2401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
2411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! r5 = normal value -1
2421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
/*
 * 6: reached when n >= 16 and dst, src and n are not all long word aligned.
 *
 * On entry r0 = dst + n, r5 = (src - dst) - 1, r6 = n.
 *
 * Step 1: copy r0 & 3 single bytes downwards (decrementing r6 for each) so
 *         that r0 becomes long word aligned, then restore r5 = src - dst.
 * Step 2: pick a bulk copy routine from the low two bits of src - dst and,
 *         when those bits are 00 or 10, from whether at least 64 bytes
 *         remain:
 *             bits 00: .Lcase0  (r6 < 64) or .Lcase0b (r6 >= 64)
 *             bits 10: .Lcase2  (r6 < 64) or .Lcase2b (r6 >= 64)
 *             bits 01: .Lcase1
 *             bits 11: .Lcase3
 *         .Lcase2 and .Lcase2b are not defined in this part of the file.
 */
2431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds6:	tst	#3, r0		!  87 MT
2441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds        mov	#3, r3		!   6 EX
2451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	2f		! 111 BR
2471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 and	r0,r3		!  78 EX
2481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* r3 = r0 & 3 is the number of bytes needed to align r0 downwards. */
2491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 3 cycles, 1 byte per iteration
2501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:	dt	r3		!  67 EX
2511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
2521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-1, r6		!  79 EX
2541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	1b		! 109 BR
2551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.b	r1,@-r0		!  28 LS
2571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2:	add	#1, r5		!  79 EX
2591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Now select the appropriate bulk transfer code based on relative
2611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! alignment of src and dst.
2621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * The destination pointer is parked in r3 while r0 holds src - dst for
	 * the "tst #imm, r0" bit tests.  It is copied back into r0 by the
	 * "mov r3, r0" in the delay slot of each bt/s below, so it is restored
	 * whichever way the branch goes.
	 */
2631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r0, r3		!   5 MT (latency=0)
2641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r5, r0		!   5 MT (latency=0)
2661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	tst	#1, r0		!  87 MT
2671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	1f		! 111 BR
2691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov	#64, r7		!   6 EX
2701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! bit 0 clear
2721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * T = (r6 >= 64) selects the "big" variants.  The delay slot then
	 * replaces T with the bit 1 test, which the next bt/s consumes.
	 */
2731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/ge	r7, r6		!  55 MT
2741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	2f		! 111 BR
2761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 tst	#2, r0		!  87 MT
2771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! small
2791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	.Lcase0
2801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov	r3, r0
2811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bra	.Lcase2
2831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
2841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! big
2861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2:	bt/s	.Lcase0b
2871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov	r3, r0
2881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bra	.Lcase2b
2901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
2911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! bit 0 set
2931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:	tst	#2, r0		! 87 MT
2941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	.Lcase1
2961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov	r3, r0
2971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
2981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bra	.Lcase3
2991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
3001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
3031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
3041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
3051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! src, dst and size are all long word aligned
3071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! size is non-zero
3081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.balign	32
3101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase00:
/*
 * Fast path taken from the memcpy prologue when dst, src and n are all
 * multiples of 4 and n is non-zero.
 *
 * On entry r0 = dst + n, r5 = src - dst, r6 = n.
 *
 * n >= 64 is handed to .Lcase00b with r5 already reduced by 4.  Otherwise
 * r6 is turned into the number of long word pairs (n / 8) and the data is
 * copied downwards two long words per iteration:
 *   r5 = (src - dst) - 4 addresses the source of the first store of a pair,
 *   r3 = (src - dst) - 8 addresses the source of the second store.
 * When the long word count is odd one long word is copied first, and a copy
 * of exactly one long word returns right after it (5:).
 */
3111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	#64, r1		!   6 EX
3121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r5, r3		!   5 MT (latency=0)
3131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/gt	r6, r1		!  56 MT
3151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-4, r5		!  50 EX
3161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf	.Lcase00b	! 108 BR		(big loop)
	/* r6 = n / 4 long words. */
3181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr2	r6		! 105 EX
3191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* r6 = number of pairs; T holds the old bit 0 (odd long word count). */
3201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shlr	r6		! 104 EX
3211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
3221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	4f		! 111 BR
3241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 add	#-8, r3		!  50 EX
3251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Odd count: store the long word already loaded; done if no pairs. */
3261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	tst	r6, r6		!  86 MT
3271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	5f		! 110 BR
3281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r1,@-r0		!  30 LS
3301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3311da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 4 cycles, 2 long words per iteration
3321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
3331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* An even count enters here with r1 already loaded; dt r6 counts pairs. */
3341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
3351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	dt	r6		!  67 EX
3361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r1, @-r0	!  30 LS
3381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	3b		! 109 BR
3391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r2, @-r0	!  30 LS
3411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Return to the caller of memcpy; r0 has been stepped down to r4. */
3421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds5:	rts
3431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
3441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Size is 16 or greater and less than 64, but may have trailing bytes
3471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.balign	32
3491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase0:
/*
 * Bulk copy for the case (src - dst) & 3 == 0 with fewer than 64 bytes
 * left, reached from the dispatch code in memcpy.
 *
 * On entry:
 *   r0 = current destination pointer (long word aligned); the copy runs
 *        downwards and every store pre-decrements r0
 *   r4 = start of the destination
 *   r5 = src - dst
 *   r6 = bytes still to copy (equal to r0 - r4)
 *
 * Long words are copied in pairs while r0 > r4 + 11 before the pair is
 * stored; bit 2 of r6 says whether one extra long word has to be copied
 * first so that an even number of long words remains.  The 0-3 bytes left
 * between r4 and the last long word are then copied one at a time.
 */
	/*
	 * r5 = (src - dst) - 4 and r3 = (src - dst) - 8 address the sources of
	 * the first and second store of each pair; r7 = r4 + 11 is the pair
	 * loop limit.
	 */
3501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-4, r5		!  50 EX
3511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r4, r7		!   5 MT (latency=0)
3521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
3541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	#4, r2		!   6 EX
3551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#11, r7		!  50 EX
3571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	tst	r2, r6		!  86 MT
3581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r5, r3		!   5 MT (latency=0)
3601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	4f		! 111 BR
3611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 add	#-4, r3		!  50 EX
	/* Bit 2 of r6 set: copy the single extra long word before the pairs. */
3631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r1,@-r0		!  30 LS
3641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 4 cycles, 2 long words per iteration
3661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
3671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
3691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r7, r0
3701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r1, @-r0	!  30 LS
3721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	3b		! 109 BR
3731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r2, @-r0	!  30 LS
3751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Copy the final 0-3 bytes
3771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * r5 += 3 brings r5 to (src - dst) - 1, the source of the byte written
	 * by a pre-decrement store.  Nothing is left when r0 == r4; otherwise
	 * bytes are copied while r0 > r4 + 1 (the new r7) before each store.
	 */
3781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#3,r5		!  50 EX
3791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r0, r4		!  54 MT
3811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-10, r7	!  50 EX
3821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt	9f		! 110 BR
3841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 3 cycles, 1 byte per iteration
3861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:	mov.b	@(r0,r5),r1	!  19 LS
3871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r7,r0		!  57 MT
3881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	1b		! 111 BR
3901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.b	r1,@-r0		!  28 LS
3911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Return to the caller of memcpy; r0 has been stepped down to r4. */
3921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds9:	rts
3931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
3941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
3951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Size is at least 64 bytes, so will be going round the big loop at least once.
3961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
3971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   r2 = rounded up r4
3981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   r3 = rounded down r0
3991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.balign	32
4011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase0b:
4021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-4, r5		!  50 EX
4031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
/*
 * .Lcase00b: backwards bulk copy using long-word (mov.l) accesses and a
 * 32-byte block loop.
 *
 * Register roles as used by the code below:
 *   r0 = current dst position; stores use @-r0, so it walks downwards
 *   r4 = lowest dst address; the copy is finished when r0 == r4
 *   r5 = value added to r0 to address the source; in this block it is
 *        biased by -4 so that @(r0,r5) is the long word stored by the
 *        next "mov.l Rx,@-r0"
 *   r2 = r4 rounded up to a 32-byte boundary (where the block loop stops)
 *   r3 = r0 rounded down to a 32-byte boundary (where the head loop stops)
 *
 * NOTE(review): the block loop at "2:" is entered unconditionally and
 * tests its exit condition at the bottom, so the dispatch code (outside
 * this view) presumably guarantees at least one full 32-byte block --
 * confirm against the callers of this label.
 */
4041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase00b:
4051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r0, r3		!   5 MT (latency=0)
4061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	#(~0x1f), r1	!   6 EX
4071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	and	r1, r3		!  78 EX
4091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r4, r2		!   5 MT (latency=0)
4101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r3, r0		!  54 MT
4121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#0x1f, r2	!  50 EX
4131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * If r0 is already 32-byte aligned (r0 == r3) go straight to the
	 * block loop; the delay slot finishes r2 = (r4 + 0x1f) & ~0x1f.
	 */
4141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	1f		! 110 BR
4151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 and	r1, r2		!  78 EX
4161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! copy initial words until cache line aligned
4181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * The first long word is preloaded into r1.  Bit 2 of r0 tells
	 * whether an even (bit clear) or odd (bit set) number of long words
	 * lies between r0 and the boundary in r3:
	 *   even: enter the two-words-per-iteration loop at "4:".
	 *   odd:  store the preloaded word first; if bits 3-4 of r0 are
	 *         also clear that single word was all that was needed and
	 *         control goes to the block loop, otherwise fall into "3:".
	 * r6 = r5 - 4 addresses the second long word of each pair, and r3 is
	 * advanced by 8 because the loop compares r0 with r3 before the two
	 * stores of the final iteration.
	 */
4191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
4201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	tst	#4, r0		!  87 MT
4211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r5, r6		!   5 MT (latency=0)
4231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-4, r6		!  50 EX
4241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	4f		! 111 BR
4261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 add	#8, r3		!  50 EX
4271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	tst	#0x18, r0	!  87 MT
4291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	1f		! 109 BR
4311da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r1,@-r0		!  30 LS
4321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 4 cycles, 2 long words per iteration
4341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
4351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
4371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r3, r0		!  54 MT
4381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r1, @-r0	!  30 LS
4401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	3b		! 109 BR
4411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r7, @-r0	!  30 LS
4431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Copy the cache line aligned blocks
4451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
4461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! In use: r0, r2, r4, r5
4471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Scratch: r1, r3, r6, r7
4481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
4491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! We could do this with the four scratch registers, but if src
4501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! and dest hit the same cache line, this will thrash, so make
4511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! use of additional registers.
4521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
4531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
4541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   r5:	 src (was r0+r5)
4551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   r1:	 dest (was r0)
4561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! this can be reversed at the end, so we don't need to save any extra
4571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! state.
4581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
	/*
	 * r8-r11 are pushed on the stack (r15) and popped again after the
	 * loop.  r5 becomes an absolute source address: r0 + r5 - 0x1c is
	 * the lowest long word of the 32-byte source block that matches the
	 * destination block ending at r0.
	 */
4591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:	mov.l	r8, @-r15	!  30 LS
4601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	r0, r5		!  49 EX
4611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r9, @-r15	!  30 LS
4631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r0, r1		!   5 MT (latency=0)
4641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r10, @-r15	!  30 LS
4661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-0x1c, r5	!  50 EX
4671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r11, @-r15	!  30 LS
4691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * Each iteration moves r1 down by 0x20, loads eight long words from
	 * the source block into r0, r3, r6-r11 and only then stores them,
	 * the first one with movca.l.  r5 also moves down by 0x20.  The loop
	 * ends when r1 reaches r2.
	 */
4701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 16 cycles, 32 bytes per iteration
4711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
4721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-0x20, r1	! 50 EX
4731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
4741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
4751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
4761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
4771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
4781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
4791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
4801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	movca.l	r0,@r1		! 40 LS (latency=3-7)
4811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r3,@(0x04,r1)	! 33 LS
4821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r6,@(0x08,r1)	! 33 LS
4831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r7,@(0x0c,r1)	! 33 LS
4841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r8,@(0x10,r1)	! 33 LS
4861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-0x20, r5	! 50 EX
4871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r9,@(0x14,r1)	! 33 LS
4891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r2,r1		! 54 MT
4901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r10,@(0x18,r1)	!  33 LS
4921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	2b		! 109 BR
4931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r11,@(0x1c,r1)	!  33 LS
4951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * Restore the r0/r5 convention (r0 = dst position, r5 = offset) and
	 * pop r11, r10, r9.  The pop of r8 at label "1:" sits in the delay
	 * slot of the rts and is also the target of the bf/s, so it runs on
	 * both paths: return when r0 == r4, otherwise continue with the
	 * trailing bytes.
	 */
4961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r1, r0		!   5 MT (latency=0)
4971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
4981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r15+, r11	!  15 LS
4991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	sub	r1, r5		!  75 EX
5001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r15+, r10	!  15 LS
5021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r4, r0		!  54 MT
5031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	1f		! 109 BR
5051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	 @r15+, r9	!  15 LS
5061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	rts
5081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:	 mov.l	@r15+, r8	!  15 LS
5091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	sub	r4, r1		!  75 EX		(len remaining)
5101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! number of trailing bytes is non-zero
5121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
5131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! invariants restored (r5 already decremented by 4)
5141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! also r1=num bytes remaining
5151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * Tail: r2 = 4, r7 = r4 + 11 (loop bound for the pair loop below).
	 * With fewer than 4 bytes left, go straight to the byte loop at "5:".
	 */
5161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	#4, r2		!   6 EX
5171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r4, r7		!   5 MT (latency=0)
5181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#0x1c, r5	!  50 EX		(back to -4)
5201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hs	r2, r1		!  58 MT
5211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	5f		! 108 BR
5231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 add	 #11, r7	!  50 EX
5241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * Preload one long word and test bit 2 of the remaining count: when
	 * it is clear the count of whole long words is even and the pair
	 * loop is entered at "4:" (r3 = r5 - 4 addresses the second word).
	 */
5251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
5261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	tst	r2, r1		!  86 MT
5271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r5, r3		!   5 MT (latency=0)
5291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	4f		! 111 BR
5301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5311da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 add	#-4, r3		!  50 EX
	/*
	 * NOTE(review): r1 and r2 are unchanged since the cmp/hs above, and
	 * this point is only reached when that test held (r1 >= 4), so the
	 * bt/s below looks always taken: one long word is stored in the
	 * delay slot and everything else on this path is finished by the
	 * byte loop at "5:".  The result is still correct; "3:" is then only
	 * reached through the backward branch of the loop.  Confirm whether
	 * a different condition was intended.
	 */
5321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hs	r2, r1		!  58 MT
5331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	5f		! 111 BR
5351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r6,@-r0		!  30 LS
5361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 4 cycles, 2 long words per iteration
5381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
5391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
5411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r7, r0
5421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r6, @-r0	!  30 LS
5441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	3b		! 109 BR
5451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r2, @-r0	!  30 LS
5471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Copy the final 0-3 bytes
5491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * r7 becomes r4 + 1 (it was r4 + 11) and serves as the bound of the
	 * byte loop.  The "bt 9f" has no delay slot, so "add #3,r5" (moving
	 * the r5 bias from -4 to -1 for byte addressing) only executes when
	 * bytes are actually left.
	 */
5501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds5:	cmp/eq	r0, r4		!  54 MT
5511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-10, r7	!  50 EX
5521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt	9f		! 110 BR
5541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#3,r5		!  50 EX
5551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 3 cycles, 1 byte per iteration
5571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:	mov.b	@(r0,r5),r1	!  19 LS
5581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r7,r0		!  57 MT
5591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	1b		! 111 BR
5611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.b	r1,@-r0		!  28 LS
5621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Common return point; .Lcase2b also branches back here ("9b"). */
5631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds9:	rts
5641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
5651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
5671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
5681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
5691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.balign	32
5711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase2:
5721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Size is 16 or greater and less than 64, but may have trailing bytes
5731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * Body of .Lcase2: backwards copy using 16-bit (mov.w) accesses, two
	 * short words per loop iteration.
	 *
	 *   r0 = current dst position (stores use @-r0)
	 *   r4 = lowest dst address
	 *   r5 = entry value of r5 minus 2; @(r0,r5) is the short word for
	 *        the next store
	 *   r6 = entry value of r5 minus 4; the short word after that
	 *   r2 = r4 + 7; the loop repeats while r0 (sampled before the two
	 *        stores of an iteration) is above this bound
	 *
	 * Whatever is left afterwards is finished by the shared tail at
	 * label "10:" further down in .Lcase2b, which expects r5 biased
	 * by -2.
	 */
5741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2:	mov	r5, r6		!   5 MT (latency=0)
5751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-2,r5		!  50 EX
5761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r4,r2		!   5 MT (latency=0)
5781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-4,r6		!  50 EX
5791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#7,r2		!  50 EX
	/* Load both short words first, then store them in descending order. */
5811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
5821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
5841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r2,r0		!  57 MT
5851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.w	r1,@-r0		!  29 LS
5871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	3b		! 111 BR
5881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.w	r3,@-r0		!  29 LS
5901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Hand over to the short-word/byte tail shared with .Lcase2b. */
5911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bra	10f
5921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 nop
5931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
5941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
/*
 * .Lcase2b: large backwards copy for the case where the source is read
 * with 16-bit granularity at the edges and re-assembled into long words
 * for the destination.  Phases, in order:
 *   1. copy short words until r0 (dst position) is 32-byte aligned;
 *   2. copy whole 32-byte blocks, merging shifted source long words with
 *      shll16/xtrct (separate little- and big-endian versions);
 *   3. "10:" copy remaining short words (also entered from .Lcase2);
 *   4. copy a final odd byte if one is left.
 *
 * Registers: r0 = dst position (stores use @-r0), r4 = lowest dst
 * address, r5 = offset added to r0 to address the source, biased by -2
 * in this block.  r8-r12 are saved on the stack around the block loop.
 */
5951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.balign	32
5961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase2b:
5971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Size is at least 64 bytes, so will be going round the big loop at least once.
5981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
5991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   r2 = rounded up r4
6001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   r3 = rounded down r0
6011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Both roundings are to a 32-byte boundary (mask ~0x1f in r1). */
6021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r0, r3		!   5 MT (latency=0)
6031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	#(~0x1f), r1	!   6 EX
6041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	and	r1, r3		!  78 EX
6061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r4, r2		!   5 MT (latency=0)
6071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r3, r0		!  54 MT
6091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#0x1f, r2	!  50 EX
6101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-2, r5		!  50 EX
6121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	1f		! 110 BR
6131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 and	r1, r2		!  78 EX
6141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Copy a short word one at a time until we are cache line aligned
6161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   Normal values: r0, r2, r3, r4
6171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   Unused: r1, r6, r7
6181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   Mod: r5 (=r5-2)
6191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
	/*
	 * r3 is advanced by 2 because the loop compares r0 with r3 before
	 * the store (in the delay slot) of the final iteration.
	 */
6201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#2, r3		!  50 EX
6211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
6231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r3,r0		!  54 MT
6241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	2b		! 111 BR
6261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.w	r1,@-r0		!  29 LS
6281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Copy the cache line aligned blocks
6301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
6311da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! In use: r0, r2, r4, r5 (=r5-2)
6321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Scratch: r1, r3, r6, r7
6331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
6341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! We could do this with the four scratch registers, but if src
6351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! and dest hit the same cache line, this will thrash, so make
6361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! use of additional registers.
6371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
6381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
6391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   r5:	 src (was r0+r5)
6401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!   r1:	 dest (was r0)
6411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! this can be reversed at the end, so we don't need to save any extra
6421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! state.
6431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
	/*
	 * r8-r12 are pushed on the stack (r15).  r5 becomes the absolute
	 * address r0 + r5 - 0x1e, i.e. the first byte of the 32-byte source
	 * span that matches the destination block ending at r0.
	 */
6441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:	mov.l	r8, @-r15	!  30 LS
6451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	r0, r5		!  49 EX
6461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r9, @-r15	!  30 LS
6481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r0, r1		!   5 MT (latency=0)
6491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r10, @-r15	!  30 LS
6511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-0x1e, r5	!  50 EX
6521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r11, @-r15	!  30 LS
6541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r12, @-r15	!  30 LS
6561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! 17 cycles, 32 bytes per iteration
6581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#ifdef CONFIG_CPU_LITTLE_ENDIAN
	/*
	 * Little-endian block loop.  Reads one short word, seven long words
	 * and one more short word with post-increment (32 bytes in total),
	 * then pairs neighbouring registers with xtrct, which leaves in its
	 * second operand the low half of the first operand above the high
	 * half of the second (see the byte diagrams on the right).  r5 ends
	 * each iteration 0x20 lower than it started (+0x20 from the loads,
	 * then -0x40).  The loop ends when r1 reaches r2.
	 */
6591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
6601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-0x20, r1	!  50 EX
6611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
6631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
6651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shll16	r0		! 103 EX			JI..
6661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r5+, r7	!  15 LS (latency=2)
6681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r3, r0		!  48 EX			LKJI
6691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r5+, r8	!  15 LS (latency=2)
6711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r6, r3		!  48 EX			PONM
6721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r5+, r9	!  15 LS (latency=2)
6741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r7, r6		!  48 EX
6751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r5+, r10	!  15 LS (latency=2)
6771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r8, r7		!  48 EX
6781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r5+, r11	!  15 LS (latency=2)
6801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r9, r8		!  48 EX
6811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.w	@r5+, r12	!  15 LS (latency=2)
6831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r10, r9		!  48 EX
6841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	movca.l	r0,@r1		!  40 LS (latency=3-7)
6861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r11, r10	!  48 EX
6871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r3, @(0x04,r1)	!  33 LS
6891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r12, r11	!  48 EX
6901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r6, @(0x08,r1)	!  33 LS
6921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r7, @(0x0c,r1)	!  33 LS
6941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r8, @(0x10,r1)	!  33 LS
6961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-0x40, r5	!  50 EX
6971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
6981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r9, @(0x14,r1)	!  33 LS
6991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r2,r1		!  54 MT
7001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	r10, @(0x18,r1)	!  33 LS
7021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	2b		! 109 BR
7031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	r11, @(0x1c,r1)	!  33 LS
7051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#else
	/*
	 * Big-endian block loop.  Same idea, but the source is read from
	 * the top of the 32-byte span downwards: the short word at offset
	 * 0x1e first, then (after r5 -= 2) long words at offsets 0x1c down
	 * to 0x00.  The movca.l store goes to the highest long word of the
	 * destination block (r1 - 4); r1 is then lowered by a further 0x1c
	 * and the remaining seven long words are stored at offsets 0x18
	 * down to 0x00.  r5 and r1 both end 0x20 lower per iteration.
	 */
7061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
7071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-2, r5		!  50 EX
7081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
7101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-4, r1		!  50 EX
7111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
7131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	shll16	r0		! 103 EX
7141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
7161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r3, r0		!  48 EX
7171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
7191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r6, r3		!  48 EX
7201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
7221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r7, r6		!  48 EX
7231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
7251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r8, r7		!  48 EX
7261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
7281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r9, r8		!  48 EX
7291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
730c7afb7e5cbc4baa781ec82731fc9fe9039efee22Nobuhiro Iwamatsu	mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
731c7afb7e5cbc4baa781ec82731fc9fe9039efee22Nobuhiro Iwamatsu    	xtrct	r10, r9		!  48 EX
7321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	movca.l	r0,@r1		!  40 LS (latency=3-7)
7341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#-0x1c, r1	!  50 EX
7351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
736e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	mov.l	r3, @(0x18,r1)	!  33 LS
7371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r11, r10	!  48 EX
7381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
739e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	mov.l	r6, @(0x14,r1)	!  33 LS
7401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	xtrct	r12, r11	!  48 EX
7411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
742e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	mov.l	r7, @(0x10,r1)	!  33 LS
7431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
744e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	mov.l	r8, @(0x0c,r1)	!  33 LS
745e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	add	#-0x1e, r5	!  50 EX
7461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
747e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	mov.l	r9, @(0x08,r1)	!  33 LS
7481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r2,r1		!  54 MT
7491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
750e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	mov.l	r10, @(0x04,r1)	!  33 LS
7511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	2b		! 109 BR
7521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
753e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito	 mov.l	r11, @(0x00,r1)	!  33 LS
7541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#endif
7551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/*
	 * Restore the r0/r5 convention and pop r12, r11, r10, r9.  The pop
	 * of r8 at label "1:" sits in the delay slot of the rts and is also
	 * the target of the bf/s, so it runs on both paths: return when
	 * r0 == r4, otherwise continue with the tail.
	 */
7561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r15+, r12
7571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov	r1, r0		!   5 MT (latency=0)
7581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r15+, r11	!  15 LS
7601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	sub	r1, r5		!  75 EX
7611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.l	@r15+, r10	!  15 LS
7631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r4, r0		!  54 MT
7641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	1f		! 109 BR
7661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.l	 @r15+, r9	!  15 LS
7671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	rts
7691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:	 mov.l	@r15+, r8	!  15 LS
7701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
	/* Undo the -0x1e applied before the block loop: r5 is biased by -2 again. */
7711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#0x1e, r5	!  50 EX
7721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Finish off a short word at a time
7741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! r5 must be invariant - 2
	/*
	 * Shared tail, also entered from .Lcase2 via "bra 10f".  With fewer
	 * than 2 bytes left (r0 <= r4 + 1) skip the short-word loop.  The
	 * delay slot raises the bound to r4 + 3 because the loop compares
	 * r0 before the store of each iteration.
	 */
7751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds10:	mov	r4,r2		!   5 MT (latency=0)
7761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	add	#1,r2		!  50 EX
7771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r2, r0		!  57 MT
7791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bf/s	1f		! 109 BR
7801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 add	#2, r2		!  50 EX
7821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:	mov.w	@(r0,r5),r1	!  20 LS
7841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/hi	r2,r0		!  57 MT
7851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	3b		! 109 BR
7871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.w	r1,@-r0		!  29 LS
7891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:
7901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
7911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	!
7921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	! Finally, copy the last byte if necessary
	/*
	 * If r0 == r4 nothing is left: return through the "rts; nop" at
	 * label "9:" in .Lcase00b.  The delay slot moves the r5 bias from
	 * -2 to -1 so the byte load below addresses the last source byte;
	 * its store executes in the delay slot of the final rts.
	 */
7931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	cmp/eq	r4,r0		!  54 MT
7941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	bt/s	9b
7951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 add	#1,r5
7961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	mov.b	@(r0,r5),r1
7971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	rts
7981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	 mov.b	r1,@-r0
7991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
800