/*
   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.

   This file is subject to the terms and conditions of the GNU General Public
   License.  See the file "COPYING" in the main directory of this archive
   for more details.

   Tight version of memcpy for the case of just copying a page.
   Prefetch strategy empirically optimised against RTL simulations
   of SH5-101 cut2 eval chip with Cayman board DDR memory.

   Parameters:
   r2 : destination effective address (start of page)
   r3 : source effective address (start of page)

   Always copies 4096 bytes.

   Points to review.
   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
     It seems like the prefetch needs to be at least 4 lines ahead to get
     the data into the cache in time, and the allocos contend with outstanding
     prefetches for the same cache set, so it's better to have the numbers
     different.
   */

261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.section .text..SHmedia32,"ax"
271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.little
281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	.balign 8
30379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	.global copy_page
31379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundtcopy_page:
321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
33379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	/* Copy 4096 bytes worth of data from r3 to r2.
341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	   Do prefetches 4 lines ahead.
351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	   Do alloco 2 lines ahead */
361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	pta 1f, tr1
381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	pta 2f, tr2
391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	pta 3f, tr3
401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	ptabs r18, tr0
411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#if 0
431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	/* TAKum03020 */
44379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ld.q r3, 0x00, r63
45379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ld.q r3, 0x20, r63
46379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ld.q r3, 0x40, r63
47379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ld.q r3, 0x60, r63
481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#endif
49379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	alloco r2, 0x00
501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	synco		! TAKum03020
51379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	alloco r2, 0x20
521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	synco		! TAKum03020
531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	movi 3968, r6
55379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	add  r2, r6, r6
561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	addi r6, 64, r7
571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	addi r7, 64, r8
58379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	sub r3, r2, r60
591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	addi r60, 8, r61
601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	addi r61, 8, r62
611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	addi r62, 8, r23
621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	addi r60, 0x80, r22
631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds/* Minimal code size.  The extra branches inside the loop don't cost much
651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds   because they overlap with the time spent waiting for prefetches to
661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds   complete. */
671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1:
681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#if 0
691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	/* TAKum03020 */
70379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	bge/u r2, r6, tr2  ! skip prefetch for last 4 lines
71379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ldx.q r2, r22, r63 ! prefetch 4 lines hence
721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#endif
731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2:
74379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	bge/u r2, r7, tr3  ! skip alloco for last 2 lines
75379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	alloco r2, 0x40    ! alloc destination line 2 lines ahead
761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	synco		! TAKum03020
771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3:
78379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ldx.q r2, r60, r36
79379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ldx.q r2, r61, r37
80379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ldx.q r2, r62, r38
81379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	ldx.q r2, r23, r39
82379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	st.q  r2,   0, r36
83379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	st.q  r2,   8, r37
84379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	st.q  r2,  16, r38
85379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	st.q  r2,  24, r39
86379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	addi r2, 32, r2
87379a95d1d2c3e3682e380084c40b6fc01e38fa1fPaul Mundt	bgt/l r8, r2, tr1
881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds
891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds	blink tr0, r63	   ! return
90