/*
   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.

   This file is subject to the terms and conditions of the GNU General Public
   License.  See the file "COPYING" in the main directory of this archive
   for more details.

   Tight version of memcpy for the case of just copying a page.
   Prefetch strategy empirically optimised against RTL simulations
   of SH5-101 cut2 eval chip with Cayman board DDR memory.

   Parameters:
   r2 : destination effective address (start of page)
   r3 : source effective address (start of page)

   Always copies 4096 bytes, 32 bytes (one cache line, four quadwords)
   per loop iteration.

   Return address is taken from r18 (loaded into tr0 on entry).

   Registers written:
   r2          : advanced by 32 each iteration; equals destination + 4096
                 on return
   r6, r7, r8  : destination + 3968 / + 4032 / + 4096 (loop bounds)
   r60..r62,
   r23         : (source - destination) + 0 / 8 / 16 / 24, used as the
                 index operand so that r2 alone walks both pages
   r22         : (source - destination) + 0x80; only consumed by the
                 disabled prefetch below
   r36..r39    : data being copied
   tr0..tr3    : branch targets
   r3 is read but never modified.

   Points to review.
   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
     It seems like the prefetch needs to be at least 4 lines ahead to get
     the data into the cache in time, and the allocos contend with outstanding
     prefetches for the same cache set, so it's better to have the numbers
     different.
   * NOTE: both prefetch sequences below are currently compiled out with
     "#if 0" (tagged TAKum03020), so only the alloco half of the strategy
     described above is active.  The TAKum03020 tag is not explained in this
     file; presumably it identifies a hardware erratum - confirm before
     re-enabling the prefetches or removing the synco instructions.
   */

	.section .text..SHmedia32,"ax"
	.little

	.balign 8
	.global copy_page
copy_page:

	/* Copy 4096 bytes worth of data from r3 to r2.
	   Do prefetches 4 lines ahead (currently disabled, see #if 0).
	   Do alloco 2 lines ahead */

	/* Set up the branch targets: the three local loop labels and the
	   return address held in r18. */
	pta 1f, tr1
	pta 2f, tr2
	pta 3f, tr3
	ptabs r18, tr0

#if 0
	/* TAKum03020 */
	/* Disabled: touch the first four source lines (loads discarded
	   into r63) to prime the prefetch pipeline. */
	ld.q r3, 0x00, r63
	ld.q r3, 0x20, r63
	ld.q r3, 0x40, r63
	ld.q r3, 0x60, r63
#endif
	/* Allocate the first two destination lines up front; the loop
	   itself only ever allocates the line two ahead of r2. */
	alloco r2, 0x00
	synco		! TAKum03020
	alloco r2, 0x20
	synco		! TAKum03020

	movi 3968, r6
	add  r2, r6, r6		! r6 = dest + 4096 - 4 lines (prefetch limit)
	addi r6, 64, r7		! r7 = dest + 4096 - 2 lines (alloco limit)
	addi r7, 64, r8		! r8 = dest + 4096 (end of page)
	sub r3, r2, r60		! r60 = source - dest
	addi r60, 8, r61	! r61 = r60 + 8
	addi r61, 8, r62	! r62 = r60 + 16
	addi r62, 8, r23	! r23 = r60 + 24
	addi r60, 0x80, r22	! r22 = r60 + 4 lines (used only by the disabled prefetch)

/* Minimal code size.  The extra branches inside the loop don't cost much
   because they overlap with the time spent waiting for prefetches to
   complete. */
1:
#if 0
	/* TAKum03020 */
	bge/u r2, r6, tr2  ! skip prefetch for last 4 lines
	ldx.q r2, r22, r63 ! prefetch 4 lines hence
#endif
2:
	bge/u r2, r7, tr3  ! skip alloco for last 2 lines
	alloco r2, 0x40    ! alloc destination line 2 lines ahead
	synco		! TAKum03020
3:
	/* Load one full source line (r2 + r6x = source address) before
	   storing it, then advance to the next line. */
	ldx.q r2, r60, r36
	ldx.q r2, r61, r37
	ldx.q r2, r62, r38
	ldx.q r2, r23, r39
	st.q  r2,   0, r36
	st.q  r2,   8, r37
	st.q  r2,  16, r38
	st.q  r2,  24, r39
	addi r2, 32, r2
	bgt/l r8, r2, tr1  ! loop while r2 is below the end of the page

	blink tr0, r63	   ! return