/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
#include <linux/linkage.h>

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 *
 * Register usage, as established by the entry code below:
 *   r4 = dst, r5 = src, r6 = n on entry.
 *   r0 is set to dst + n and r5 is rewritten to (src - dst), so that
 *   @(r0,r5) addresses the source byte/long that belongs at destination
 *   address r0.  All copies run from the END of the buffers towards the
 *   start, storing with pre-decrement (@-r0), and stop once r0 has been
 *   walked back down to r4.  The zero-length exit loads r4 into r0
 *   explicitly.
 *
 * The trailing "NN XX" remarks on instructions are the original author's
 * per-instruction annotations (number, pipeline group, latency) and are
 * kept verbatim.  Instructions indented by one extra space sit in the
 * delay slot of the preceding branch.
 */

	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes
	!
	! Reached from the dispatch code when (src - dst) has bit 0 set and
	! bit 1 clear.  Each output long is assembled from two adjacent
	! source longs with shifts and an OR.

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy a byte at once, if necessary

	add	#4,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop


	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes
	!
	! Reached from the dispatch code when (src - dst) has both bit 0 and
	! bit 1 set.  Same scheme as .Lcase1 with the shift amounts swapped.

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	 mov.l	r3,@-r0
#endif

	! Finally, copy a byte at once, if necessary

	add	#6,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
	!                [ ...  ]                 [ ...  ]
	!                  :                        :
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move
	!
	! r0 = r6 | r4 | r5 collects the low bits of all three arguments so a
	! single "tst #3" (in the delay slot below) checks them together.

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines becomes worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)
	shlr	r6		! 104 EX

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR

	 add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR
	 mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS
98:
	rts
	 nop

	! Zero-length exit: return dst (r4) in r0.
99:	rts
	 mov	r4, r0

	! Size is not small, so it's worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	 and	r0,r3		!  78 EX

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS

2:	add	#1, r5		!  79 EX

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.
	!
	! r5 holds (src - dst) again here; its two low bits pick the case.
	! r0 is parked in r3 while r0 is used for the "tst #imm" tests and is
	! restored in the delay slot of each dispatch branch.
	!
	!   bit0 clear, bit1 clear : .Lcase0  (n < 64)  / .Lcase0b (n >= 64)
	!   bit0 clear, bit1 set   : .Lcase2  (n < 64)  / .Lcase2b (n >= 64)
	!   bit0 set,   bit1 clear : .Lcase1
	!   bit0 set,   bit1 set   : .Lcase3

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	 mov	r3, r0

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		!  87 MT

	bt/s	.Lcase1
	 mov	r3, r0

	bra	.Lcase3
	 nop


	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero
	!
	! Entered directly from the short-circuit test at ENTRY(memcpy) with
	! r0 = dst + n, r5 = src - dst and r1 = 16 (r1 is reloaded here).

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	 add	#-8, r3		!  50 EX

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

5:	rts
	 nop


	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX
	tst	r2, r6		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0
	!
	! Both roundings are to a 32-byte boundary (mask ~0x1f).

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! copy initial words until cache line aligned

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX

	bt/s	4f		! 111 BR
	 add	#8, r3		!  50 EX

	tst	#0x18, r0	!  87 MT

	bt/s	1f		! 109 BR
	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r7, @-r0	!  30 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
	! r8-r11 are pushed on the stack (r15) here before being used as the
	! additional copy registers.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	!  18 LS (latency=2)
	add	#-0x20, r1	!  50 EX
	mov.l	@(0x04,r5),r3	!  18 LS (latency=2)
	mov.l	@(0x08,r5),r6	!  18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	!  18 LS (latency=2)
	mov.l	@(0x10,r5),r8	!  18 LS (latency=2)
	mov.l	@(0x14,r5),r9	!  18 LS (latency=2)
	mov.l	@(0x18,r5),r10	!  18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	!  18 LS (latency=2)
	movca.l	r0,@r1		!  40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	!  33 LS
	mov.l	r6,@(0x08,r1)	!  33 LS
	mov.l	r7,@(0x0c,r1)	!  33 LS

	mov.l	r8,@(0x10,r1)	!  33 LS
	add	#-0x20, r5	!  50 EX

	mov.l	r9,@(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10,@(0x18,r1)	!
33 LS 4921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bf/s 2b ! 109 BR 4931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 4941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r11,@(0x1c,r1) ! 33 LS 4951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 4961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov r1, r0 ! 5 MT (latency=0) 4971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 4981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r15+, r11 ! 15 LS 4991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds sub r1, r5 ! 75 EX 5001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r15+, r10 ! 15 LS 5021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/eq r4, r0 ! 54 MT 5031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bf/s 1f ! 109 BR 5051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r15+, r9 ! 15 LS 5061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds rts 5081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1: mov.l @r15+, r8 ! 15 LS 5091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds sub r4, r1 ! 75 EX (len remaining) 5101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! number of trailing bytes is non-zero 5121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 5131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! invariants restored (r5 already decremented by 4) 5141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! also r1=num bytes remaining 5151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov #4, r2 ! 6 EX 5171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov r4, r7 ! 
5 MT (latency=0) 5181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #0x1c, r5 ! 50 EX (back to -4) 5201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/hs r2, r1 ! 58 MT 5211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bf/s 5f ! 108 BR 5231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #11, r7 ! 50 EX 5241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @(r0, r5), r6 ! 21 LS (latency=2) 5261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds tst r2, r1 ! 86 MT 5271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov r5, r3 ! 5 MT (latency=0) 5291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt/s 4f ! 111 BR 5301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5311da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-4, r3 ! 50 EX 5321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/hs r2, r1 ! 58 MT 5331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt/s 5f ! 111 BR 5351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r6,@-r0 ! 30 LS 5361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 4 cycles, 2 long words per iteration 5381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3: mov.l @(r0, r5), r6 ! 21 LS (latency=2) 5391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) 5411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/hi r7, r0 5421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r6, @-r0 ! 
30 LS 5441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt/s 3b ! 109 BR 5451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r2, @-r0 ! 30 LS 5471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Copy the final 0-3 bytes 5491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds5: cmp/eq r0, r4 ! 54 MT 5511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-10, r7 ! 50 EX 5521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt 9f ! 110 BR 5541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #3,r5 ! 50 EX 5551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 3 cycles, 1 byte per iteration 5571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1: mov.b @(r0,r5),r1 ! 19 LS 5581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/hi r7,r0 ! 57 MT 5591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt/s 1b ! 111 BR 5611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.b r1,@-r0 ! 28 LS 5621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds9: rts 5641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds nop 5651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 5671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. 5681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 
5691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds .balign 32 5711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase2: 5721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Size is 16 or greater and less than 64, but may have trailing bytes 5731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2: mov r5, r6 ! 5 MT (latency=0) 5751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-2,r5 ! 50 EX 5761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov r4,r2 ! 5 MT (latency=0) 5781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-4,r6 ! 50 EX 5791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #7,r2 ! 50 EX 5811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3: mov.w @(r0,r5),r1 ! 20 LS (latency=2) 5821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.w @(r0,r6),r3 ! 20 LS (latency=2) 5841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/hi r2,r0 ! 57 MT 5851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.w r1,@-r0 ! 29 LS 5871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt/s 3b ! 111 BR 5881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.w r3,@-r0 ! 
29 LS 5901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bra 10f 5921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds nop 5931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 5951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds .balign 32 5961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds.Lcase2b: 5971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Size is at least 64 bytes, so will be going round the big loop at least once. 5981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 5991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! r2 = rounded up r4 6001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! r3 = rounded down r0 6011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov r0, r3 ! 5 MT (latency=0) 6031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov #(~0x1f), r1 ! 6 EX 6041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds and r1, r3 ! 78 EX 6061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov r4, r2 ! 5 MT (latency=0) 6071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/eq r3, r0 ! 54 MT 6091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #0x1f, r2 ! 50 EX 6101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-2, r5 ! 50 EX 6121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt/s 1f ! 110 BR 6131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds and r1, r2 ! 78 EX 6141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Copy a short word one at a time until we are cache line aligned 6161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 
Normal values: r0, r2, r3, r4 6171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Unused: r1, r6, r7 6181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Mod: r5 (=r5-2) 6191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 6201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #2, r3 ! 50 EX 6211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2: mov.w @(r0,r5),r1 ! 20 LS (latency=2) 6231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/eq r3,r0 ! 54 MT 6241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bf/s 2b ! 111 BR 6261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.w r1,@-r0 ! 29 LS 6281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Copy the cache line aligned blocks 6301da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 6311da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! In use: r0, r2, r4, r5 (=r5-2) 6321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Scratch: r1, r3, r6, r7 6331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 6341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! We could do this with the four scratch registers, but if src 6351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! and dest hit the same cache line, this will thrash, so make 6361da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! use of additional registers. 6371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 6381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! We also need r0 as a temporary (for movca), so 'undo' the invariant: 6391da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! r5: src (was r0+r5) 6401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! r1: dest (was r0) 6411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 
this can be reversed at the end, so we don't need to save any extra 6421da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! state. 6431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 6441da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1: mov.l r8, @-r15 ! 30 LS 6451da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add r0, r5 ! 49 EX 6461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6471da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r9, @-r15 ! 30 LS 6481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov r0, r1 ! 5 MT (latency=0) 6491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6501da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r10, @-r15 ! 30 LS 6511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-0x1e, r5 ! 50 EX 6521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6531da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r11, @-r15 ! 30 LS 6541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r12, @-r15 ! 30 LS 6561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 17 cycles, 32 bytes per iteration 6581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#ifdef CONFIG_CPU_LITTLE_ENDIAN 6591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI 6601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-0x20, r1 ! 50 EX 6611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r5+, r3 ! 15 LS (latency=2) NMLK 6631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r5+, r6 ! 15 LS (latency=2) RQPO 6651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds shll16 r0 ! 103 EX JI.. 
6661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r5+, r7 ! 15 LS (latency=2) 6681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r3, r0 ! 48 EX LKJI 6691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r5+, r8 ! 15 LS (latency=2) 6711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r6, r3 ! 48 EX PONM 6721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r5+, r9 ! 15 LS (latency=2) 6741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r7, r6 ! 48 EX 6751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r5+, r10 ! 15 LS (latency=2) 6771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r8, r7 ! 48 EX 6781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r5+, r11 ! 15 LS (latency=2) 6801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r9, r8 ! 48 EX 6811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.w @r5+, r12 ! 15 LS (latency=2) 6831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r10, r9 ! 48 EX 6841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds movca.l r0,@r1 ! 40 LS (latency=3-7) 6861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r11, r10 ! 48 EX 6871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r3, @(0x04,r1) ! 33 LS 6891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r12, r11 ! 48 EX 6901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r6, @(0x08,r1) ! 
33 LS 6921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r7, @(0x0c,r1) ! 33 LS 6941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r8, @(0x10,r1) ! 33 LS 6961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-0x40, r5 ! 50 EX 6971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 6981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r9, @(0x14,r1) ! 33 LS 6991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/eq r2,r1 ! 54 MT 7001da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7011da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r10, @(0x18,r1) ! 33 LS 7021da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bf/s 2b ! 109 BR 7031da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7041da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l r11, @(0x1c,r1) ! 33 LS 7051da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#else 7061da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) 7071da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-2, r5 ! 50 EX 7081da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7091da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @(0x1c,r5), r3 ! 18 LS (latency=2) 7101da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-4, r1 ! 50 EX 7111da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7121da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @(0x18,r5), r6 ! 18 LS (latency=2) 7131da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds shll16 r0 ! 103 EX 7141da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7151da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @(0x14,r5), r7 ! 18 LS (latency=2) 7161da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r3, r0 ! 
48 EX 7171da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7181da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @(0x10,r5), r8 ! 18 LS (latency=2) 7191da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r6, r3 ! 48 EX 7201da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7211da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @(0x0c,r5), r9 ! 18 LS (latency=2) 7221da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r7, r6 ! 48 EX 7231da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7241da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @(0x08,r5), r10 ! 18 LS (latency=2) 7251da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r8, r7 ! 48 EX 7261da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7271da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @(0x04,r5), r11 ! 18 LS (latency=2) 7281da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r9, r8 ! 48 EX 7291da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 730c7afb7e5cbc4baa781ec82731fc9fe9039efee22Nobuhiro Iwamatsu mov.l @(0x00,r5), r12 ! 18 LS (latency=2) 731c7afb7e5cbc4baa781ec82731fc9fe9039efee22Nobuhiro Iwamatsu xtrct r10, r9 ! 48 EX 7321da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7331da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds movca.l r0,@r1 ! 40 LS (latency=3-7) 7341da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #-0x1c, r1 ! 50 EX 7351da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 736e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito mov.l r3, @(0x18,r1) ! 33 LS 7371da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r11, r10 ! 48 EX 7381da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 739e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito mov.l r6, @(0x14,r1) ! 33 LS 7401da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds xtrct r12, r11 ! 48 EX 7411da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 742e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito mov.l r7, @(0x10,r1) ! 
33 LS 7431da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 744e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito mov.l r8, @(0x0c,r1) ! 33 LS 745e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito add #-0x1e, r5 ! 50 EX 7461da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 747e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito mov.l r9, @(0x08,r1) ! 33 LS 7481da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/eq r2,r1 ! 54 MT 7491da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 750e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito mov.l r10, @(0x04,r1) ! 33 LS 7511da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bf/s 2b ! 109 BR 7521da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 753e08b954c9a140f2062649faec72514eb505f18c3Hideo Saito mov.l r11, @(0x00,r1) ! 33 LS 7541da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds#endif 7551da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7561da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r15+, r12 7571da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov r1, r0 ! 5 MT (latency=0) 7581da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7591da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r15+, r11 ! 15 LS 7601da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds sub r1, r5 ! 75 EX 7611da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7621da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r15+, r10 ! 15 LS 7631da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/eq r4, r0 ! 54 MT 7641da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7651da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bf/s 1f ! 109 BR 7661da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.l @r15+, r9 ! 15 LS 7671da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7681da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds rts 7691da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1: mov.l @r15+, r8 ! 
15 LS 7701da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7711da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #0x1e, r5 ! 50 EX 7721da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7731da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Finish off a short word at a time 7741da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! r5 must be invariant - 2 7751da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds10: mov r4,r2 ! 5 MT (latency=0) 7761da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #1,r2 ! 50 EX 7771da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7781da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/hi r2, r0 ! 57 MT 7791da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bf/s 1f ! 109 BR 7801da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7811da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #2, r2 ! 50 EX 7821da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7831da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds3: mov.w @(r0,r5),r1 ! 20 LS 7841da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/hi r2,r0 ! 57 MT 7851da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7861da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt/s 3b ! 109 BR 7871da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7881da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.w r1,@-r0 ! 29 LS 7891da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds1: 7901da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 7911da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! 7921da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds ! Finally, copy the last byte if necessary 7931da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds cmp/eq r4,r0 ! 
54 MT 7941da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds bt/s 9b 7951da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds add #1,r5 7961da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.b @(r0,r5),r1 7971da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds rts 7981da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds mov.b r1,@-r0 7991da177e4c3f41524e886b7f1b8a0c1fc7321cacLinus Torvalds 800