/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>

#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)

        .text
        .fpu    neon

#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE     32
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE * 2)
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#endif

ENTRY(memcpy)
        .save       {r0, lr}
        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        stmfd       sp!, {r0, lr}
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

/* If Neon supports unaligned access then remove the align code,
 * unless a size limit has been specified.
 */
#ifndef NEON_UNALIGNED_ACCESS
        /* do we have at least 16 bytes to copy (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* check if buffers are aligned. If so, run arm-only version */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         11f

        /* align destination to cache-line for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         2f

        /* copy up to 15 bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

#ifdef HAVE_32_BYTE_CACHE_LINE
        /* make sure we have at least 32 bytes to copy */
        subs        r2, r2, #32
        blo         4f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
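        /* With PREFETCH_DISTANCE assumed to be two 32-byte cache lines
         * (64 bytes, matching the comment next to CACHE_LINE_SIZE above),
         * this single pld plus the one issued inside the loop keep the
         * prefetch stream roughly 64 bytes ahead of the loads.
         */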
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 32 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0, :128]!
        bhs         1b
#else
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need. */
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 3)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
#else
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
#endif
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0, :128]!
        vst1.8      {d4 - d7}, [r0, :128]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0, :128]!
        bhs         3b
#endif
4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!
5:      /* copy up to 15 bytes (count in r2) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}
        bx          lr

#else   /* NEON_UNALIGNED_ACCESS */

        // Check that we have at least 16 bytes; the alignment code below needs it.
        cmp         r2, #16
        blo         5f

#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
        /* Check the upper size limit for Neon unaligned memory access in memcpy */
#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
        cmp         r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
        blo         3f
#endif
        /* check if buffers are aligned. If so, run arm-only version */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         11f

        /* align destination to 16 bytes for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         3f

        /* copy up to 15 bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]
3:
#endif
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need */
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 3)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
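        /* Each iteration consumes 64 bytes, which spans two 32-byte cache
         * lines but only one 64-byte line, so the number of plds needed per
         * pass to keep the prefetch distance constant differs between the
         * two configurations below.
         */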
#ifdef HAVE_32_BYTE_CACHE_LINE
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
#else
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
#endif
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* copy up to 15 bytes (count in r2) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}
        bx          lr
#endif  /* NEON_UNALIGNED_ACCESS */

11:
        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4, r5, r6, r7, r8}
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Check alignment */
        rsb         r3, r1, #0
        ands        r3, #3
        beq         2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3          /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r5, [r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r5, [r0], #1
2:
        subs        r2, #32
        blt         5f
        pld         [r1, #(CACHE_LINE_SIZE * 3)]
3:      /* Main copy loop, copying 32 bytes at a time */
        pld         [r1, #(CACHE_LINE_SIZE * 4)]
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #32
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        bge         3b
5:      /* Handle any remaining bytes */
        adds        r2, #32
        beq         6f

        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        ldmmiia     r1!, {r7, r8}           /* 8 bytes */
        stmcsia     r0!, {r3, r4, r5, r6}
        stmmiia     r0!, {r7, r8}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /* 4 bytes */
        ldrmih      r4, [r1], #2            /* 2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /* last byte */
        strneb      r3, [r0]
6:
        ldmfd       sp!, {r4, r5, r6, r7, r8}
        ldmfd       sp!, {r0, pc}
END(memcpy)


#else   /* __ARM_ARCH__ < 7 */


        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
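        // Preloading the destination line means the sub-word aligning stores
        // below should hit in the cache (on a write-allocate cache) rather
        // than each missing; the two source plds, 32 bytes apart, prime the
        // read stream before the main loop.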
        pld         [r0, #0]
        pld         [r1, #0]
        pld         [r1, #32]

        /* it simplifies things to take care of len < 4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4 - (src & 3)) & 3 = -src & 3
         */
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3          /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12, [r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12, [r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /* 8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10, [r1], #4           /* 4 bytes */
        strne       r10, [r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here the destination is aligned to a 32-byte cache line and the
         * source is at least word-aligned.
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, {r4-r11}
        pld         [r12, #64]
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for the ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only way to get a SIGSEGV here
        // is if the caller overstates the length.
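        // On ARM9-class cores, where pld is only a hint and may do nothing,
        // the dummy load below is what actually touches the next cache line
        // and forces a linefill; the value loaded into r3 is discarded.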
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, {r4-r11}
        bhs         1b

        add         r2, r2, #32


less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (the code below takes about 16 cycles
         * when it does run)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /* 8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /* 4 bytes */
        ldrmih      r4, [r1], #2            /* 2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /* last byte */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3     /* r12 = right-shift amount */
        rsb         lr, r12, #32        /* lr  = left-shift amount */

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that the destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to a 50%
         * performance hit.
         */
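        /* r12 holds the right-shift amount in bits (8, 16, or 24) implied by
         * the destination's word misalignment; branch to the loop that has
         * that shift pair hard-coded as immediates.
         */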
        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8

loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #16
        mov         r4, r4, lsr #16
        orr         r4, r4, r5, lsl #16
        mov         r5, r5, lsr #16
        orr         r5, r5, r6, lsl #16
        mov         r6, r6, lsr #16
        orr         r6, r6, r7, lsl #16
        mov         r7, r7, lsr #16
        orr         r7, r7, r8, lsl #16
        mov         r8, r8, lsr #16
        orr         r8, r8, r9, lsl #16
        mov         r9, r9, lsr #16
        orr         r9, r9, r10, lsl #16
        mov         r10, r10, lsr #16
        orr         r10, r10, r11, lsl #16
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov         r3, r11, lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #24
        mov         r4, r4, lsr #8
        orr         r4, r4, r5, lsl #24
        mov         r5, r5, lsr #8
        orr         r5, r5, r6, lsl #24
        mov         r6, r6, lsr #8
        orr         r6, r6, r7, lsl #24
        mov         r7, r7, lsr #8
        orr         r7, r7, r8, lsl #24
        mov         r8, r8, lsr #8
        orr         r8, r8, r9, lsl #24
        mov         r9, r9, lsr #8
        orr         r9, r9, r10, lsl #24
        mov         r10, r10, lsr #8
        orr         r10, r10, r11, lsl #24
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov         r3, r11, lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld         [r1, #64]
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #8
        mov         r4, r4, lsr #24
        orr         r4, r4, r5, lsl #8
        mov         r5, r5, lsr #24
        orr         r5, r5, r6, lsl #8
        mov         r6, r6, lsr #24
        orr         r6, r6, r7, lsl #8
        mov         r7, r7, lsr #24
        orr         r7, r7, r8, lsl #8
        mov         r8, r8, lsr #24
        orr         r8, r8, r9, lsl #8
        mov         r9, r9, lsr #24
        orr         r9, r9, r10, lsl #8
        mov         r10, r10, lsr #24
        orr         r10, r10, r11, lsl #8
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov         r3, r11, lsr #24
        bhs         1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs        r5, lr, lsl #(31-3)
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        movs        r2, r2, lsl #31     /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12, [r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp, sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
END(memcpy)


#endif  /* __ARM_ARCH__ < 7 */