/* ssse3-memcpy-atom.S, revision bed110af26f947057fd7940ba383b6f562d2df97 */
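/*
 * Annotation added by the editor (not part of the upstream source): this
 * file implements an SSSE3-optimized memcpy for 32-bit x86, apparently
 * tuned for Intel Atom (per the file name and the .text.ssse3 section).
 * The same source also builds memmove and mempcpy variants when
 * USE_AS_MEMMOVE or USE_AS_MEMPCPY is defined; the MEMCPY macro names the
 * exported symbol.
 */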
/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.

  * Neither the name of Intel Corporation nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

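/*
 * Annotation (added, not upstream): the block below selects between a
 * PIC-friendly and a non-PIC calling sequence.  In shared/PIC builds the
 * jump tables hold table-relative offsets and %ebx must be preserved, so
 * dispatch is roughly
 *
 *     target = &table + table[index];    (relative entries, PIC)
 *
 * while the non-PIC build stores absolute addresses and can jump directly:
 *
 *     target = table[index];             (absolute entries)
 *
 * These two lines are only an illustrative sketch of the address math, not
 * code from this file.
 */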
#if (defined SHARED || defined __PIC__)
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B)	I - B

# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register containing the
   index into the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
    /* We first load PC into EBX.  */ \
	SETUP_PIC_REG(bx); \
    /* Get the address of the jump table.  */ \
	addl	$(TABLE - .), %ebx; \
    /* Get the entry and convert the relative offset to the \
       absolute address.  */ \
	addl	(%ebx, INDEX, SCALE), %ebx; \
    /* We loaded the jump table.  Go.  */ \
	jmp	*%ebx
#else

# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register containing the index into the
   jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	jmp	*TABLE(, INDEX, SCALE)
#endif

	.section .text.ssse3,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

#ifdef USE_AS_MEMMOVE
	cmp	%eax, %edx
	jb	L(copy_forward)
	je	L(fwd_write_0bytes)
	cmp	$32, %ecx
	jae	L(memmove_bwd)
	jmp	L(bk_write_less32bytes_2)

	.p2align 4
L(memmove_bwd):
	add	%ecx, %eax
	cmp	%eax, %edx
	movl	SRC(%esp), %eax
	jb	L(copy_backward)

L(copy_forward):
#endif
	cmp	$48, %ecx
	jae	L(48bytesormore)

L(fwd_write_less32bytes):
#ifndef USE_AS_MEMMOVE
	cmp	%dl, %al
	jb	L(bk_write)
#endif
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
#ifndef USE_AS_MEMMOVE
	.p2align 4
L(bk_write):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
#endif

	.p2align 4
L(48bytesormore):
#ifndef USE_AS_MEMMOVE
	movlpd	(%eax), %xmm0
	movlpd	8(%eax), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)
#else
	movdqu	(%eax), %xmm0
#endif
	PUSH (%edi)
	movl	%edx, %edi
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %edi
	add	%edi, %ecx
	sub	%edi, %eax

#ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_shared_cache_size_half, %ecx
# endif
#endif

	mov	%eax, %edi
	jae	L(large_page)
	and	$0xf, %edi
	jz	L(shl_0)
	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)

	.p2align 4
L(shl_0):
#ifdef USE_AS_MEMMOVE
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	xor	%edi, %edi
	cmp	$127, %ecx
	ja	L(shl_0_gobble)
	lea	-32(%ecx), %ecx

	.p2align 4
L(shl_0_loop):
	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi
	jb	L(shl_0_end)

	movdqa	(%eax, %edi), %xmm0
	movdqa	16(%eax, %edi), %xmm1
	sub	$32, %ecx
	movdqa	%xmm0, (%edx, %edi)
	movdqa	%xmm1, 16(%edx, %edi)
	lea	32(%edi), %edi

L(shl_0_end):
	lea	32(%ecx), %ecx
	add	%ecx, %edi
	add	%edi, %edx
	add	%edi, %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	CFI_PUSH (%edi)

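/*
 * Annotation (added, not upstream): on entry to L(shl_0_gobble) the
 * destination has been rounded up to a 16-byte boundary and, in this path,
 * the adjusted source is 16-byte aligned as well.  The remaining length is
 * compared against half the data cache size (DATA_CACHE_SIZE_HALF or
 * __x86_data_cache_size_half); copies at least that large take the
 * prefetching loop, smaller ones take the plain 128-bytes-per-iteration
 * cache loop.
 */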
	.p2align 4
L(shl_0_gobble):
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	POP (%edi)
	lea	-128(%ecx), %ecx
	jae	L(shl_0_gobble_mem_loop)

	.p2align 4
L(shl_0_gobble_cache_loop):
	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_cache_loop)
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_cache_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_cache_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_cache_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_cache_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_cache_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_cache_less_16bytes):
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

	.p2align 4
L(shl_0_gobble_mem_loop):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x280(%eax)
	prefetcht0 0x1c0(%edx)

	movdqa	(%eax), %xmm0
	movdqa	0x10(%eax), %xmm1
	movdqa	0x20(%eax), %xmm2
	movdqa	0x30(%eax), %xmm3
	movdqa	0x40(%eax), %xmm4
	movdqa	0x50(%eax), %xmm5
	movdqa	0x60(%eax), %xmm6
	movdqa	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax
	sub	$0x80, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	movdqa	%xmm2, 0x20(%edx)
	movdqa	%xmm3, 0x30(%edx)
	movdqa	%xmm4, 0x40(%edx)
	movdqa	%xmm5, 0x50(%edx)
	movdqa	%xmm6, 0x60(%edx)
	movdqa	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx

	jae	L(shl_0_gobble_mem_loop)
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(shl_0_mem_less_64bytes)

	movdqa	(%eax), %xmm0
	sub	$0x40, %ecx
	movdqa	0x10(%eax), %xmm1

	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)

	movdqa	0x20(%eax), %xmm0
	movdqa	0x30(%eax), %xmm1
	add	$0x40, %eax

	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm1, 0x30(%edx)
	add	$0x40, %edx

L(shl_0_mem_less_64bytes):
	cmp	$0x20, %ecx
	jb	L(shl_0_mem_less_32bytes)
	movdqa	(%eax), %xmm0
	sub	$0x20, %ecx
	movdqa	0x10(%eax), %xmm1
	add	$0x20, %eax
	movdqa	%xmm0, (%edx)
	movdqa	%xmm1, 0x10(%edx)
	add	$0x20, %edx

L(shl_0_mem_less_32bytes):
	cmp	$0x10, %ecx
	jb	L(shl_0_mem_less_16bytes)
	sub	$0x10, %ecx
	movdqa	(%eax), %xmm0
	add	$0x10, %eax
	movdqa	%xmm0, (%edx)
	add	$0x10, %edx

L(shl_0_mem_less_16bytes):
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)

	.p2align 4
L(shl_1):
#ifndef USE_AS_MEMMOVE
	movaps	-1(%eax), %xmm1
#else
	movl	DEST+4(%esp), %edi
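/*
 * Annotation (added, not upstream): L(shl_1) through L(shl_15) handle the
 * case where source and destination differ in their 16-byte alignment by N
 * bytes.  Each block reads 16-byte-aligned chunks starting at -N(%eax) and
 * stitches neighbouring chunks together with "palignr $N" before storing
 * aligned 16-byte blocks to the destination; the leftover tail is finished
 * through the forward-copy jump tables.
 */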
419 movaps -1(%eax), %xmm1 420 movdqu %xmm0, (%edi) 421#endif 422#ifdef DATA_CACHE_SIZE_HALF 423 cmp $DATA_CACHE_SIZE_HALF, %ecx 424#else 425# if (defined SHARED || defined __PIC__) 426 SETUP_PIC_REG(bx) 427 add $_GLOBAL_OFFSET_TABLE_, %ebx 428 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 429# else 430 cmp __x86_data_cache_size_half, %ecx 431# endif 432#endif 433 jb L(sh_1_no_prefetch) 434 435 lea -64(%ecx), %ecx 436 437 .p2align 4 438L(Shl1LoopStart): 439 prefetcht0 0x1c0(%eax) 440 prefetcht0 0x1c0(%edx) 441 movaps 15(%eax), %xmm2 442 movaps 31(%eax), %xmm3 443 movaps 47(%eax), %xmm4 444 movaps 63(%eax), %xmm5 445 movaps %xmm5, %xmm7 446 palignr $1, %xmm4, %xmm5 447 palignr $1, %xmm3, %xmm4 448 movaps %xmm5, 48(%edx) 449 palignr $1, %xmm2, %xmm3 450 lea 64(%eax), %eax 451 palignr $1, %xmm1, %xmm2 452 movaps %xmm4, 32(%edx) 453 movaps %xmm3, 16(%edx) 454 movaps %xmm7, %xmm1 455 movaps %xmm2, (%edx) 456 lea 64(%edx), %edx 457 sub $64, %ecx 458 ja L(Shl1LoopStart) 459 460L(Shl1LoopLeave): 461 add $32, %ecx 462 jle L(shl_end_0) 463 464 movaps 15(%eax), %xmm2 465 movaps 31(%eax), %xmm3 466 palignr $1, %xmm2, %xmm3 467 palignr $1, %xmm1, %xmm2 468 movaps %xmm2, (%edx) 469 movaps %xmm3, 16(%edx) 470 lea 32(%edx, %ecx), %edx 471 lea 32(%eax, %ecx), %eax 472 POP (%edi) 473 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 474 475 CFI_PUSH (%edi) 476 477 .p2align 4 478L(sh_1_no_prefetch): 479 lea -32(%ecx), %ecx 480 lea -1(%eax), %eax 481 xor %edi, %edi 482 483 .p2align 4 484L(sh_1_no_prefetch_loop): 485 movdqa 16(%eax, %edi), %xmm2 486 sub $32, %ecx 487 movdqa 32(%eax, %edi), %xmm3 488 movdqa %xmm3, %xmm4 489 palignr $1, %xmm2, %xmm3 490 palignr $1, %xmm1, %xmm2 491 lea 32(%edi), %edi 492 movdqa %xmm2, -32(%edx, %edi) 493 movdqa %xmm3, -16(%edx, %edi) 494 jb L(sh_1_end_no_prefetch_loop) 495 496 movdqa 16(%eax, %edi), %xmm2 497 sub $32, %ecx 498 movdqa 32(%eax, %edi), %xmm3 499 movdqa %xmm3, %xmm1 500 palignr $1, %xmm2, %xmm3 501 palignr $1, %xmm4, %xmm2 502 lea 32(%edi), %edi 503 movdqa %xmm2, -32(%edx, %edi) 504 movdqa %xmm3, -16(%edx, %edi) 505 jae L(sh_1_no_prefetch_loop) 506 507L(sh_1_end_no_prefetch_loop): 508 lea 32(%ecx), %ecx 509 add %ecx, %edi 510 add %edi, %edx 511 lea 1(%edi, %eax), %eax 512 POP (%edi) 513 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 514 515 CFI_PUSH (%edi) 516 517 .p2align 4 518L(shl_2): 519#ifndef USE_AS_MEMMOVE 520 movaps -2(%eax), %xmm1 521#else 522 movl DEST+4(%esp), %edi 523 movaps -2(%eax), %xmm1 524 movdqu %xmm0, (%edi) 525#endif 526#ifdef DATA_CACHE_SIZE_HALF 527 cmp $DATA_CACHE_SIZE_HALF, %ecx 528#else 529# if (defined SHARED || defined __PIC__) 530 SETUP_PIC_REG(bx) 531 add $_GLOBAL_OFFSET_TABLE_, %ebx 532 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 533# else 534 cmp __x86_data_cache_size_half, %ecx 535# endif 536#endif 537 jb L(sh_2_no_prefetch) 538 539 lea -64(%ecx), %ecx 540 541 .p2align 4 542L(Shl2LoopStart): 543 prefetcht0 0x1c0(%eax) 544 prefetcht0 0x1c0(%edx) 545 movaps 14(%eax), %xmm2 546 movaps 30(%eax), %xmm3 547 movaps 46(%eax), %xmm4 548 movaps 62(%eax), %xmm5 549 movaps %xmm5, %xmm7 550 palignr $2, %xmm4, %xmm5 551 palignr $2, %xmm3, %xmm4 552 movaps %xmm5, 48(%edx) 553 palignr $2, %xmm2, %xmm3 554 lea 64(%eax), %eax 555 palignr $2, %xmm1, %xmm2 556 movaps %xmm4, 32(%edx) 557 movaps %xmm3, 16(%edx) 558 movaps %xmm7, %xmm1 559 movaps %xmm2, (%edx) 560 lea 64(%edx), %edx 561 sub $64, %ecx 562 ja L(Shl2LoopStart) 563 564L(Shl2LoopLeave): 565 add $32, %ecx 566 jle L(shl_end_0) 567 568 movaps 14(%eax), %xmm2 569 movaps 
30(%eax), %xmm3 570 palignr $2, %xmm2, %xmm3 571 palignr $2, %xmm1, %xmm2 572 movaps %xmm2, (%edx) 573 movaps %xmm3, 16(%edx) 574 lea 32(%edx, %ecx), %edx 575 lea 32(%eax, %ecx), %eax 576 POP (%edi) 577 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 578 579 CFI_PUSH (%edi) 580 581 .p2align 4 582L(sh_2_no_prefetch): 583 lea -32(%ecx), %ecx 584 lea -2(%eax), %eax 585 xor %edi, %edi 586 587 .p2align 4 588L(sh_2_no_prefetch_loop): 589 movdqa 16(%eax, %edi), %xmm2 590 sub $32, %ecx 591 movdqa 32(%eax, %edi), %xmm3 592 movdqa %xmm3, %xmm4 593 palignr $2, %xmm2, %xmm3 594 palignr $2, %xmm1, %xmm2 595 lea 32(%edi), %edi 596 movdqa %xmm2, -32(%edx, %edi) 597 movdqa %xmm3, -16(%edx, %edi) 598 jb L(sh_2_end_no_prefetch_loop) 599 600 movdqa 16(%eax, %edi), %xmm2 601 sub $32, %ecx 602 movdqa 32(%eax, %edi), %xmm3 603 movdqa %xmm3, %xmm1 604 palignr $2, %xmm2, %xmm3 605 palignr $2, %xmm4, %xmm2 606 lea 32(%edi), %edi 607 movdqa %xmm2, -32(%edx, %edi) 608 movdqa %xmm3, -16(%edx, %edi) 609 jae L(sh_2_no_prefetch_loop) 610 611L(sh_2_end_no_prefetch_loop): 612 lea 32(%ecx), %ecx 613 add %ecx, %edi 614 add %edi, %edx 615 lea 2(%edi, %eax), %eax 616 POP (%edi) 617 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 618 619 CFI_PUSH (%edi) 620 621 .p2align 4 622L(shl_3): 623#ifndef USE_AS_MEMMOVE 624 movaps -3(%eax), %xmm1 625#else 626 movl DEST+4(%esp), %edi 627 movaps -3(%eax), %xmm1 628 movdqu %xmm0, (%edi) 629#endif 630#ifdef DATA_CACHE_SIZE_HALF 631 cmp $DATA_CACHE_SIZE_HALF, %ecx 632#else 633# if (defined SHARED || defined __PIC__) 634 SETUP_PIC_REG(bx) 635 add $_GLOBAL_OFFSET_TABLE_, %ebx 636 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 637# else 638 cmp __x86_data_cache_size_half, %ecx 639# endif 640#endif 641 jb L(sh_3_no_prefetch) 642 643 lea -64(%ecx), %ecx 644 645 .p2align 4 646L(Shl3LoopStart): 647 prefetcht0 0x1c0(%eax) 648 prefetcht0 0x1c0(%edx) 649 movaps 13(%eax), %xmm2 650 movaps 29(%eax), %xmm3 651 movaps 45(%eax), %xmm4 652 movaps 61(%eax), %xmm5 653 movaps %xmm5, %xmm7 654 palignr $3, %xmm4, %xmm5 655 palignr $3, %xmm3, %xmm4 656 movaps %xmm5, 48(%edx) 657 palignr $3, %xmm2, %xmm3 658 lea 64(%eax), %eax 659 palignr $3, %xmm1, %xmm2 660 movaps %xmm4, 32(%edx) 661 movaps %xmm3, 16(%edx) 662 movaps %xmm7, %xmm1 663 movaps %xmm2, (%edx) 664 lea 64(%edx), %edx 665 sub $64, %ecx 666 ja L(Shl3LoopStart) 667 668L(Shl3LoopLeave): 669 add $32, %ecx 670 jle L(shl_end_0) 671 672 movaps 13(%eax), %xmm2 673 movaps 29(%eax), %xmm3 674 palignr $3, %xmm2, %xmm3 675 palignr $3, %xmm1, %xmm2 676 movaps %xmm2, (%edx) 677 movaps %xmm3, 16(%edx) 678 lea 32(%edx, %ecx), %edx 679 lea 32(%eax, %ecx), %eax 680 POP (%edi) 681 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 682 683 CFI_PUSH (%edi) 684 685 .p2align 4 686L(sh_3_no_prefetch): 687 lea -32(%ecx), %ecx 688 lea -3(%eax), %eax 689 xor %edi, %edi 690 691 .p2align 4 692L(sh_3_no_prefetch_loop): 693 movdqa 16(%eax, %edi), %xmm2 694 sub $32, %ecx 695 movdqa 32(%eax, %edi), %xmm3 696 movdqa %xmm3, %xmm4 697 palignr $3, %xmm2, %xmm3 698 palignr $3, %xmm1, %xmm2 699 lea 32(%edi), %edi 700 movdqa %xmm2, -32(%edx, %edi) 701 movdqa %xmm3, -16(%edx, %edi) 702 703 jb L(sh_3_end_no_prefetch_loop) 704 705 movdqa 16(%eax, %edi), %xmm2 706 sub $32, %ecx 707 movdqa 32(%eax, %edi), %xmm3 708 movdqa %xmm3, %xmm1 709 palignr $3, %xmm2, %xmm3 710 palignr $3, %xmm4, %xmm2 711 lea 32(%edi), %edi 712 movdqa %xmm2, -32(%edx, %edi) 713 movdqa %xmm3, -16(%edx, %edi) 714 715 jae L(sh_3_no_prefetch_loop) 716 717L(sh_3_end_no_prefetch_loop): 718 lea 32(%ecx), %ecx 719 
add %ecx, %edi 720 add %edi, %edx 721 lea 3(%edi, %eax), %eax 722 POP (%edi) 723 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 724 725 CFI_PUSH (%edi) 726 727 .p2align 4 728L(shl_4): 729#ifndef USE_AS_MEMMOVE 730 movaps -4(%eax), %xmm1 731#else 732 movl DEST+4(%esp), %edi 733 movaps -4(%eax), %xmm1 734 movdqu %xmm0, (%edi) 735#endif 736#ifdef DATA_CACHE_SIZE_HALF 737 cmp $DATA_CACHE_SIZE_HALF, %ecx 738#else 739# if (defined SHARED || defined __PIC__) 740 SETUP_PIC_REG(bx) 741 add $_GLOBAL_OFFSET_TABLE_, %ebx 742 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 743# else 744 cmp __x86_data_cache_size_half, %ecx 745# endif 746#endif 747 jb L(sh_4_no_prefetch) 748 749 lea -64(%ecx), %ecx 750 751 .p2align 4 752L(Shl4LoopStart): 753 prefetcht0 0x1c0(%eax) 754 prefetcht0 0x1c0(%edx) 755 movaps 12(%eax), %xmm2 756 movaps 28(%eax), %xmm3 757 movaps 44(%eax), %xmm4 758 movaps 60(%eax), %xmm5 759 movaps %xmm5, %xmm7 760 palignr $4, %xmm4, %xmm5 761 palignr $4, %xmm3, %xmm4 762 movaps %xmm5, 48(%edx) 763 palignr $4, %xmm2, %xmm3 764 lea 64(%eax), %eax 765 palignr $4, %xmm1, %xmm2 766 movaps %xmm4, 32(%edx) 767 movaps %xmm3, 16(%edx) 768 movaps %xmm7, %xmm1 769 movaps %xmm2, (%edx) 770 lea 64(%edx), %edx 771 sub $64, %ecx 772 ja L(Shl4LoopStart) 773 774L(Shl4LoopLeave): 775 add $32, %ecx 776 jle L(shl_end_0) 777 778 movaps 12(%eax), %xmm2 779 movaps 28(%eax), %xmm3 780 palignr $4, %xmm2, %xmm3 781 palignr $4, %xmm1, %xmm2 782 movaps %xmm2, (%edx) 783 movaps %xmm3, 16(%edx) 784 lea 32(%edx, %ecx), %edx 785 lea 32(%eax, %ecx), %eax 786 POP (%edi) 787 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 788 789 CFI_PUSH (%edi) 790 791 .p2align 4 792L(sh_4_no_prefetch): 793 lea -32(%ecx), %ecx 794 lea -4(%eax), %eax 795 xor %edi, %edi 796 797 .p2align 4 798L(sh_4_no_prefetch_loop): 799 movdqa 16(%eax, %edi), %xmm2 800 sub $32, %ecx 801 movdqa 32(%eax, %edi), %xmm3 802 movdqa %xmm3, %xmm4 803 palignr $4, %xmm2, %xmm3 804 palignr $4, %xmm1, %xmm2 805 lea 32(%edi), %edi 806 movdqa %xmm2, -32(%edx, %edi) 807 movdqa %xmm3, -16(%edx, %edi) 808 809 jb L(sh_4_end_no_prefetch_loop) 810 811 movdqa 16(%eax, %edi), %xmm2 812 sub $32, %ecx 813 movdqa 32(%eax, %edi), %xmm3 814 movdqa %xmm3, %xmm1 815 palignr $4, %xmm2, %xmm3 816 palignr $4, %xmm4, %xmm2 817 lea 32(%edi), %edi 818 movdqa %xmm2, -32(%edx, %edi) 819 movdqa %xmm3, -16(%edx, %edi) 820 821 jae L(sh_4_no_prefetch_loop) 822 823L(sh_4_end_no_prefetch_loop): 824 lea 32(%ecx), %ecx 825 add %ecx, %edi 826 add %edi, %edx 827 lea 4(%edi, %eax), %eax 828 POP (%edi) 829 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 830 831 CFI_PUSH (%edi) 832 833 .p2align 4 834L(shl_5): 835#ifndef USE_AS_MEMMOVE 836 movaps -5(%eax), %xmm1 837#else 838 movl DEST+4(%esp), %edi 839 movaps -5(%eax), %xmm1 840 movdqu %xmm0, (%edi) 841#endif 842#ifdef DATA_CACHE_SIZE_HALF 843 cmp $DATA_CACHE_SIZE_HALF, %ecx 844#else 845# if (defined SHARED || defined __PIC__) 846 SETUP_PIC_REG(bx) 847 add $_GLOBAL_OFFSET_TABLE_, %ebx 848 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 849# else 850 cmp __x86_data_cache_size_half, %ecx 851# endif 852#endif 853 jb L(sh_5_no_prefetch) 854 855 lea -64(%ecx), %ecx 856 857 .p2align 4 858L(Shl5LoopStart): 859 prefetcht0 0x1c0(%eax) 860 prefetcht0 0x1c0(%edx) 861 movaps 11(%eax), %xmm2 862 movaps 27(%eax), %xmm3 863 movaps 43(%eax), %xmm4 864 movaps 59(%eax), %xmm5 865 movaps %xmm5, %xmm7 866 palignr $5, %xmm4, %xmm5 867 palignr $5, %xmm3, %xmm4 868 movaps %xmm5, 48(%edx) 869 palignr $5, %xmm2, %xmm3 870 lea 64(%eax), %eax 871 palignr $5, %xmm1, 
%xmm2 872 movaps %xmm4, 32(%edx) 873 movaps %xmm3, 16(%edx) 874 movaps %xmm7, %xmm1 875 movaps %xmm2, (%edx) 876 lea 64(%edx), %edx 877 sub $64, %ecx 878 ja L(Shl5LoopStart) 879 880L(Shl5LoopLeave): 881 add $32, %ecx 882 jle L(shl_end_0) 883 884 movaps 11(%eax), %xmm2 885 movaps 27(%eax), %xmm3 886 palignr $5, %xmm2, %xmm3 887 palignr $5, %xmm1, %xmm2 888 movaps %xmm2, (%edx) 889 movaps %xmm3, 16(%edx) 890 lea 32(%edx, %ecx), %edx 891 lea 32(%eax, %ecx), %eax 892 POP (%edi) 893 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 894 895 CFI_PUSH (%edi) 896 897 .p2align 4 898L(sh_5_no_prefetch): 899 lea -32(%ecx), %ecx 900 lea -5(%eax), %eax 901 xor %edi, %edi 902 903 .p2align 4 904L(sh_5_no_prefetch_loop): 905 movdqa 16(%eax, %edi), %xmm2 906 sub $32, %ecx 907 movdqa 32(%eax, %edi), %xmm3 908 movdqa %xmm3, %xmm4 909 palignr $5, %xmm2, %xmm3 910 palignr $5, %xmm1, %xmm2 911 lea 32(%edi), %edi 912 movdqa %xmm2, -32(%edx, %edi) 913 movdqa %xmm3, -16(%edx, %edi) 914 915 jb L(sh_5_end_no_prefetch_loop) 916 917 movdqa 16(%eax, %edi), %xmm2 918 sub $32, %ecx 919 movdqa 32(%eax, %edi), %xmm3 920 movdqa %xmm3, %xmm1 921 palignr $5, %xmm2, %xmm3 922 palignr $5, %xmm4, %xmm2 923 lea 32(%edi), %edi 924 movdqa %xmm2, -32(%edx, %edi) 925 movdqa %xmm3, -16(%edx, %edi) 926 927 jae L(sh_5_no_prefetch_loop) 928 929L(sh_5_end_no_prefetch_loop): 930 lea 32(%ecx), %ecx 931 add %ecx, %edi 932 add %edi, %edx 933 lea 5(%edi, %eax), %eax 934 POP (%edi) 935 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 936 937 CFI_PUSH (%edi) 938 939 .p2align 4 940L(shl_6): 941#ifndef USE_AS_MEMMOVE 942 movaps -6(%eax), %xmm1 943#else 944 movl DEST+4(%esp), %edi 945 movaps -6(%eax), %xmm1 946 movdqu %xmm0, (%edi) 947#endif 948#ifdef DATA_CACHE_SIZE_HALF 949 cmp $DATA_CACHE_SIZE_HALF, %ecx 950#else 951# if (defined SHARED || defined __PIC__) 952 SETUP_PIC_REG(bx) 953 add $_GLOBAL_OFFSET_TABLE_, %ebx 954 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 955# else 956 cmp __x86_data_cache_size_half, %ecx 957# endif 958#endif 959 jb L(sh_6_no_prefetch) 960 961 lea -64(%ecx), %ecx 962 963 .p2align 4 964L(Shl6LoopStart): 965 prefetcht0 0x1c0(%eax) 966 prefetcht0 0x1c0(%edx) 967 movaps 10(%eax), %xmm2 968 movaps 26(%eax), %xmm3 969 movaps 42(%eax), %xmm4 970 movaps 58(%eax), %xmm5 971 movaps %xmm5, %xmm7 972 palignr $6, %xmm4, %xmm5 973 palignr $6, %xmm3, %xmm4 974 movaps %xmm5, 48(%edx) 975 palignr $6, %xmm2, %xmm3 976 lea 64(%eax), %eax 977 palignr $6, %xmm1, %xmm2 978 movaps %xmm4, 32(%edx) 979 movaps %xmm3, 16(%edx) 980 movaps %xmm7, %xmm1 981 movaps %xmm2, (%edx) 982 lea 64(%edx), %edx 983 sub $64, %ecx 984 ja L(Shl6LoopStart) 985 986L(Shl6LoopLeave): 987 add $32, %ecx 988 jle L(shl_end_0) 989 990 movaps 10(%eax), %xmm2 991 movaps 26(%eax), %xmm3 992 palignr $6, %xmm2, %xmm3 993 palignr $6, %xmm1, %xmm2 994 movaps %xmm2, (%edx) 995 movaps %xmm3, 16(%edx) 996 lea 32(%edx, %ecx), %edx 997 lea 32(%eax, %ecx), %eax 998 POP (%edi) 999 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1000 1001 CFI_PUSH (%edi) 1002 1003 .p2align 4 1004L(sh_6_no_prefetch): 1005 lea -32(%ecx), %ecx 1006 lea -6(%eax), %eax 1007 xor %edi, %edi 1008 1009 .p2align 4 1010L(sh_6_no_prefetch_loop): 1011 movdqa 16(%eax, %edi), %xmm2 1012 sub $32, %ecx 1013 movdqa 32(%eax, %edi), %xmm3 1014 movdqa %xmm3, %xmm4 1015 palignr $6, %xmm2, %xmm3 1016 palignr $6, %xmm1, %xmm2 1017 lea 32(%edi), %edi 1018 movdqa %xmm2, -32(%edx, %edi) 1019 movdqa %xmm3, -16(%edx, %edi) 1020 1021 jb L(sh_6_end_no_prefetch_loop) 1022 1023 movdqa 16(%eax, %edi), %xmm2 1024 sub 
$32, %ecx 1025 movdqa 32(%eax, %edi), %xmm3 1026 movdqa %xmm3, %xmm1 1027 palignr $6, %xmm2, %xmm3 1028 palignr $6, %xmm4, %xmm2 1029 lea 32(%edi), %edi 1030 movdqa %xmm2, -32(%edx, %edi) 1031 movdqa %xmm3, -16(%edx, %edi) 1032 1033 jae L(sh_6_no_prefetch_loop) 1034 1035L(sh_6_end_no_prefetch_loop): 1036 lea 32(%ecx), %ecx 1037 add %ecx, %edi 1038 add %edi, %edx 1039 lea 6(%edi, %eax), %eax 1040 POP (%edi) 1041 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1042 1043 CFI_PUSH (%edi) 1044 1045 .p2align 4 1046L(shl_7): 1047#ifndef USE_AS_MEMMOVE 1048 movaps -7(%eax), %xmm1 1049#else 1050 movl DEST+4(%esp), %edi 1051 movaps -7(%eax), %xmm1 1052 movdqu %xmm0, (%edi) 1053#endif 1054#ifdef DATA_CACHE_SIZE_HALF 1055 cmp $DATA_CACHE_SIZE_HALF, %ecx 1056#else 1057# if (defined SHARED || defined __PIC__) 1058 SETUP_PIC_REG(bx) 1059 add $_GLOBAL_OFFSET_TABLE_, %ebx 1060 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1061# else 1062 cmp __x86_data_cache_size_half, %ecx 1063# endif 1064#endif 1065 jb L(sh_7_no_prefetch) 1066 1067 lea -64(%ecx), %ecx 1068 1069 .p2align 4 1070L(Shl7LoopStart): 1071 prefetcht0 0x1c0(%eax) 1072 prefetcht0 0x1c0(%edx) 1073 movaps 9(%eax), %xmm2 1074 movaps 25(%eax), %xmm3 1075 movaps 41(%eax), %xmm4 1076 movaps 57(%eax), %xmm5 1077 movaps %xmm5, %xmm7 1078 palignr $7, %xmm4, %xmm5 1079 palignr $7, %xmm3, %xmm4 1080 movaps %xmm5, 48(%edx) 1081 palignr $7, %xmm2, %xmm3 1082 lea 64(%eax), %eax 1083 palignr $7, %xmm1, %xmm2 1084 movaps %xmm4, 32(%edx) 1085 movaps %xmm3, 16(%edx) 1086 movaps %xmm7, %xmm1 1087 movaps %xmm2, (%edx) 1088 lea 64(%edx), %edx 1089 sub $64, %ecx 1090 ja L(Shl7LoopStart) 1091 1092L(Shl7LoopLeave): 1093 add $32, %ecx 1094 jle L(shl_end_0) 1095 1096 movaps 9(%eax), %xmm2 1097 movaps 25(%eax), %xmm3 1098 palignr $7, %xmm2, %xmm3 1099 palignr $7, %xmm1, %xmm2 1100 movaps %xmm2, (%edx) 1101 movaps %xmm3, 16(%edx) 1102 lea 32(%edx, %ecx), %edx 1103 lea 32(%eax, %ecx), %eax 1104 POP (%edi) 1105 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1106 1107 CFI_PUSH (%edi) 1108 1109 .p2align 4 1110L(sh_7_no_prefetch): 1111 lea -32(%ecx), %ecx 1112 lea -7(%eax), %eax 1113 xor %edi, %edi 1114 1115 .p2align 4 1116L(sh_7_no_prefetch_loop): 1117 movdqa 16(%eax, %edi), %xmm2 1118 sub $32, %ecx 1119 movdqa 32(%eax, %edi), %xmm3 1120 movdqa %xmm3, %xmm4 1121 palignr $7, %xmm2, %xmm3 1122 palignr $7, %xmm1, %xmm2 1123 lea 32(%edi), %edi 1124 movdqa %xmm2, -32(%edx, %edi) 1125 movdqa %xmm3, -16(%edx, %edi) 1126 jb L(sh_7_end_no_prefetch_loop) 1127 1128 movdqa 16(%eax, %edi), %xmm2 1129 sub $32, %ecx 1130 movdqa 32(%eax, %edi), %xmm3 1131 movdqa %xmm3, %xmm1 1132 palignr $7, %xmm2, %xmm3 1133 palignr $7, %xmm4, %xmm2 1134 lea 32(%edi), %edi 1135 movdqa %xmm2, -32(%edx, %edi) 1136 movdqa %xmm3, -16(%edx, %edi) 1137 jae L(sh_7_no_prefetch_loop) 1138 1139L(sh_7_end_no_prefetch_loop): 1140 lea 32(%ecx), %ecx 1141 add %ecx, %edi 1142 add %edi, %edx 1143 lea 7(%edi, %eax), %eax 1144 POP (%edi) 1145 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1146 1147 CFI_PUSH (%edi) 1148 1149 .p2align 4 1150L(shl_8): 1151#ifndef USE_AS_MEMMOVE 1152 movaps -8(%eax), %xmm1 1153#else 1154 movl DEST+4(%esp), %edi 1155 movaps -8(%eax), %xmm1 1156 movdqu %xmm0, (%edi) 1157#endif 1158#ifdef DATA_CACHE_SIZE_HALF 1159 cmp $DATA_CACHE_SIZE_HALF, %ecx 1160#else 1161# if (defined SHARED || defined __PIC__) 1162 SETUP_PIC_REG(bx) 1163 add $_GLOBAL_OFFSET_TABLE_, %ebx 1164 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1165# else 1166 cmp __x86_data_cache_size_half, %ecx 1167# endif 
1168#endif 1169 jb L(sh_8_no_prefetch) 1170 1171 lea -64(%ecx), %ecx 1172 1173 .p2align 4 1174L(Shl8LoopStart): 1175 prefetcht0 0x1c0(%eax) 1176 prefetcht0 0x1c0(%edx) 1177 movaps 8(%eax), %xmm2 1178 movaps 24(%eax), %xmm3 1179 movaps 40(%eax), %xmm4 1180 movaps 56(%eax), %xmm5 1181 movaps %xmm5, %xmm7 1182 palignr $8, %xmm4, %xmm5 1183 palignr $8, %xmm3, %xmm4 1184 movaps %xmm5, 48(%edx) 1185 palignr $8, %xmm2, %xmm3 1186 lea 64(%eax), %eax 1187 palignr $8, %xmm1, %xmm2 1188 movaps %xmm4, 32(%edx) 1189 movaps %xmm3, 16(%edx) 1190 movaps %xmm7, %xmm1 1191 movaps %xmm2, (%edx) 1192 lea 64(%edx), %edx 1193 sub $64, %ecx 1194 ja L(Shl8LoopStart) 1195 1196L(LoopLeave8): 1197 add $32, %ecx 1198 jle L(shl_end_0) 1199 1200 movaps 8(%eax), %xmm2 1201 movaps 24(%eax), %xmm3 1202 palignr $8, %xmm2, %xmm3 1203 palignr $8, %xmm1, %xmm2 1204 movaps %xmm2, (%edx) 1205 movaps %xmm3, 16(%edx) 1206 lea 32(%edx, %ecx), %edx 1207 lea 32(%eax, %ecx), %eax 1208 POP (%edi) 1209 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1210 1211 CFI_PUSH (%edi) 1212 1213 .p2align 4 1214L(sh_8_no_prefetch): 1215 lea -32(%ecx), %ecx 1216 lea -8(%eax), %eax 1217 xor %edi, %edi 1218 1219 .p2align 4 1220L(sh_8_no_prefetch_loop): 1221 movdqa 16(%eax, %edi), %xmm2 1222 sub $32, %ecx 1223 movdqa 32(%eax, %edi), %xmm3 1224 movdqa %xmm3, %xmm4 1225 palignr $8, %xmm2, %xmm3 1226 palignr $8, %xmm1, %xmm2 1227 lea 32(%edi), %edi 1228 movdqa %xmm2, -32(%edx, %edi) 1229 movdqa %xmm3, -16(%edx, %edi) 1230 jb L(sh_8_end_no_prefetch_loop) 1231 1232 movdqa 16(%eax, %edi), %xmm2 1233 sub $32, %ecx 1234 movdqa 32(%eax, %edi), %xmm3 1235 movdqa %xmm3, %xmm1 1236 palignr $8, %xmm2, %xmm3 1237 palignr $8, %xmm4, %xmm2 1238 lea 32(%edi), %edi 1239 movdqa %xmm2, -32(%edx, %edi) 1240 movdqa %xmm3, -16(%edx, %edi) 1241 jae L(sh_8_no_prefetch_loop) 1242 1243L(sh_8_end_no_prefetch_loop): 1244 lea 32(%ecx), %ecx 1245 add %ecx, %edi 1246 add %edi, %edx 1247 lea 8(%edi, %eax), %eax 1248 POP (%edi) 1249 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1250 1251 CFI_PUSH (%edi) 1252 1253 .p2align 4 1254L(shl_9): 1255#ifndef USE_AS_MEMMOVE 1256 movaps -9(%eax), %xmm1 1257#else 1258 movl DEST+4(%esp), %edi 1259 movaps -9(%eax), %xmm1 1260 movdqu %xmm0, (%edi) 1261#endif 1262#ifdef DATA_CACHE_SIZE_HALF 1263 cmp $DATA_CACHE_SIZE_HALF, %ecx 1264#else 1265# if (defined SHARED || defined __PIC__) 1266 SETUP_PIC_REG(bx) 1267 add $_GLOBAL_OFFSET_TABLE_, %ebx 1268 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1269# else 1270 cmp __x86_data_cache_size_half, %ecx 1271# endif 1272#endif 1273 jb L(sh_9_no_prefetch) 1274 1275 lea -64(%ecx), %ecx 1276 1277 .p2align 4 1278L(Shl9LoopStart): 1279 prefetcht0 0x1c0(%eax) 1280 prefetcht0 0x1c0(%edx) 1281 movaps 7(%eax), %xmm2 1282 movaps 23(%eax), %xmm3 1283 movaps 39(%eax), %xmm4 1284 movaps 55(%eax), %xmm5 1285 movaps %xmm5, %xmm7 1286 palignr $9, %xmm4, %xmm5 1287 palignr $9, %xmm3, %xmm4 1288 movaps %xmm5, 48(%edx) 1289 palignr $9, %xmm2, %xmm3 1290 lea 64(%eax), %eax 1291 palignr $9, %xmm1, %xmm2 1292 movaps %xmm4, 32(%edx) 1293 movaps %xmm3, 16(%edx) 1294 movaps %xmm7, %xmm1 1295 movaps %xmm2, (%edx) 1296 lea 64(%edx), %edx 1297 sub $64, %ecx 1298 ja L(Shl9LoopStart) 1299 1300L(Shl9LoopLeave): 1301 add $32, %ecx 1302 jle L(shl_end_0) 1303 1304 movaps 7(%eax), %xmm2 1305 movaps 23(%eax), %xmm3 1306 palignr $9, %xmm2, %xmm3 1307 palignr $9, %xmm1, %xmm2 1308 1309 movaps %xmm2, (%edx) 1310 movaps %xmm3, 16(%edx) 1311 lea 32(%edx, %ecx), %edx 1312 lea 32(%eax, %ecx), %eax 1313 POP (%edi) 1314 
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1315 1316 CFI_PUSH (%edi) 1317 1318 .p2align 4 1319L(sh_9_no_prefetch): 1320 lea -32(%ecx), %ecx 1321 lea -9(%eax), %eax 1322 xor %edi, %edi 1323 1324 .p2align 4 1325L(sh_9_no_prefetch_loop): 1326 movdqa 16(%eax, %edi), %xmm2 1327 sub $32, %ecx 1328 movdqa 32(%eax, %edi), %xmm3 1329 movdqa %xmm3, %xmm4 1330 palignr $9, %xmm2, %xmm3 1331 palignr $9, %xmm1, %xmm2 1332 lea 32(%edi), %edi 1333 movdqa %xmm2, -32(%edx, %edi) 1334 movdqa %xmm3, -16(%edx, %edi) 1335 jb L(sh_9_end_no_prefetch_loop) 1336 1337 movdqa 16(%eax, %edi), %xmm2 1338 sub $32, %ecx 1339 movdqa 32(%eax, %edi), %xmm3 1340 movdqa %xmm3, %xmm1 1341 palignr $9, %xmm2, %xmm3 1342 palignr $9, %xmm4, %xmm2 1343 lea 32(%edi), %edi 1344 movdqa %xmm2, -32(%edx, %edi) 1345 movdqa %xmm3, -16(%edx, %edi) 1346 jae L(sh_9_no_prefetch_loop) 1347 1348L(sh_9_end_no_prefetch_loop): 1349 lea 32(%ecx), %ecx 1350 add %ecx, %edi 1351 add %edi, %edx 1352 lea 9(%edi, %eax), %eax 1353 POP (%edi) 1354 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1355 1356 CFI_PUSH (%edi) 1357 1358 .p2align 4 1359L(shl_10): 1360#ifndef USE_AS_MEMMOVE 1361 movaps -10(%eax), %xmm1 1362#else 1363 movl DEST+4(%esp), %edi 1364 movaps -10(%eax), %xmm1 1365 movdqu %xmm0, (%edi) 1366#endif 1367#ifdef DATA_CACHE_SIZE_HALF 1368 cmp $DATA_CACHE_SIZE_HALF, %ecx 1369#else 1370# if (defined SHARED || defined __PIC__) 1371 SETUP_PIC_REG(bx) 1372 add $_GLOBAL_OFFSET_TABLE_, %ebx 1373 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1374# else 1375 cmp __x86_data_cache_size_half, %ecx 1376# endif 1377#endif 1378 jb L(sh_10_no_prefetch) 1379 1380 lea -64(%ecx), %ecx 1381 1382 .p2align 4 1383L(Shl10LoopStart): 1384 prefetcht0 0x1c0(%eax) 1385 prefetcht0 0x1c0(%edx) 1386 movaps 6(%eax), %xmm2 1387 movaps 22(%eax), %xmm3 1388 movaps 38(%eax), %xmm4 1389 movaps 54(%eax), %xmm5 1390 movaps %xmm5, %xmm7 1391 palignr $10, %xmm4, %xmm5 1392 palignr $10, %xmm3, %xmm4 1393 movaps %xmm5, 48(%edx) 1394 palignr $10, %xmm2, %xmm3 1395 lea 64(%eax), %eax 1396 palignr $10, %xmm1, %xmm2 1397 movaps %xmm4, 32(%edx) 1398 movaps %xmm3, 16(%edx) 1399 movaps %xmm7, %xmm1 1400 movaps %xmm2, (%edx) 1401 lea 64(%edx), %edx 1402 sub $64, %ecx 1403 ja L(Shl10LoopStart) 1404 1405L(Shl10LoopLeave): 1406 add $32, %ecx 1407 jle L(shl_end_0) 1408 1409 movaps 6(%eax), %xmm2 1410 movaps 22(%eax), %xmm3 1411 palignr $10, %xmm2, %xmm3 1412 palignr $10, %xmm1, %xmm2 1413 1414 movaps %xmm2, (%edx) 1415 movaps %xmm3, 16(%edx) 1416 lea 32(%edx, %ecx), %edx 1417 lea 32(%eax, %ecx), %eax 1418 POP (%edi) 1419 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1420 1421 CFI_PUSH (%edi) 1422 1423 .p2align 4 1424L(sh_10_no_prefetch): 1425 lea -32(%ecx), %ecx 1426 lea -10(%eax), %eax 1427 xor %edi, %edi 1428 1429 .p2align 4 1430L(sh_10_no_prefetch_loop): 1431 movdqa 16(%eax, %edi), %xmm2 1432 sub $32, %ecx 1433 movdqa 32(%eax, %edi), %xmm3 1434 movdqa %xmm3, %xmm4 1435 palignr $10, %xmm2, %xmm3 1436 palignr $10, %xmm1, %xmm2 1437 lea 32(%edi), %edi 1438 movdqa %xmm2, -32(%edx, %edi) 1439 movdqa %xmm3, -16(%edx, %edi) 1440 jb L(sh_10_end_no_prefetch_loop) 1441 1442 movdqa 16(%eax, %edi), %xmm2 1443 sub $32, %ecx 1444 movdqa 32(%eax, %edi), %xmm3 1445 movdqa %xmm3, %xmm1 1446 palignr $10, %xmm2, %xmm3 1447 palignr $10, %xmm4, %xmm2 1448 lea 32(%edi), %edi 1449 movdqa %xmm2, -32(%edx, %edi) 1450 movdqa %xmm3, -16(%edx, %edi) 1451 jae L(sh_10_no_prefetch_loop) 1452 1453L(sh_10_end_no_prefetch_loop): 1454 lea 32(%ecx), %ecx 1455 add %ecx, %edi 1456 add %edi, %edx 1457 lea 
10(%edi, %eax), %eax 1458 POP (%edi) 1459 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1460 1461 CFI_PUSH (%edi) 1462 1463 .p2align 4 1464L(shl_11): 1465#ifndef USE_AS_MEMMOVE 1466 movaps -11(%eax), %xmm1 1467#else 1468 movl DEST+4(%esp), %edi 1469 movaps -11(%eax), %xmm1 1470 movdqu %xmm0, (%edi) 1471#endif 1472#ifdef DATA_CACHE_SIZE_HALF 1473 cmp $DATA_CACHE_SIZE_HALF, %ecx 1474#else 1475# if (defined SHARED || defined __PIC__) 1476 SETUP_PIC_REG(bx) 1477 add $_GLOBAL_OFFSET_TABLE_, %ebx 1478 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1479# else 1480 cmp __x86_data_cache_size_half, %ecx 1481# endif 1482#endif 1483 jb L(sh_11_no_prefetch) 1484 1485 lea -64(%ecx), %ecx 1486 1487 .p2align 4 1488L(Shl11LoopStart): 1489 prefetcht0 0x1c0(%eax) 1490 prefetcht0 0x1c0(%edx) 1491 movaps 5(%eax), %xmm2 1492 movaps 21(%eax), %xmm3 1493 movaps 37(%eax), %xmm4 1494 movaps 53(%eax), %xmm5 1495 movaps %xmm5, %xmm7 1496 palignr $11, %xmm4, %xmm5 1497 palignr $11, %xmm3, %xmm4 1498 movaps %xmm5, 48(%edx) 1499 palignr $11, %xmm2, %xmm3 1500 lea 64(%eax), %eax 1501 palignr $11, %xmm1, %xmm2 1502 movaps %xmm4, 32(%edx) 1503 movaps %xmm3, 16(%edx) 1504 movaps %xmm7, %xmm1 1505 movaps %xmm2, (%edx) 1506 lea 64(%edx), %edx 1507 sub $64, %ecx 1508 ja L(Shl11LoopStart) 1509 1510L(Shl11LoopLeave): 1511 add $32, %ecx 1512 jle L(shl_end_0) 1513 1514 movaps 5(%eax), %xmm2 1515 movaps 21(%eax), %xmm3 1516 palignr $11, %xmm2, %xmm3 1517 palignr $11, %xmm1, %xmm2 1518 1519 movaps %xmm2, (%edx) 1520 movaps %xmm3, 16(%edx) 1521 lea 32(%edx, %ecx), %edx 1522 lea 32(%eax, %ecx), %eax 1523 POP (%edi) 1524 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1525 1526 CFI_PUSH (%edi) 1527 1528 .p2align 4 1529L(sh_11_no_prefetch): 1530 lea -32(%ecx), %ecx 1531 lea -11(%eax), %eax 1532 xor %edi, %edi 1533 1534 .p2align 4 1535L(sh_11_no_prefetch_loop): 1536 movdqa 16(%eax, %edi), %xmm2 1537 sub $32, %ecx 1538 movdqa 32(%eax, %edi), %xmm3 1539 movdqa %xmm3, %xmm4 1540 palignr $11, %xmm2, %xmm3 1541 palignr $11, %xmm1, %xmm2 1542 lea 32(%edi), %edi 1543 movdqa %xmm2, -32(%edx, %edi) 1544 movdqa %xmm3, -16(%edx, %edi) 1545 jb L(sh_11_end_no_prefetch_loop) 1546 1547 movdqa 16(%eax, %edi), %xmm2 1548 sub $32, %ecx 1549 movdqa 32(%eax, %edi), %xmm3 1550 movdqa %xmm3, %xmm1 1551 palignr $11, %xmm2, %xmm3 1552 palignr $11, %xmm4, %xmm2 1553 lea 32(%edi), %edi 1554 movdqa %xmm2, -32(%edx, %edi) 1555 movdqa %xmm3, -16(%edx, %edi) 1556 jae L(sh_11_no_prefetch_loop) 1557 1558L(sh_11_end_no_prefetch_loop): 1559 lea 32(%ecx), %ecx 1560 add %ecx, %edi 1561 add %edi, %edx 1562 lea 11(%edi, %eax), %eax 1563 POP (%edi) 1564 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1565 1566 CFI_PUSH (%edi) 1567 1568 .p2align 4 1569L(shl_12): 1570#ifndef USE_AS_MEMMOVE 1571 movaps -12(%eax), %xmm1 1572#else 1573 movl DEST+4(%esp), %edi 1574 movaps -12(%eax), %xmm1 1575 movdqu %xmm0, (%edi) 1576#endif 1577#ifdef DATA_CACHE_SIZE_HALF 1578 cmp $DATA_CACHE_SIZE_HALF, %ecx 1579#else 1580# if (defined SHARED || defined __PIC__) 1581 SETUP_PIC_REG(bx) 1582 add $_GLOBAL_OFFSET_TABLE_, %ebx 1583 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1584# else 1585 cmp __x86_data_cache_size_half, %ecx 1586# endif 1587#endif 1588 jb L(sh_12_no_prefetch) 1589 1590 lea -64(%ecx), %ecx 1591 1592 .p2align 4 1593L(Shl12LoopStart): 1594 prefetcht0 0x1c0(%eax) 1595 prefetcht0 0x1c0(%edx) 1596 movaps 4(%eax), %xmm2 1597 movaps 20(%eax), %xmm3 1598 movaps 36(%eax), %xmm4 1599 movaps 52(%eax), %xmm5 1600 movaps %xmm5, %xmm7 1601 palignr $12, %xmm4, %xmm5 1602 
palignr $12, %xmm3, %xmm4 1603 movaps %xmm5, 48(%edx) 1604 palignr $12, %xmm2, %xmm3 1605 lea 64(%eax), %eax 1606 palignr $12, %xmm1, %xmm2 1607 movaps %xmm4, 32(%edx) 1608 movaps %xmm3, 16(%edx) 1609 movaps %xmm7, %xmm1 1610 movaps %xmm2, (%edx) 1611 lea 64(%edx), %edx 1612 sub $64, %ecx 1613 ja L(Shl12LoopStart) 1614 1615L(Shl12LoopLeave): 1616 add $32, %ecx 1617 jle L(shl_end_0) 1618 1619 movaps 4(%eax), %xmm2 1620 movaps 20(%eax), %xmm3 1621 palignr $12, %xmm2, %xmm3 1622 palignr $12, %xmm1, %xmm2 1623 1624 movaps %xmm2, (%edx) 1625 movaps %xmm3, 16(%edx) 1626 lea 32(%edx, %ecx), %edx 1627 lea 32(%eax, %ecx), %eax 1628 POP (%edi) 1629 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1630 1631 CFI_PUSH (%edi) 1632 1633 .p2align 4 1634L(sh_12_no_prefetch): 1635 lea -32(%ecx), %ecx 1636 lea -12(%eax), %eax 1637 xor %edi, %edi 1638 1639 .p2align 4 1640L(sh_12_no_prefetch_loop): 1641 movdqa 16(%eax, %edi), %xmm2 1642 sub $32, %ecx 1643 movdqa 32(%eax, %edi), %xmm3 1644 movdqa %xmm3, %xmm4 1645 palignr $12, %xmm2, %xmm3 1646 palignr $12, %xmm1, %xmm2 1647 lea 32(%edi), %edi 1648 movdqa %xmm2, -32(%edx, %edi) 1649 movdqa %xmm3, -16(%edx, %edi) 1650 jb L(sh_12_end_no_prefetch_loop) 1651 1652 movdqa 16(%eax, %edi), %xmm2 1653 sub $32, %ecx 1654 movdqa 32(%eax, %edi), %xmm3 1655 movdqa %xmm3, %xmm1 1656 palignr $12, %xmm2, %xmm3 1657 palignr $12, %xmm4, %xmm2 1658 lea 32(%edi), %edi 1659 movdqa %xmm2, -32(%edx, %edi) 1660 movdqa %xmm3, -16(%edx, %edi) 1661 jae L(sh_12_no_prefetch_loop) 1662 1663L(sh_12_end_no_prefetch_loop): 1664 lea 32(%ecx), %ecx 1665 add %ecx, %edi 1666 add %edi, %edx 1667 lea 12(%edi, %eax), %eax 1668 POP (%edi) 1669 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1670 1671 CFI_PUSH (%edi) 1672 1673 .p2align 4 1674L(shl_13): 1675#ifndef USE_AS_MEMMOVE 1676 movaps -13(%eax), %xmm1 1677#else 1678 movl DEST+4(%esp), %edi 1679 movaps -13(%eax), %xmm1 1680 movdqu %xmm0, (%edi) 1681#endif 1682#ifdef DATA_CACHE_SIZE_HALF 1683 cmp $DATA_CACHE_SIZE_HALF, %ecx 1684#else 1685# if (defined SHARED || defined __PIC__) 1686 SETUP_PIC_REG(bx) 1687 add $_GLOBAL_OFFSET_TABLE_, %ebx 1688 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1689# else 1690 cmp __x86_data_cache_size_half, %ecx 1691# endif 1692#endif 1693 jb L(sh_13_no_prefetch) 1694 1695 lea -64(%ecx), %ecx 1696 1697 .p2align 4 1698L(Shl13LoopStart): 1699 prefetcht0 0x1c0(%eax) 1700 prefetcht0 0x1c0(%edx) 1701 movaps 3(%eax), %xmm2 1702 movaps 19(%eax), %xmm3 1703 movaps 35(%eax), %xmm4 1704 movaps 51(%eax), %xmm5 1705 movaps %xmm5, %xmm7 1706 palignr $13, %xmm4, %xmm5 1707 palignr $13, %xmm3, %xmm4 1708 movaps %xmm5, 48(%edx) 1709 palignr $13, %xmm2, %xmm3 1710 lea 64(%eax), %eax 1711 palignr $13, %xmm1, %xmm2 1712 movaps %xmm4, 32(%edx) 1713 movaps %xmm3, 16(%edx) 1714 movaps %xmm7, %xmm1 1715 movaps %xmm2, (%edx) 1716 lea 64(%edx), %edx 1717 sub $64, %ecx 1718 ja L(Shl13LoopStart) 1719 1720L(Shl13LoopLeave): 1721 add $32, %ecx 1722 jle L(shl_end_0) 1723 1724 movaps 3(%eax), %xmm2 1725 movaps 19(%eax), %xmm3 1726 palignr $13, %xmm2, %xmm3 1727 palignr $13, %xmm1, %xmm2 1728 1729 movaps %xmm2, (%edx) 1730 movaps %xmm3, 16(%edx) 1731 lea 32(%edx, %ecx), %edx 1732 lea 32(%eax, %ecx), %eax 1733 POP (%edi) 1734 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1735 1736 CFI_PUSH (%edi) 1737 1738 .p2align 4 1739L(sh_13_no_prefetch): 1740 lea -32(%ecx), %ecx 1741 lea -13(%eax), %eax 1742 xor %edi, %edi 1743 1744 .p2align 4 1745L(sh_13_no_prefetch_loop): 1746 movdqa 16(%eax, %edi), %xmm2 1747 sub $32, %ecx 1748 movdqa 
32(%eax, %edi), %xmm3 1749 movdqa %xmm3, %xmm4 1750 palignr $13, %xmm2, %xmm3 1751 palignr $13, %xmm1, %xmm2 1752 lea 32(%edi), %edi 1753 movdqa %xmm2, -32(%edx, %edi) 1754 movdqa %xmm3, -16(%edx, %edi) 1755 jb L(sh_13_end_no_prefetch_loop) 1756 1757 movdqa 16(%eax, %edi), %xmm2 1758 sub $32, %ecx 1759 movdqa 32(%eax, %edi), %xmm3 1760 movdqa %xmm3, %xmm1 1761 palignr $13, %xmm2, %xmm3 1762 palignr $13, %xmm4, %xmm2 1763 lea 32(%edi), %edi 1764 movdqa %xmm2, -32(%edx, %edi) 1765 movdqa %xmm3, -16(%edx, %edi) 1766 jae L(sh_13_no_prefetch_loop) 1767 1768L(sh_13_end_no_prefetch_loop): 1769 lea 32(%ecx), %ecx 1770 add %ecx, %edi 1771 add %edi, %edx 1772 lea 13(%edi, %eax), %eax 1773 POP (%edi) 1774 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1775 1776 CFI_PUSH (%edi) 1777 1778 .p2align 4 1779L(shl_14): 1780#ifndef USE_AS_MEMMOVE 1781 movaps -14(%eax), %xmm1 1782#else 1783 movl DEST+4(%esp), %edi 1784 movaps -14(%eax), %xmm1 1785 movdqu %xmm0, (%edi) 1786#endif 1787#ifdef DATA_CACHE_SIZE_HALF 1788 cmp $DATA_CACHE_SIZE_HALF, %ecx 1789#else 1790# if (defined SHARED || defined __PIC__) 1791 SETUP_PIC_REG(bx) 1792 add $_GLOBAL_OFFSET_TABLE_, %ebx 1793 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1794# else 1795 cmp __x86_data_cache_size_half, %ecx 1796# endif 1797#endif 1798 jb L(sh_14_no_prefetch) 1799 1800 lea -64(%ecx), %ecx 1801 1802 .p2align 4 1803L(Shl14LoopStart): 1804 prefetcht0 0x1c0(%eax) 1805 prefetcht0 0x1c0(%edx) 1806 movaps 2(%eax), %xmm2 1807 movaps 18(%eax), %xmm3 1808 movaps 34(%eax), %xmm4 1809 movaps 50(%eax), %xmm5 1810 movaps %xmm5, %xmm7 1811 palignr $14, %xmm4, %xmm5 1812 palignr $14, %xmm3, %xmm4 1813 movaps %xmm5, 48(%edx) 1814 palignr $14, %xmm2, %xmm3 1815 lea 64(%eax), %eax 1816 palignr $14, %xmm1, %xmm2 1817 movaps %xmm4, 32(%edx) 1818 movaps %xmm3, 16(%edx) 1819 movaps %xmm7, %xmm1 1820 movaps %xmm2, (%edx) 1821 lea 64(%edx), %edx 1822 sub $64, %ecx 1823 ja L(Shl14LoopStart) 1824 1825L(Shl14LoopLeave): 1826 add $32, %ecx 1827 jle L(shl_end_0) 1828 1829 movaps 2(%eax), %xmm2 1830 movaps 18(%eax), %xmm3 1831 palignr $14, %xmm2, %xmm3 1832 palignr $14, %xmm1, %xmm2 1833 1834 movaps %xmm2, (%edx) 1835 movaps %xmm3, 16(%edx) 1836 lea 32(%edx, %ecx), %edx 1837 lea 32(%eax, %ecx), %eax 1838 POP (%edi) 1839 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1840 1841 CFI_PUSH (%edi) 1842 1843 .p2align 4 1844L(sh_14_no_prefetch): 1845 lea -32(%ecx), %ecx 1846 lea -14(%eax), %eax 1847 xor %edi, %edi 1848 1849 .p2align 4 1850L(sh_14_no_prefetch_loop): 1851 movdqa 16(%eax, %edi), %xmm2 1852 sub $32, %ecx 1853 movdqa 32(%eax, %edi), %xmm3 1854 movdqa %xmm3, %xmm4 1855 palignr $14, %xmm2, %xmm3 1856 palignr $14, %xmm1, %xmm2 1857 lea 32(%edi), %edi 1858 movdqa %xmm2, -32(%edx, %edi) 1859 movdqa %xmm3, -16(%edx, %edi) 1860 jb L(sh_14_end_no_prefetch_loop) 1861 1862 movdqa 16(%eax, %edi), %xmm2 1863 sub $32, %ecx 1864 movdqa 32(%eax, %edi), %xmm3 1865 movdqa %xmm3, %xmm1 1866 palignr $14, %xmm2, %xmm3 1867 palignr $14, %xmm4, %xmm2 1868 lea 32(%edi), %edi 1869 movdqa %xmm2, -32(%edx, %edi) 1870 movdqa %xmm3, -16(%edx, %edi) 1871 jae L(sh_14_no_prefetch_loop) 1872 1873L(sh_14_end_no_prefetch_loop): 1874 lea 32(%ecx), %ecx 1875 add %ecx, %edi 1876 add %edi, %edx 1877 lea 14(%edi, %eax), %eax 1878 POP (%edi) 1879 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1880 1881 CFI_PUSH (%edi) 1882 1883 .p2align 4 1884L(shl_15): 1885#ifndef USE_AS_MEMMOVE 1886 movaps -15(%eax), %xmm1 1887#else 1888 movl DEST+4(%esp), %edi 1889 movaps -15(%eax), %xmm1 1890 movdqu %xmm0, 
(%edi) 1891#endif 1892#ifdef DATA_CACHE_SIZE_HALF 1893 cmp $DATA_CACHE_SIZE_HALF, %ecx 1894#else 1895# if (defined SHARED || defined __PIC__) 1896 SETUP_PIC_REG(bx) 1897 add $_GLOBAL_OFFSET_TABLE_, %ebx 1898 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 1899# else 1900 cmp __x86_data_cache_size_half, %ecx 1901# endif 1902#endif 1903 jb L(sh_15_no_prefetch) 1904 1905 lea -64(%ecx), %ecx 1906 1907 .p2align 4 1908L(Shl15LoopStart): 1909 prefetcht0 0x1c0(%eax) 1910 prefetcht0 0x1c0(%edx) 1911 movaps 1(%eax), %xmm2 1912 movaps 17(%eax), %xmm3 1913 movaps 33(%eax), %xmm4 1914 movaps 49(%eax), %xmm5 1915 movaps %xmm5, %xmm7 1916 palignr $15, %xmm4, %xmm5 1917 palignr $15, %xmm3, %xmm4 1918 movaps %xmm5, 48(%edx) 1919 palignr $15, %xmm2, %xmm3 1920 lea 64(%eax), %eax 1921 palignr $15, %xmm1, %xmm2 1922 movaps %xmm4, 32(%edx) 1923 movaps %xmm3, 16(%edx) 1924 movaps %xmm7, %xmm1 1925 movaps %xmm2, (%edx) 1926 lea 64(%edx), %edx 1927 sub $64, %ecx 1928 ja L(Shl15LoopStart) 1929 1930L(Shl15LoopLeave): 1931 add $32, %ecx 1932 jle L(shl_end_0) 1933 1934 movaps 1(%eax), %xmm2 1935 movaps 17(%eax), %xmm3 1936 palignr $15, %xmm2, %xmm3 1937 palignr $15, %xmm1, %xmm2 1938 1939 movaps %xmm2, (%edx) 1940 movaps %xmm3, 16(%edx) 1941 lea 32(%edx, %ecx), %edx 1942 lea 32(%eax, %ecx), %eax 1943 POP (%edi) 1944 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1945 1946 CFI_PUSH (%edi) 1947 1948 .p2align 4 1949L(sh_15_no_prefetch): 1950 lea -32(%ecx), %ecx 1951 lea -15(%eax), %eax 1952 xor %edi, %edi 1953 1954 .p2align 4 1955L(sh_15_no_prefetch_loop): 1956 movdqa 16(%eax, %edi), %xmm2 1957 sub $32, %ecx 1958 movdqa 32(%eax, %edi), %xmm3 1959 movdqa %xmm3, %xmm4 1960 palignr $15, %xmm2, %xmm3 1961 palignr $15, %xmm1, %xmm2 1962 lea 32(%edi), %edi 1963 movdqa %xmm2, -32(%edx, %edi) 1964 movdqa %xmm3, -16(%edx, %edi) 1965 jb L(sh_15_end_no_prefetch_loop) 1966 1967 movdqa 16(%eax, %edi), %xmm2 1968 sub $32, %ecx 1969 movdqa 32(%eax, %edi), %xmm3 1970 movdqa %xmm3, %xmm1 1971 palignr $15, %xmm2, %xmm3 1972 palignr $15, %xmm4, %xmm2 1973 lea 32(%edi), %edi 1974 movdqa %xmm2, -32(%edx, %edi) 1975 movdqa %xmm3, -16(%edx, %edi) 1976 jae L(sh_15_no_prefetch_loop) 1977 1978L(sh_15_end_no_prefetch_loop): 1979 lea 32(%ecx), %ecx 1980 add %ecx, %edi 1981 add %edi, %edx 1982 lea 15(%edi, %eax), %eax 1983 POP (%edi) 1984 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1985 1986 CFI_PUSH (%edi) 1987 1988 .p2align 4 1989L(shl_end_0): 1990 lea 32(%ecx), %ecx 1991 lea (%edx, %ecx), %edx 1992 lea (%eax, %ecx), %eax 1993 POP (%edi) 1994 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) 1995 1996 .p2align 4 1997L(fwd_write_44bytes): 1998 movq -44(%eax), %xmm0 1999 movq %xmm0, -44(%edx) 2000L(fwd_write_36bytes): 2001 movq -36(%eax), %xmm0 2002 movq %xmm0, -36(%edx) 2003L(fwd_write_28bytes): 2004 movq -28(%eax), %xmm0 2005 movq %xmm0, -28(%edx) 2006L(fwd_write_20bytes): 2007 movq -20(%eax), %xmm0 2008 movq %xmm0, -20(%edx) 2009L(fwd_write_12bytes): 2010 movq -12(%eax), %xmm0 2011 movq %xmm0, -12(%edx) 2012L(fwd_write_4bytes): 2013 movl -4(%eax), %ecx 2014 movl %ecx, -4(%edx) 2015#ifdef USE_AS_MEMPCPY 2016 movl %edx, %eax 2017#else 2018 movl DEST(%esp), %eax 2019#endif 2020 RETURN 2021 2022 .p2align 4 2023L(fwd_write_40bytes): 2024 movq -40(%eax), %xmm0 2025 movq %xmm0, -40(%edx) 2026L(fwd_write_32bytes): 2027 movq -32(%eax), %xmm0 2028 movq %xmm0, -32(%edx) 2029L(fwd_write_24bytes): 2030 movq -24(%eax), %xmm0 2031 movq %xmm0, -24(%edx) 2032L(fwd_write_16bytes): 2033 movq -16(%eax), %xmm0 2034 movq %xmm0, -16(%edx) 
2035L(fwd_write_8bytes): 2036 movq -8(%eax), %xmm0 2037 movq %xmm0, -8(%edx) 2038L(fwd_write_0bytes): 2039#ifdef USE_AS_MEMPCPY 2040 movl %edx, %eax 2041#else 2042 movl DEST(%esp), %eax 2043#endif 2044 RETURN 2045 2046 .p2align 4 2047L(fwd_write_5bytes): 2048 movl -5(%eax), %ecx 2049 movl -4(%eax), %eax 2050 movl %ecx, -5(%edx) 2051 movl %eax, -4(%edx) 2052#ifdef USE_AS_MEMPCPY 2053 movl %edx, %eax 2054#else 2055 movl DEST(%esp), %eax 2056#endif 2057 RETURN 2058 2059 .p2align 4 2060L(fwd_write_45bytes): 2061 movq -45(%eax), %xmm0 2062 movq %xmm0, -45(%edx) 2063L(fwd_write_37bytes): 2064 movq -37(%eax), %xmm0 2065 movq %xmm0, -37(%edx) 2066L(fwd_write_29bytes): 2067 movq -29(%eax), %xmm0 2068 movq %xmm0, -29(%edx) 2069L(fwd_write_21bytes): 2070 movq -21(%eax), %xmm0 2071 movq %xmm0, -21(%edx) 2072L(fwd_write_13bytes): 2073 movq -13(%eax), %xmm0 2074 movq %xmm0, -13(%edx) 2075 movl -5(%eax), %ecx 2076 movl %ecx, -5(%edx) 2077 movzbl -1(%eax), %ecx 2078 movb %cl, -1(%edx) 2079#ifdef USE_AS_MEMPCPY 2080 movl %edx, %eax 2081#else 2082 movl DEST(%esp), %eax 2083#endif 2084 RETURN 2085 2086 .p2align 4 2087L(fwd_write_41bytes): 2088 movq -41(%eax), %xmm0 2089 movq %xmm0, -41(%edx) 2090L(fwd_write_33bytes): 2091 movq -33(%eax), %xmm0 2092 movq %xmm0, -33(%edx) 2093L(fwd_write_25bytes): 2094 movq -25(%eax), %xmm0 2095 movq %xmm0, -25(%edx) 2096L(fwd_write_17bytes): 2097 movq -17(%eax), %xmm0 2098 movq %xmm0, -17(%edx) 2099L(fwd_write_9bytes): 2100 movq -9(%eax), %xmm0 2101 movq %xmm0, -9(%edx) 2102L(fwd_write_1bytes): 2103 movzbl -1(%eax), %ecx 2104 movb %cl, -1(%edx) 2105#ifdef USE_AS_MEMPCPY 2106 movl %edx, %eax 2107#else 2108 movl DEST(%esp), %eax 2109#endif 2110 RETURN 2111 2112 .p2align 4 2113L(fwd_write_46bytes): 2114 movq -46(%eax), %xmm0 2115 movq %xmm0, -46(%edx) 2116L(fwd_write_38bytes): 2117 movq -38(%eax), %xmm0 2118 movq %xmm0, -38(%edx) 2119L(fwd_write_30bytes): 2120 movq -30(%eax), %xmm0 2121 movq %xmm0, -30(%edx) 2122L(fwd_write_22bytes): 2123 movq -22(%eax), %xmm0 2124 movq %xmm0, -22(%edx) 2125L(fwd_write_14bytes): 2126 movq -14(%eax), %xmm0 2127 movq %xmm0, -14(%edx) 2128L(fwd_write_6bytes): 2129 movl -6(%eax), %ecx 2130 movl %ecx, -6(%edx) 2131 movzwl -2(%eax), %ecx 2132 movw %cx, -2(%edx) 2133#ifdef USE_AS_MEMPCPY 2134 movl %edx, %eax 2135#else 2136 movl DEST(%esp), %eax 2137#endif 2138 RETURN 2139 2140 .p2align 4 2141L(fwd_write_42bytes): 2142 movq -42(%eax), %xmm0 2143 movq %xmm0, -42(%edx) 2144L(fwd_write_34bytes): 2145 movq -34(%eax), %xmm0 2146 movq %xmm0, -34(%edx) 2147L(fwd_write_26bytes): 2148 movq -26(%eax), %xmm0 2149 movq %xmm0, -26(%edx) 2150L(fwd_write_18bytes): 2151 movq -18(%eax), %xmm0 2152 movq %xmm0, -18(%edx) 2153L(fwd_write_10bytes): 2154 movq -10(%eax), %xmm0 2155 movq %xmm0, -10(%edx) 2156L(fwd_write_2bytes): 2157 movzwl -2(%eax), %ecx 2158 movw %cx, -2(%edx) 2159#ifdef USE_AS_MEMPCPY 2160 movl %edx, %eax 2161#else 2162 movl DEST(%esp), %eax 2163#endif 2164 RETURN 2165 2166 .p2align 4 2167L(fwd_write_47bytes): 2168 movq -47(%eax), %xmm0 2169 movq %xmm0, -47(%edx) 2170L(fwd_write_39bytes): 2171 movq -39(%eax), %xmm0 2172 movq %xmm0, -39(%edx) 2173L(fwd_write_31bytes): 2174 movq -31(%eax), %xmm0 2175 movq %xmm0, -31(%edx) 2176L(fwd_write_23bytes): 2177 movq -23(%eax), %xmm0 2178 movq %xmm0, -23(%edx) 2179L(fwd_write_15bytes): 2180 movq -15(%eax), %xmm0 2181 movq %xmm0, -15(%edx) 2182L(fwd_write_7bytes): 2183 movl -7(%eax), %ecx 2184 movl %ecx, -7(%edx) 2185 movzwl -3(%eax), %ecx 2186 movzbl -1(%eax), %eax 2187 movw %cx, -3(%edx) 2188 movb %al, -1(%edx) 
2189#ifdef USE_AS_MEMPCPY 2190 movl %edx, %eax 2191#else 2192 movl DEST(%esp), %eax 2193#endif 2194 RETURN 2195 2196 .p2align 4 2197L(fwd_write_43bytes): 2198 movq -43(%eax), %xmm0 2199 movq %xmm0, -43(%edx) 2200L(fwd_write_35bytes): 2201 movq -35(%eax), %xmm0 2202 movq %xmm0, -35(%edx) 2203L(fwd_write_27bytes): 2204 movq -27(%eax), %xmm0 2205 movq %xmm0, -27(%edx) 2206L(fwd_write_19bytes): 2207 movq -19(%eax), %xmm0 2208 movq %xmm0, -19(%edx) 2209L(fwd_write_11bytes): 2210 movq -11(%eax), %xmm0 2211 movq %xmm0, -11(%edx) 2212L(fwd_write_3bytes): 2213 movzwl -3(%eax), %ecx 2214 movzbl -1(%eax), %eax 2215 movw %cx, -3(%edx) 2216 movb %al, -1(%edx) 2217#ifdef USE_AS_MEMPCPY 2218 movl %edx, %eax 2219#else 2220 movl DEST(%esp), %eax 2221#endif 2222 RETURN 2223 2224 .p2align 4 2225L(fwd_write_40bytes_align): 2226 movdqa -40(%eax), %xmm0 2227 movdqa %xmm0, -40(%edx) 2228L(fwd_write_24bytes_align): 2229 movdqa -24(%eax), %xmm0 2230 movdqa %xmm0, -24(%edx) 2231L(fwd_write_8bytes_align): 2232 movq -8(%eax), %xmm0 2233 movq %xmm0, -8(%edx) 2234L(fwd_write_0bytes_align): 2235#ifdef USE_AS_MEMPCPY 2236 movl %edx, %eax 2237#else 2238 movl DEST(%esp), %eax 2239#endif 2240 RETURN 2241 2242 .p2align 4 2243L(fwd_write_32bytes_align): 2244 movdqa -32(%eax), %xmm0 2245 movdqa %xmm0, -32(%edx) 2246L(fwd_write_16bytes_align): 2247 movdqa -16(%eax), %xmm0 2248 movdqa %xmm0, -16(%edx) 2249#ifdef USE_AS_MEMPCPY 2250 movl %edx, %eax 2251#else 2252 movl DEST(%esp), %eax 2253#endif 2254 RETURN 2255 2256 .p2align 4 2257L(fwd_write_5bytes_align): 2258 movl -5(%eax), %ecx 2259 movl -4(%eax), %eax 2260 movl %ecx, -5(%edx) 2261 movl %eax, -4(%edx) 2262#ifdef USE_AS_MEMPCPY 2263 movl %edx, %eax 2264#else 2265 movl DEST(%esp), %eax 2266#endif 2267 RETURN 2268 2269 .p2align 4 2270L(fwd_write_45bytes_align): 2271 movdqa -45(%eax), %xmm0 2272 movdqa %xmm0, -45(%edx) 2273L(fwd_write_29bytes_align): 2274 movdqa -29(%eax), %xmm0 2275 movdqa %xmm0, -29(%edx) 2276L(fwd_write_13bytes_align): 2277 movq -13(%eax), %xmm0 2278 movq %xmm0, -13(%edx) 2279 movl -5(%eax), %ecx 2280 movl %ecx, -5(%edx) 2281 movzbl -1(%eax), %ecx 2282 movb %cl, -1(%edx) 2283#ifdef USE_AS_MEMPCPY 2284 movl %edx, %eax 2285#else 2286 movl DEST(%esp), %eax 2287#endif 2288 RETURN 2289 2290 .p2align 4 2291L(fwd_write_37bytes_align): 2292 movdqa -37(%eax), %xmm0 2293 movdqa %xmm0, -37(%edx) 2294L(fwd_write_21bytes_align): 2295 movdqa -21(%eax), %xmm0 2296 movdqa %xmm0, -21(%edx) 2297 movl -5(%eax), %ecx 2298 movl %ecx, -5(%edx) 2299 movzbl -1(%eax), %ecx 2300 movb %cl, -1(%edx) 2301#ifdef USE_AS_MEMPCPY 2302 movl %edx, %eax 2303#else 2304 movl DEST(%esp), %eax 2305#endif 2306 RETURN 2307 2308 .p2align 4 2309L(fwd_write_41bytes_align): 2310 movdqa -41(%eax), %xmm0 2311 movdqa %xmm0, -41(%edx) 2312L(fwd_write_25bytes_align): 2313 movdqa -25(%eax), %xmm0 2314 movdqa %xmm0, -25(%edx) 2315L(fwd_write_9bytes_align): 2316 movq -9(%eax), %xmm0 2317 movq %xmm0, -9(%edx) 2318L(fwd_write_1bytes_align): 2319 movzbl -1(%eax), %ecx 2320 movb %cl, -1(%edx) 2321#ifdef USE_AS_MEMPCPY 2322 movl %edx, %eax 2323#else 2324 movl DEST(%esp), %eax 2325#endif 2326 RETURN 2327 2328 .p2align 4 2329L(fwd_write_33bytes_align): 2330 movdqa -33(%eax), %xmm0 2331 movdqa %xmm0, -33(%edx) 2332L(fwd_write_17bytes_align): 2333 movdqa -17(%eax), %xmm0 2334 movdqa %xmm0, -17(%edx) 2335 movzbl -1(%eax), %ecx 2336 movb %cl, -1(%edx) 2337#ifdef USE_AS_MEMPCPY 2338 movl %edx, %eax 2339#else 2340 movl DEST(%esp), %eax 2341#endif 2342 RETURN 2343 2344 .p2align 4 2345L(fwd_write_46bytes_align): 2346 movdqa 
-46(%eax), %xmm0 2347 movdqa %xmm0, -46(%edx) 2348L(fwd_write_30bytes_align): 2349 movdqa -30(%eax), %xmm0 2350 movdqa %xmm0, -30(%edx) 2351L(fwd_write_14bytes_align): 2352 movq -14(%eax), %xmm0 2353 movq %xmm0, -14(%edx) 2354L(fwd_write_6bytes_align): 2355 movl -6(%eax), %ecx 2356 movl %ecx, -6(%edx) 2357 movzwl -2(%eax), %ecx 2358 movw %cx, -2(%edx) 2359#ifdef USE_AS_MEMPCPY 2360 movl %edx, %eax 2361#else 2362 movl DEST(%esp), %eax 2363#endif 2364 RETURN 2365 2366 .p2align 4 2367L(fwd_write_38bytes_align): 2368 movdqa -38(%eax), %xmm0 2369 movdqa %xmm0, -38(%edx) 2370L(fwd_write_22bytes_align): 2371 movdqa -22(%eax), %xmm0 2372 movdqa %xmm0, -22(%edx) 2373 movl -6(%eax), %ecx 2374 movl %ecx, -6(%edx) 2375 movzwl -2(%eax), %ecx 2376 movw %cx, -2(%edx) 2377#ifdef USE_AS_MEMPCPY 2378 movl %edx, %eax 2379#else 2380 movl DEST(%esp), %eax 2381#endif 2382 RETURN 2383 2384 .p2align 4 2385L(fwd_write_42bytes_align): 2386 movdqa -42(%eax), %xmm0 2387 movdqa %xmm0, -42(%edx) 2388L(fwd_write_26bytes_align): 2389 movdqa -26(%eax), %xmm0 2390 movdqa %xmm0, -26(%edx) 2391L(fwd_write_10bytes_align): 2392 movq -10(%eax), %xmm0 2393 movq %xmm0, -10(%edx) 2394L(fwd_write_2bytes_align): 2395 movzwl -2(%eax), %ecx 2396 movw %cx, -2(%edx) 2397#ifdef USE_AS_MEMPCPY 2398 movl %edx, %eax 2399#else 2400 movl DEST(%esp), %eax 2401#endif 2402 RETURN 2403 2404 .p2align 4 2405L(fwd_write_34bytes_align): 2406 movdqa -34(%eax), %xmm0 2407 movdqa %xmm0, -34(%edx) 2408L(fwd_write_18bytes_align): 2409 movdqa -18(%eax), %xmm0 2410 movdqa %xmm0, -18(%edx) 2411 movzwl -2(%eax), %ecx 2412 movw %cx, -2(%edx) 2413#ifdef USE_AS_MEMPCPY 2414 movl %edx, %eax 2415#else 2416 movl DEST(%esp), %eax 2417#endif 2418 RETURN 2419 2420 .p2align 4 2421L(fwd_write_47bytes_align): 2422 movdqa -47(%eax), %xmm0 2423 movdqa %xmm0, -47(%edx) 2424L(fwd_write_31bytes_align): 2425 movdqa -31(%eax), %xmm0 2426 movdqa %xmm0, -31(%edx) 2427L(fwd_write_15bytes_align): 2428 movq -15(%eax), %xmm0 2429 movq %xmm0, -15(%edx) 2430L(fwd_write_7bytes_align): 2431 movl -7(%eax), %ecx 2432 movl %ecx, -7(%edx) 2433 movzwl -3(%eax), %ecx 2434 movzbl -1(%eax), %eax 2435 movw %cx, -3(%edx) 2436 movb %al, -1(%edx) 2437#ifdef USE_AS_MEMPCPY 2438 movl %edx, %eax 2439#else 2440 movl DEST(%esp), %eax 2441#endif 2442 RETURN 2443 2444 .p2align 4 2445L(fwd_write_39bytes_align): 2446 movdqa -39(%eax), %xmm0 2447 movdqa %xmm0, -39(%edx) 2448L(fwd_write_23bytes_align): 2449 movdqa -23(%eax), %xmm0 2450 movdqa %xmm0, -23(%edx) 2451 movl -7(%eax), %ecx 2452 movl %ecx, -7(%edx) 2453 movzwl -3(%eax), %ecx 2454 movzbl -1(%eax), %eax 2455 movw %cx, -3(%edx) 2456 movb %al, -1(%edx) 2457#ifdef USE_AS_MEMPCPY 2458 movl %edx, %eax 2459#else 2460 movl DEST(%esp), %eax 2461#endif 2462 RETURN 2463 2464 .p2align 4 2465L(fwd_write_43bytes_align): 2466 movdqa -43(%eax), %xmm0 2467 movdqa %xmm0, -43(%edx) 2468L(fwd_write_27bytes_align): 2469 movdqa -27(%eax), %xmm0 2470 movdqa %xmm0, -27(%edx) 2471L(fwd_write_11bytes_align): 2472 movq -11(%eax), %xmm0 2473 movq %xmm0, -11(%edx) 2474L(fwd_write_3bytes_align): 2475 movzwl -3(%eax), %ecx 2476 movzbl -1(%eax), %eax 2477 movw %cx, -3(%edx) 2478 movb %al, -1(%edx) 2479#ifdef USE_AS_MEMPCPY 2480 movl %edx, %eax 2481#else 2482 movl DEST(%esp), %eax 2483#endif 2484 RETURN 2485 2486 .p2align 4 2487L(fwd_write_35bytes_align): 2488 movdqa -35(%eax), %xmm0 2489 movdqa %xmm0, -35(%edx) 2490L(fwd_write_19bytes_align): 2491 movdqa -19(%eax), %xmm0 2492 movdqa %xmm0, -19(%edx) 2493 movzwl -3(%eax), %ecx 2494 movzbl -1(%eax), %eax 2495 movw %cx, -3(%edx) 2496 
	CFI_PUSH (%edi)

	.p2align 4
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	lea	-0x90(%ecx), %ecx
	POP (%edi)

	.p2align 4
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	add	%ecx, %edx
	add	%ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

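/* Tail cases for the backward (high-to-low) copy, reached through
   L(table_48_bytes_bwd).  Here %eax and %edx point at the base of the
   remaining 0..47 bytes, so the offsets are positive and each entry falls
   through to the next smaller case, 8 bytes at a time.  For mempcpy the
   return value is rebuilt as DEST + LEN from the saved arguments.  */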
	.p2align 4
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN

	.p2align 4
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN_END

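/* Branch tables for the 0..47-byte tail copies, kept in .rodata.ssse3.
   Each table_48* table below has 48 entries, one per possible remaining
   byte count, pointing at the matching fwd_write_*, fwd_write_*_align or
   bk_write_* block above; L(shl_table) has 16 entries for the shifted
   copy paths.  */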
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

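/* L(shl_table) dispatches to one of the sixteen copy paths L(shl_0) ..
   L(shl_15) defined earlier in this file; the index is presumably the
   16-byte misalignment of the source relative to the aligned destination,
   which those paths resolve with the SSSE3 palignr instruction.  */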
	.p2align 2
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	.p2align 2
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
/* Backward (high-to-low) copy used by memmove for overlapping buffers:
   point %edi (source) and %edx (destination) one past the end of each
   buffer and copy downward.  */
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Check 16-byte alignment of the end of the destination.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is 4-byte aligned, but not 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	/* Main backward loop: copy 64 bytes per iteration, moving down.
	   %edx is 16-byte aligned here, so the stores can use movdqa.  */
	.p2align 4
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)