/*
 * Montgomery multiplication with scatter/gather, x86-64 (AT&T/GAS syntax,
 * SysV AMD64 ABI).  Table lookups of the multiplicand power are performed
 * with SSE2 masked loads (pand/por against .Lmagic_masks) so that the
 * secret table index never influences the memory-access pattern.
 *
 * NOTE(review): register/argument layout below is inferred from the code —
 *   %rdi = rp (result), %rsi = ap, %rdx = bp table base, %rcx = np (modulus),
 *   %r8  = &n0 (Montgomery constant), %r9d = num (word count),
 *   8(%rsp) = power (table index to gather)
 * — this matches OpenSSL's bn_mul_mont_gather5 contract; confirm against the
 * generating perlasm (bn/asm/x86_64-mont5.pl) before relying on it.
 */
#if defined(__x86_64__)
.text

.extern	OPENSSL_ia32cap_P
.hidden	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.hidden	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
	testl	$7,%r9d			# num not a multiple of 8 -> scalar path
	jnz	.Lmul_enter
	jmp	.Lmul4x_enter		# else use the 4x-unrolled path

.align	16
.Lmul_enter:
	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax		# keep original %rsp for the epilogue
	movl	8(%rsp),%r10d		# r10d = power (7th argument, on stack)
	pushq	%rbx			# save all callee-saved GPRs
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	leaq	2(%r9),%r11		# allocate num+2 qwords of scratch
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp
	andq	$-1024,%rsp		# 1KB-align the scratch area

	movq	%rax,8(%rsp,%r9,8)	# stash saved %rsp above the tp[] vector
.Lmul_body:
	movq	%rdx,%r12		# r12 = bp table cursor
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10			# r10 selects which mask row is all-ones
	leaq	96(%r12,%r11,8),%r12	# point into the interleaved table
	movq	0(%rax,%r10,8),%xmm4	# load the four 64-bit select masks;
	movq	8(%rax,%r10,8),%xmm5	# exactly one of xmm4..xmm7 is ~0
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	/* Constant-time gather of bp[power] word 0: read all candidates,
	 * mask, and OR together. */
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12		# advance to the next table "row"
	por	%xmm3,%xmm0

.byte	102,72,15,126,195		# movq %xmm0,%rbx  (rbx = gathered b[0])

	movq	(%r8),%r8		# r8 = n0 value
	movq	(%rsi),%rax		# rax = ap[0]

	xorq	%r14,%r14		# i = 0 (outer index)
	xorq	%r15,%r15		# j = 0 (inner index)

	/* Start gathering b[1] while the first multiply runs. */
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*b[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# rax = np[0]

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# m = lo(ap[0]*b[0]) * n0
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10		# discards lo word (becomes zero)
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

/* First pass: tp[] = ap[]*b[0] + np[]*m, one word per iteration. */
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j]*b[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp			# np[j]*m
	cmpq	%r9,%r15
	jne	.L1st

.byte	102,72,15,126,195		# movq %xmm0,%rbx  (rbx = gathered b[1])

	addq	%rax,%r13		# fold in the final np[num-1]*m limb
	movq	(%rsi),%rax		# rax = ap[0] for the next outer round
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx		# propagate the top carry into tp[num]
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14		# i = 1
	jmp	.Louter
/* Outer loop: tp[] = (tp[] + ap[]*b[i] + np[]*m)/2^64 for i = 1..num-1. */
.align	16
.Louter:
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		# r10 = tp[0]

	movq	-96(%r12),%xmm0		# begin gathering b[i+1]
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	mulq	%rbx			# ap[0]*b[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# m = tp[0] * n0
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp			# np[0]*m annihilates tp[0]
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# r10 = tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j]*b[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp			# np[j]*m
	cmpq	%r9,%r15
	jne	.Linner

.byte	102,72,15,126,195		# movq %xmm0,%rbx  (rbx = gathered b[i+1])

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# r10 = tp[num] (previous top carry)
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/* Final reduction: rp = tp - np if tp >= np, else rp = tp. */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax	# conditional-subtract chain (CF clear
	movq	%rax,(%rdi,%r14,8)	# on first entry via the cmpq/jb above)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# rax = 0 (tp>=np) or ~0 (borrow)
	xorq	%r14,%r14
	movq	%r9,%r15
.align	16
.Lcopy:
	/* Branch-free select: rp[i] = borrow ? tp[i] : rp[i]; also wipes tp. */
	movq	(%rsp,%r14,8),%rsi
	movq	(%rdi,%r14,8),%rcx
	xorq	%rcx,%rsi
	andq	%rax,%rsi
	xorq	%rcx,%rsi		# rsi = (tp^rp)&mask ^ rp
	movq	%r14,(%rsp,%r14,8)	# clobber scratch (no secrets left)
	movq	%rsi,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# recover original %rsp
	movq	$1,%rax			# return 1 (success)
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5 — stack-frame wrapper around mul4x_internal for the
 * num % 8 == 0 case.  Allocates 2*num qwords of 64-byte-aligned scratch,
 * avoiding a 4KB-aliasing conflict with ap (%rsi).
 */
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
.byte	0x67				# address-size pad (alignment tuning)
	movq	%rsp,%rax		# keep original %rsp
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d			# r9 = num in bytes
	shll	$3+2,%r10d		# r10 = 4*num bytes
	negq	%r9

	leaq	-64(%rsp,%r9,2),%r11	# candidate frame
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt		# would alias ap modulo 4KB? adjust
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lmul4xsp_done:
	andq	$-64,%rsp		# 64-byte align the frame
	negq	%r9

	movq	%rax,40(%rsp)		# saved %rsp for the epilogue
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

.type	mul4x_internal,@function
.align	32
/*
 * mul4x_internal — 4x-unrolled Montgomery multiplication core, shared by
 * bn_mul4x_mont_gather5 and bn_power5.  On entry %rax still holds the
 * caller's original %rsp, so 8(%rax) is the stacked 'power' argument.
 * Uses the same constant-time masked gather as the scalar path, two table
 * rows ahead (%r12/%r14).  Falls through to .Lsqr4x_sub (in
 * bn_sqr8x_internal) for the final conditional subtraction.
 */
mul4x_internal:
	shlq	$5,%r9			# num *= 32 temporarily
	movl	8(%rax),%r10d		# r10d = power
	leaq	256(%rdx,%r9,1),%r13	# r13 = end of bp table
	shrq	$5,%r9			# restore num (bytes*? see callers)
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%rdx,%r11,8),%r12	# current gather row
	movq	0(%rax,%r10,8),%xmm4	# select masks, as in the scalar path
	movq	8(%rax,%r10,8),%xmm5
	addq	$7,%r11
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7
	andq	$7,%r11

	movq	-96(%r12),%xmm0		# gather b[0] ...
	leaq	256(%r12),%r14		# ... and start on b[1] (next row)
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
.byte	0x67
	por	%xmm1,%xmm0
	movq	-96(%r14),%xmm1
.byte	0x67
	pand	%xmm7,%xmm3
.byte	0x67
	por	%xmm2,%xmm0
	movq	-32(%r14),%xmm2
.byte	0x67
	pand	%xmm4,%xmm1
.byte	0x67
	por	%xmm3,%xmm0
	movq	32(%r14),%xmm3

.byte	102,72,15,126,195		# movq %xmm0,%rbx  (rbx = b[0])
	movq	96(%r14),%xmm0
	movq	%r13,16+8(%rsp)		# save table end for the loop bound
	movq	%rdi,56+8(%rsp)		# save rp

	movq	(%r8),%r8		# r8 = n0
	movq	(%rsi),%rax		# rax = ap[0]
	leaq	(%rsi,%r9,1),%rsi	# rsi = &ap[num]; index from -num up
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*b[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	pand	%xmm5,%xmm2		# keep folding the b[1] gather
	pand	%xmm6,%xmm3
	por	%xmm2,%xmm1

	imulq	%r10,%rbp		# m = lo * n0

	leaq	64+8(%rsp,%r11,8),%r14	# r14 = tp cursor
	movq	%rdx,%r11

	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	leaq	512(%r12),%r12		# skip two rows (we fetched two)
	por	%xmm1,%xmm0

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi		# rdi doubles as a carry limb here

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

/* First pass, 4 limbs per iteration: tp[] = ap[]*b[0] + np[]*m. */
.align	32
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

	/* Peeled final 4 limbs of the first pass. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx  (rbx = b[1])
	leaq	(%rcx,%r9,2),%rcx	# rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi			# rdi = top carry
	movq	%r13,-8(%r14)

	jmp	.Louter4x

/* Outer loop: fold in b[i] for i = 1..num-1, gathering b[i+1] as we go. */
.align	32
.Louter4x:
	movq	(%r14,%r9,1),%r10	# r10 = tp[0]
	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*b[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	-96(%r12),%xmm0		# gather b[i+1]
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3

	imulq	%r10,%rbp		# m = tp[0]' * n0
.byte	0x67
	movq	%rdx,%r11
	movq	%rdi,(%r14)		# store previous top carry

	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	(%r14,%r9,1),%r14	# rewind tp cursor
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	/* Peeled final 4 limbs of the inner loop. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax		# swap m into rax ...
	movq	-16(%rcx),%rbp		# ... and np[num-2] into rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# m * np[num-2]
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx  (rbx = b[i+1])
	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,2),%rcx	# rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13		# add previous top carry
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		# consumed the whole bp table?
	jb	.Louter4x
	/* Decide tp >= np without branching on the data, then jump into the
	 * shared subtract/copy tail (.Lsqr4x_sub in bn_sqr8x_internal). */
	subq	%r13,%rbp		# compare top limbs
	adcq	%r15,%r15		# r15 is known 0 here; captures CF
	orq	%r15,%rdi
	xorq	$1,%rdi			# rdi = 0 -> subtract np, 1 -> keep
	leaq	(%r14,%r9,1),%rbx	# rbx = tp
	leaq	(%rcx,%rdi,8),%rbp	# rbp = np biased by the decision
	movq	%r9,%rcx
	sarq	$3+2,%rcx		# rcx = -num/4 (loop counter)
	movq	56+8(%rsp),%rdi		# rdi = rp
	jmp	.Lsqr4x_sub
.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5 — one 5-bit window step of fixed-window modular exponentiation:
 * five consecutive Montgomery squarings (__bn_sqr8x_internal x5) followed by
 * one Montgomery multiplication with a gathered table entry (mul4x_internal).
 * Same argument registers as bn_mul_mont_gather5.
 */
.globl	bn_power5
.hidden	bn_power5
.type	bn_power5,@function
.align	32
bn_power5:
	movq	%rsp,%rax		# keep original %rsp
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movl	%r9d,%r10d
	shll	$3,%r9d			# num in bytes
	shll	$3+2,%r10d
	negq	%r9
	movq	(%r8),%r8		# r8 = n0

	/* Frame allocation with 4KB anti-aliasing, as in bn_mul4x. */
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lpwr_sp_done:
	andq	$-64,%rsp
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		# frame layout: 32 = n0, 40 = saved rsp
	movq	%rax,40(%rsp)
.Lpower5_body:
	/* Stash arguments in xmm regs across the internal calls. */
.byte	102,72,15,110,207		# movq %rdi,%xmm1  (rp)
.byte	102,72,15,110,209		# movq %rcx,%xmm2  (np)
.byte	102,73,15,110,218		# movq %r10,%xmm3  (num bytes)
.byte	102,72,15,110,226		# movq %rdx,%xmm4  (bp table)

	call	__bn_sqr8x_internal	# five squarings
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal

.byte	102,72,15,126,209		# movq %xmm2,%rcx  (np)
.byte	102,72,15,126,226		# movq %xmm4,%rdx  (bp table)
	movq	%rsi,%rdi
	movq	40(%rsp),%rax		# rax = saved rsp (mul4x reads 8(%rax))
	leaq	32(%rsp),%r8		# r8 = &n0

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_power5,.-bn_power5

/*
 * bn_sqr8x_internal — Montgomery squaring core.  Phase 1 computes the
 * off-diagonal products (.Lsqr4x_1st/.Lsqr4x_outer/.Lsqr4x_inner), phase 2
 * doubles them and adds the squared diagonal (.Lsqr4x_shift_n_add), phase 3
 * performs the Montgomery reduction (sqr8x_reduction) and the conditional
 * subtraction (.Lsqr4x_sub).
 * NOTE(review): entered with %rsi = a, %r9 = num (bytes, negated by callers'
 * convention), n0 at 32+8(%rsp) and np in %rbp/%xmm registers as set up by
 * bn_power5/bn_from_mont8x — inferred from the call sites in this file;
 * confirm against the generating perlasm.
 */
.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,@function
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:

	leaq	32(%r10),%rbp		# rbp scans a[] from -num+32 upward
	leaq	(%rsi,%r9,1),%rsi	# rsi = &a[num]

	movq	%r9,%rcx

	movq	-32(%rsi,%rbp,1),%r14	# r14 = a[0] (first multiplicand)
	leaq	48+8(%rsp,%r9,2),%rdi	# rdi = top of the 2*num result area
	movq	-24(%rsi,%rbp,1),%rax	# rax = a[1]
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx	# rbx = a[2]
	movq	%rax,%r15		# r15 = a[1]

	mulq	%r14			# a[1]*a[0]
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14			# a[2]*a[0]
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10

	movq	-8(%rsi,%rbp,1),%rbx	# rbx = a[3]
	mulq	%r15			# a[2]*a[1]
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx
	mulq	%r14			# a[3]*a[0]
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	.Lsqr4x_1st

/* First row: accumulate a[j]*a[0] + a[j]*a[1] carries, 4 limbs/iteration. */
.align	32
.Lsqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	.Lsqr4x_outer

/* Subsequent rows: same as the first but accumulating into stored limbs. */
.align	32
.Lsqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	.Lsqr4x_outer

	/* Last off-diagonal row (3 source limbs left). */
	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx			# a[num-1]*a[num-2]
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

	/* Phase 2: double the off-diagonal sum and add squared diagonal
	 * terms, two result qwords per source limb (shift-and-add). */
	movq	-16(%rsi,%rbp,1),%rax
	leaq	48+8(%rsp),%rdi
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12	# 2*limb, carrying bit 63 via r14
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax			# a[k]^2
	negq	%r15			# restore carry from previous pair
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15		# save carry
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	.Lsqr4x_shift_n_add

	/* Peeled final shift-and-add pair. */
	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213		# movq %xmm2,%rbp  (rbp = np)
/* Phase 3: 8x-batched Montgomery reduction of the 2*num-limb square.
 * Also entered directly via `call sqr8x_reduction` from bn_from_mont8x. */
sqr8x_reduction:
	xorq	%rax,%rax		# rax accumulates the final top carry
	leaq	(%rbp,%r9,2),%rcx	# rcx = &np[num] sentinel
	leaq	48+8(%rsp,%r9,2),%rdx	# rdx = end of the t[] vector
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi	# rewind to the low 8 limbs
.byte	0x66
	movq	0(%rdi),%rbx		# load t[0..7]
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)		# park top carry
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx		# rbx = m = t[0] * n0
	movq	0(%rbp),%rax		# rax = np[0]
	movl	$8,%ecx			# 8 reduction steps per batch
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	%rbx			# np[0]*m cancels the low limb
	movq	16(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	# remember m for the tail pass
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi		# rsi = n0
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	64(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		# next m, computed early
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	80(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	96(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	112(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx		# rotate in the next m
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	128(%rbp),%rbp		# next 8 modulus limbs
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp		# past the end of np?
	jae	.L8x_no_tail

	/* Tail: multiply the 8 saved m values by the remaining np limbs. */
.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		# rsi = -carry

	movq	48+56+8(%rsp),%rbx	# rbx = saved m[0]
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	64(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	80(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	96(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	112(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	# next saved m
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	128(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi			# restore carry from the previous batch
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	addq	(%rdx),%r8		# fold in the parked top carry

	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15

	xorq	%rax,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			# rax = top-most carry
	movq	-16(%rbp),%rcx		# rcx = np[num-1]
	xorq	%rsi,%rsi

.byte	102,72,15,126,213		# movq %xmm2,%rbp  (rbp = np, rewound)

	movq	%r8,0(%rdi)		# store the reduced 8 limbs
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217		# movq %xmm3,%r9   (r9 = num bytes)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop

	/* Branch-free decision whether to subtract np, then shared tail. */
	subq	%r15,%rcx		# compare np top limb vs result top
	leaq	(%rdi,%r9,1),%rbx	# rbx = reduced result
	adcq	%rsi,%rsi
	movq	%r9,%rcx
	orq	%rsi,%rax
.byte	102,72,15,126,207		# movq %xmm1,%rdi  (rdi = rp)
	xorq	$1,%rax
.byte	102,72,15,126,206		# movq %xmm1,%rsi  (rsi = rp)
	leaq	(%rbp,%rax,8),%rbp	# bias np by the keep/subtract decision
	sarq	$3+2,%rcx		# rcx = -num/4
	jmp	.Lsqr4x_sub

/* Shared conditional-subtract/copy-out tail (also mul4x_internal's exit). */
.align	32
.Lsqr4x_sub:
.byte	0x66
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	sbbq	0(%rbp),%r12		# note 16-byte np stride: entries are
	movq	16(%rbx),%r14		# interleaved with padding
	sbbq	16(%rbp),%r13
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	32(%rbp),%r14
	movq	%r12,0(%rdi)
	sbbq	48(%rbp),%r15
	leaq	64(%rbp),%rbp
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub
	movq	%r9,%r10
	negq	%r9			# restore num sign convention
	.byte	0xf3,0xc3		# rep ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * bn_from_montgomery — convert out of Montgomery form.  Only handles
 * num % 8 == 0 via bn_from_mont8x (tail jump); otherwise returns 0 so the
 * caller can fall back to another route.
 */
.globl	bn_from_montgomery
.hidden	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x		# tail-jump, not a call
	xorl	%eax,%eax		# unsupported size: return 0
	.byte	0xf3,0xc3		# rep ret
.size	bn_from_montgomery,.-bn_from_montgomery

/*
 * bn_from_mont8x — rp = ap * R^-1 mod np: widen ap into a 2*num scratch
 * vector (top half zero), run sqr8x_reduction, and zero the scratch.
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax		# keep original %rsp
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d
	shll	$3+2,%r10d
	negq	%r9
	movq	(%r8),%r8		# r8 = n0

	/* Frame allocation with 4KB anti-aliasing, as in bn_mul4x. */
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lfrom_sp_done:
	andq	$-64,%rsp
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		# frame: 32 = n0, 40 = saved rsp
	movq	%rax,40(%rsp)
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

/* Copy ap into the low half of t[] and zero the high half, 64 B/iter. */
.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)	# zero high half
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# leaq 64(%rsi),%rsi
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		# movq %rdi,%xmm1  (rp)
.byte	102,72,15,110,209		# movq %rcx,%xmm2  (np)
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218		# movq %r10,%xmm3  (num bytes)
	call	sqr8x_reduction

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

/* Wipe the scratch vector before unwinding. */
.align	32
.Lfrom_mont_zero:
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_scatter5(const BN_ULONG *inp=%rdi, size_t num=%esi,
 *             BN_ULONG *tbl=%rdx, size_t power=%rcx)
 * Scatter num words into table column 'power' with a 256-byte stride,
 * producing the interleaved layout that the gather code above expects.
 */
.globl	bn_scatter5
.hidden	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	# nothing to do for num == 0
	leaq	(%rdx,%rcx,8),%rdx
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# next word of the same column
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_scatter5,.-bn_scatter5

/*
 * bn_gather5(BN_ULONG *out=%rdi, size_t num=%esi,
 *            const BN_ULONG *tbl=%rdx, size_t power=%ecx)
 * Constant-time gather of column 'power': every candidate word is read and
 * masked, so the access pattern is independent of 'power'.
 */
.globl	bn_gather5
.hidden	bn_gather5
.type	bn_gather5,@function
.align	16
bn_gather5:
	movl	%ecx,%r11d
	shrl	$3,%ecx
	andq	$7,%r11
	notl	%ecx
	leaq	.Lmagic_masks(%rip),%rax
	andl	$3,%ecx
	leaq	128(%rdx,%r11,8),%rdx
	movq	0(%rax,%rcx,8),%xmm4	# exactly one of xmm4..7 is all-ones
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	-128(%rdx),%xmm0
	movq	-64(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	0(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	64(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
.byte	0x67,0x67
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx
	por	%xmm3,%xmm0

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
/* 64-bit selection masks: row i (i = 0..3, indexed *8) yields all-ones in
 * exactly one of the four qwords loaded at offsets 0/8/16/24. */
.align	64
.Lmagic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0, 0,0
/* "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS
 * by <appro@openssl.org>" */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif