#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without       #
#  modification, are permitted provided that the following conditions are   #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright        #
#     notice, this list of conditions and the following disclaimer.         #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright     #
#     notice, this list of conditions and the following disclaimer in the   #
#     documentation and/or other materials provided with the                #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its        #
#     contributors may be used to endorse or promote products derived from  #
#     this software without specific prior written permission.              #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY         #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE        #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR       #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR           #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Architecture Group, Microprocessor and Chipset Development,     #
#     Israel Development Center, Haifa, Israel                              #
# (2) University of Haifa                                                    #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, "Efficient Software Implementations of Modular             #
#     Exponentiation", http://eprint.iacr.org/2011/239                      #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".            #
#     IEEE Proceedings of 9th International Conference on Information       #
#     Technology: New Generations (ITNG 2012), 821-823 (2012).              #
# [3] S. Gueron, "Efficient Software Implementations of Modular             #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis   #
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing  #
#     RSA1024 and RSA2048 on x86_64 platforms",                             #
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################

# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort; the highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# at the time of this writing!] Nor does this module implement a
# "monolithic" complete exponentiation jumbo-subroutine; it adheres
# to a more modular mixture of C and assembly. And it is optimized even
# for processors other than the Intel Core family (see the table below
# for improvement coefficients).
#						<appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
# ----------------+---------------------------
#	Opteron		+13%		|+5%		+20%
#	Bulldozer	-0%		|-1%		+10%
#	P4		+11%		|+7%		+8%
#	Westmere	+5%		|+14%		+17%
#	Sandy Bridge	+2%		|+12%		+29%
#	Ivy Bridge	+1%		|+11%		+35%
#	Haswell(**)	-0%		|+12%		+39%
#	Atom		+13%		|+11%		+4%
#	VIA Nano	+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;
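# Editor's note: the exported entry points, with argument lists as implied by
# the register assignments used below (the C glue is expected to match), are:
#
#	rsaz_512_sqr(out, inp, mod, n0, times)	# out = inp^(2^times), Montgomery form
#	rsaz_512_mul(out, ap, bp, mod, n0)	# out = ap*bp, Montgomery form
#	rsaz_512_mul_gather4(out, ap, tbl, mod, n0, pwr)
#	rsaz_512_mul_scatter4(out, ap, mod, n0, tbl, pwr)
#	rsaz_512_mul_by_one(out, inp, mod, n0)	# Montgomery reduction of inp
#	rsaz_512_scatter4(tbl, inp, pwr) / rsaz_512_gather4(out, tbl, pwr)
#
# All operands are 8 64-bit limbs (512 bits); n0 is assumed to be the usual
# Montgomery constant, i.e. -mod^-1 mod 2^64.
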
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable this after testing. $addx goes up to 1.
$addx = 0;

($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lsqr_body:
	movq	$mod, %rbp		# common argument
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
	movq	%rdx, %rbx
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	addq	%r8, %r8		#shlq	\$1, %r8
	movq	%r9, %rcx
	adcq	%r9, %r9		#shld	\$1, %r8, %r9

	mulq	%rax
	movq	%rax, (%rsp)
	addq	%rdx, %r8
	adcq	\$0, %r9

	movq	%r8, 8(%rsp)
	shrq	\$63, %rcx

#second iteration
	movq	8($inp), %r8
	movq	16($inp), %rax
	mulq	%r8
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r8
	addq	%rax, %r15
	movq	%r8, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %r8
	movq	%r10, %rdx
	adcq	\$0, %r8

	add	%rdx, %rdx
	lea	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
	movq	%r11, %rbx
	adcq	%r11, %r11		#shld	\$1, %r10, %r11

	mulq	%rax
	addq	%rax, %r9
	adcq	%rdx, %r10
	adcq	\$0, %r11

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)
	shrq	\$63, %rbx

#third iteration
	movq	16($inp), %r9
	movq	24($inp), %rax
	mulq	%r9
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	movq	%r12, %r10
	lea	(%rbx,%r12,2), %r12	#shld	\$1, %rbx, %r12
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r9
	shrq	\$63, %r10
	addq	%rax, %r8
	movq	%r9, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	movq	%r13, %rcx
	leaq	(%r10,%r13,2), %r13	#shld	\$1, %r12, %r13

	mulq	%rax
	addq	%rax, %r11
	adcq	%rdx, %r12
	adcq	\$0, %r13

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)
	shrq	\$63, %rcx

#fourth iteration
	movq	24($inp), %r10
	movq	32($inp), %rax
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	movq	%r14, %r12
	leaq	(%rcx,%r14,2), %r14	#shld	\$1, %rcx, %r14
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	shrq	\$63, %r12
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	movq	%r15, %rbx
	leaq	(%r12,%r15,2),%r15	#shld	\$1, %r14, %r15

	mulq	%rax
	addq	%rax, %r13
	adcq	%rdx, %r14
	adcq	\$0, %r15

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)
	shrq	\$63, %rbx

#fifth iteration
	movq	32($inp), %r11
	movq	40($inp), %rax
	mulq	%r11
	addq	%rax, %r8
	movq	48($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	movq	%r8, %r12
	leaq	(%rbx,%r8,2), %r8	#shld	\$1, %rbx, %r8
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	shrq	\$63, %r12
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	movq	%r9, %rcx
	leaq	(%r12,%r9,2), %r9	#shld	\$1, %r8, %r9

	mulq	%rax
	addq	%rax, %r15
	adcq	%rdx, %r8
	adcq	\$0, %r9

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)
	shrq	\$63, %rcx

#sixth iteration
	movq	40($inp), %r12
	movq	48($inp), %rax
	mulq	%r12
	addq	%rax, %r10
	movq	56($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	movq	%r10, %r15
	leaq	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
	adcq	\$0, %rdx
	shrq	\$63, %r15
	addq	%rbx, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	movq	%r11, %rbx
	leaq	(%r15,%r11,2), %r11	#shld	\$1, %r10, %r11

	mulq	%rax
	addq	%rax, %r9
	adcq	%rdx, %r10
	adcq	\$0, %r11

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	movq	48($inp), %r13
	movq	56($inp), %rax
	mulq	%r13
	addq	%rax, %r12
	movq	%r13, %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	xorq	%r14, %r14
	shlq	\$1, %rbx
	adcq	%r12, %r12		#shld	\$1, %rbx, %r12
	adcq	%r13, %r13		#shld	\$1, %r12, %r13
	adcq	%r14, %r14		#shld	\$1, %r13, %r14

	mulq	%rax
	addq	%rax, %r11
	adcq	%rdx, %r12
	adcq	\$0, %r13

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	movq	56($inp), %rax
	mulq	%rax
	addq	%rax, %r13
	adcq	\$0, %rdx

	addq	%rdx, %r14

	movq	%r13, 112(%rsp)
	movq	%r14, 120(%rsp)

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
	movq	%rbp, %xmm1		# off-load
#first iteration
	mulx	%rax, %r8, %r9

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	mulx	32($inp), %rcx, %r12
	adcx	%rax, %r10

	mulx	40($inp), %rax, %r13
	adcx	%rcx, %r11

	.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mov	%r9, %rcx
	shld	\$1, %r8, %r9
	shl	\$1, %r8

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rdx, %r8
	 mov	8($inp), %rdx
	adcx	%rbp, %r9

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	mulx	16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r8
	adox	$out, %r11
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	adox	%rbp, %r8

	mov	%r11, %rbx
	shld	\$1, %r10, %r11
	shld	\$1, %rcx, %r10

	xor	%ebp,%ebp
	mulx	%rdx, %rax, %rcx
	 mov	16($inp), %rdx
	adcx	%rax, %r9
	adcx	%rcx, %r10
	adcx	%rbp, %r11

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)

#third iteration
	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	mulx	40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	adox	%rbp, %r9

	mov	%r13, %rcx
	shld	\$1, %r12, %r13
	shld	\$1, %rbx, %r12

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r11
	adcx	%rdx, %r12
	 mov	24($inp), %rdx
	adcx	%rbp, %r13

	mov	%r11, 32(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		# mov	%r12, 40(%rsp)

#fourth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	adox	%rbp, %r10

	.byte	0x66
	mov	%r15, %rbx
	shld	\$1, %r14, %r15
	shld	\$1, %rcx, %r14

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r13
	adcx	%rdx, %r14
	 mov	32($inp), %rdx
	adcx	%rbp, %r15

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	adox	%rbp, %r11

	mov	%r9, %rcx
	shld	\$1, %r8, %r9
	shld	\$1, %rbx, %r8

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r15
	adcx	%rdx, %r8
	 mov	40($inp), %rdx
	adcx	%rbp, %r9

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	adox	%rbp, %r12

	mov	%r11, %rbx
	shld	\$1, %r10, %r11
	shld	\$1, %rcx, %r10

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r9
	adcx	%rdx, %r10
	 mov	48($inp), %rdx
	adcx	%rbp, %r11

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	xor	%r14, %r14
	shld	\$1, %r13, %r14
	shld	\$1, %r12, %r13
	shld	\$1, %rbx, %r12

	xor	%ebp, %ebp
	mulx	%rdx, %rax, %rdx
	adcx	%rax, %r11
	adcx	%rdx, %r12
	 mov	56($inp), %rdx
	adcx	%rbp, %r13

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	adox	%rax, %r13
	adox	%rbp, %rdx

	.byte	0x66
	add	%rdx, %r14

	movq	%r13, 112(%rsp)
	movq	%r14, 120(%rsp)
	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lsqr_epilogue:
	ret
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_epilogue:
	ret
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.Lmul_gather4_body:
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_gather
___
$code.=<<___;
	movl	64($bp,$pwr,4), %eax
	movq	$out, %xmm0		# off-load arguments
	movl	($bp,$pwr,4), %ebx
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)

	shlq	\$32, %rax
	or	%rax, %rbx
	movq	($ap), %rax
	movq	8($ap), %rcx
	leaq	128($bp,$pwr,4), %rbp
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	movd	(%rbp), %xmm4
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	movd	64(%rbp), %xmm5
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	pslldq	\$4, %xmm5
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	por	%xmm5, %xmm4
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	leaq	128(%rbp), %rbp
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%xmm4, %rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	movd	(%rbp), %xmm4
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	movd	64(%rbp), %xmm5
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	pslldq	\$4, %xmm5
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	por	%xmm5, %xmm4
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%xmm4, %rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	128(%rbp), %rbp
	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	mov	64($bp,$pwr,4), %eax
	movq	$out, %xmm0		# off-load arguments
	lea	128($bp,$pwr,4), %rbp
	mov	($bp,$pwr,4), %edx
	movq	$mod, %xmm1
	mov	$n0, 128(%rsp)

	shl	\$32, %rax
	or	%rax, %rdx
	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9
	movd	(%rbp), %xmm4

	mulx	16($ap), %rbx, %r10
	movd	64(%rbp), %xmm5
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	pslldq	\$4, %xmm5
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	por	%xmm5, %xmm4
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	lea	128(%rbp), %rbp
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	movq	%xmm4, %rdx
	adcx	%rbx, %r13
	adcx	%rax, %r14
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	.byte	0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00		# movd	(%rbp), %xmm4
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	movd	64(%rbp), %xmm5
	lea	128(%rbp), %rbp
	adcx	%rax, %r9
	adox	%r11, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
	pslldq	\$4, %xmm5
	por	%xmm5, %xmm4
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	%xmm4, %rdx
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	mov	128(%rsp), %rdx		# pull $n0
	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_gather4_epilogue:
	ret
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,4), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movl	%r8d, 64*0($inp)	# scatter
	shrq	\$32, %r8
	movl	%r9d, 64*2($inp)
	shrq	\$32, %r9
	movl	%r10d, 64*4($inp)
	shrq	\$32, %r10
	movl	%r11d, 64*6($inp)
	shrq	\$32, %r11
	movl	%r12d, 64*8($inp)
	shrq	\$32, %r12
	movl	%r13d, 64*10($inp)
	shrq	\$32, %r13
	movl	%r14d, 64*12($inp)
	shrq	\$32, %r14
	movl	%r15d, 64*14($inp)
	shrq	\$32, %r15
	movl	%r8d, 64*1($inp)
	movl	%r9d, 64*3($inp)
	movl	%r10d, 64*5($inp)
	movl	%r11d, 64*7($inp)
	movl	%r12d, 64*9($inp)
	movl	%r13d, 64*11($inp)
	movl	%r14d, 64*13($inp)
	movl	%r15d, 64*15($inp)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_scatter4_epilogue:
	ret
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp	# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_by_one_epilogue:
	ret
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
	#movq	128+8(%rsp), %rdx		# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi			# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	 mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
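# Editor's note (illustrative sketch, not used by this script): the two
# reduction subroutines above perform word-by-word Montgomery reduction.
# Assuming n0 == -mod^-1 mod 2^64, each of the 8 passes zeroes the lowest
# limb of the running value and shifts it right by one limb. A plain-Perl
# reference model using Math::BigInt might look like this:
#
#	sub reduce512_ref {
#		my ($t, $mod, $n0) = @_;	# $t: 1024-bit value, $mod: 512-bit modulus
#		my $base = Math::BigInt->new(2)->bpow(64);
#		for (1 .. 8) {
#			my $m = ($t->copy->bmod($base) * $n0)->bmod($base);
#			$t->badd($mod->copy->bmul($m));	# lowest limb becomes zero
#			$t->brsft(64);			# divide by 2^64
#		}
#		return $t;	# still < 2*mod; one conditional subtraction may remain
#	}
#
# The callers above load the low 8 limbs of the 1024-bit product into
# %r8-%r15, call the reduction, add the high 8 limbs from 64(%rsp)..120(%rsp),
# and leave the final conditional subtraction to __rsaz_512_subtract.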
{	# __rsaz_512_subtract
	# input:	%r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers:	everything but %rdi, %rsi and %rbp
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{	# __rsaz_512_mul
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
	# __rsaz_512_mulx
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	 mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	 movq	64($bp,%rcx,8), %rdx
	 movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
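# Editor's note (illustrative sketch, not used by this script): the
# rsaz_512_scatter4/rsaz_512_gather4 pair below store each 512-bit table
# entry as sixteen 32-bit words strided 64 bytes apart. In plain Perl the
# addressing works out to:
#
#	# limb $i (0..7) of table entry $power lives at these two 32-bit slots:
#	my $lo_off = 4*$power + 128*$i;		# low  32 bits of limb $i
#	my $hi_off = 4*$power + 128*$i + 64;	# high 32 bits of limb $i
#
# which matches the `leaq ($out,$power,4)' base and the 64-/128-byte steps
# in the loops below, as well as the inline scatter in rsaz_512_mul_scatter4.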
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,4), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movl	%eax, ($out)
	shrq	\$32, %rax
	movl	%eax, 64($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
	leaq	($inp,$power,4), $inp
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movl	($inp), %eax
	movl	64($inp), %r8d
	leaq	128($inp), $inp
	shlq	\$32, %r8
	or	%r8, %rax
	movq	%rax, ($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
	ret
.size	rsaz_512_gather4,.-rsaz_512_gather4
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;