1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# March 2015 11# 12# "Teaser" Montgomery multiplication module for ARMv8. Needs more 13# work. While it does improve RSA sign performance by 20-30% (less for 14# longer keys) on most processors, for some reason RSA2048 is not 15# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication 16# instruction issue rate is limited on processor in question, meaning 17# that dedicated squaring procedure is a must. Well, actually all 18# contemporary AArch64 processors seem to have limited multiplication 19# issue rate, i.e. they can't issue multiplication every cycle, which 20# explains moderate improvement coefficients in comparison to 21# compiler-generated code. Recall that compiler is instructed to use 22# umulh and therefore uses same amount of multiplication instructions 23# to do the job. Assembly's edge is to minimize number of "collateral" 24# instructions and of course instruction scheduling. 25# 26# April 2015 27# 28# Squaring procedure that handles lengths divisible by 8 improves 29# RSA/DSA performance by 25-40-60% depending on processor and key 30# length. Overall improvement coefficients are always positive in 31# comparison to compiler-generated code. On Cortex-A57 improvement 32# is still modest on longest key lengths, while others exhibit e.g. 33# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster 34# on Cortex-A57 and ~60-100% faster on others. 

$flavour = shift;
$output  = shift;

# Locate the shared perlasm ARM translator next to this script, falling
# back to the in-tree perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator.  Check the result of
# open: if spawning the filter fails, the generated assembly would
# otherwise be silently discarded.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Scratch-register layout for the generic path.  Registers x6-x17 and
# x19-x24 are used; x18 (the platform register under AAPCS64) is
# deliberately skipped.
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# Argument registers, named after the C prototype:
# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

# Generic Montgomery multiplication.  Dispatches to the dedicated
# squaring code when num%8==0 and to the 4x code when num%4==0;
# otherwise falls through to the one-word-at-a-time schoolbook loop.
$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	$lo0 being non-zero. So that carry can be calculated
	//	by adding -1 to $lo0. That's what next instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.

# Register layout for the squaring path: a[0..7] in x6-x13, four
# temporaries in x14-x17, eight accumulators in x19-x26 (x18, the
# AAPCS64 platform register, is skipped), loop counter/carry/top-most
# carry in x27/x28/x30.  $tp/$ap_end/$na0 alias argument registers and
# $carry, which are free at the points they are used.
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

# __bn_sqr8x_mont: dedicated squaring procedure, reached from
# bn_mul_mont when num%8==0.  It verifies ap==bp below and otherwise
# tail-calls __bn_mul4x_mont.  Allocates 16*num bytes of stack scratch
# (sub sp,...,lsl#4) for the double-width product, saves x19-x28 in a
# 128-byte frame and offloads rp/np/n0 into that frame.
$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//          a[1]a[0]	(i)
	//        a[2]a[0]
	//      a[3]a[0]
	//    a[4]a[0]
	//   a[5]a[0]
	//  a[6]a[0]
	// a[7]a[0]
	//        a[2]a[1]	(ii)
	//      a[3]a[1]
	//    a[4]a[1]
	//   a[5]a[1]
	//  a[6]a[1]
	// a[7]a[1]
	//      a[3]a[2]	(iii)
	//    a[4]a[2]
	//   a[5]a[2]
	//  a[6]a[2]
	// a[7]a[2]
	//    a[4]a[3]		(iv)
	//   a[5]a[3]
	//  a[6]a[3]
	// a[7]a[3]
	//   a[5]a[4]		(v)
	//  a[6]a[4]
	// a[7]a[4]
	//  a[6]a[5]		(vi)
	// a[7]a[5]
	// a[7]a[6]		(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewinded ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	//        a[8]a[0]
	//       a[9]a[0]
	//      a[a]a[0]
	//     a[b]a[0]
	//    a[c]a[0]
	//   a[d]a[0]
	//  a[e]a[0]
	// a[f]a[0]
	//        a[8]a[1]
	// a[f]a[1]........................
	//        a[8]a[2]
	// a[f]a[2]........................
	//        a[8]a[3]
	// a[f]a[3]........................
	//        a[8]a[4]
	// a[f]a[4]........................
	//        a[8]a[5]
	// a[f]a[5]........................
	//        a[8]a[6]
	// a[f]a[6]........................
	//        a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
# From here on $np/$np_end reuse the registers previously holding
# $ap/$ap_end; the pointers themselves are pulled back from the frame.
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewinded np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look as ARMv8 adaptation of mulx4x_mont from
# x86_64-mont5 module, it's different in sense that it performs
# reduction 256 bits at a time.

# Register layout for the 4x path; as above x18 is skipped.
my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	stp	x29,x30,[sp,#-128]!
1060 add x29,sp,#0 1061 stp x19,x20,[sp,#16] 1062 stp x21,x22,[sp,#32] 1063 stp x23,x24,[sp,#48] 1064 stp x25,x26,[sp,#64] 1065 stp x27,x28,[sp,#80] 1066 1067 sub $tp,sp,$num,lsl#3 1068 lsl $num,$num,#3 1069 ldr $n0,[$n0] // *n0 1070 sub sp,$tp,#8*4 // alloca 1071 1072 add $t0,$bp,$num 1073 add $ap_end,$ap,$num 1074 stp $rp,$t0,[x29,#96] // offload rp and &b[num] 1075 1076 ldr $bi,[$bp,#8*0] // b[0] 1077 ldp $a0,$a1,[$ap,#8*0] // a[0..3] 1078 ldp $a2,$a3,[$ap,#8*2] 1079 add $ap,$ap,#8*4 1080 mov $acc0,xzr 1081 mov $acc1,xzr 1082 mov $acc2,xzr 1083 mov $acc3,xzr 1084 ldp $m0,$m1,[$np,#8*0] // n[0..3] 1085 ldp $m2,$m3,[$np,#8*2] 1086 adds $np,$np,#8*4 // clear carry bit 1087 mov $carry,xzr 1088 mov $cnt,#0 1089 mov $tp,sp 1090 1091.Loop_mul4x_1st_reduction: 1092 mul $t0,$a0,$bi // lo(a[0..3]*b[0]) 1093 adc $carry,$carry,xzr // modulo-scheduled 1094 mul $t1,$a1,$bi 1095 add $cnt,$cnt,#8 1096 mul $t2,$a2,$bi 1097 and $cnt,$cnt,#31 1098 mul $t3,$a3,$bi 1099 adds $acc0,$acc0,$t0 1100 umulh $t0,$a0,$bi // hi(a[0..3]*b[0]) 1101 adcs $acc1,$acc1,$t1 1102 mul $mi,$acc0,$n0 // t[0]*n0 1103 adcs $acc2,$acc2,$t2 1104 umulh $t1,$a1,$bi 1105 adcs $acc3,$acc3,$t3 1106 umulh $t2,$a2,$bi 1107 adc $acc4,xzr,xzr 1108 umulh $t3,$a3,$bi 1109 ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) 1110 adds $acc1,$acc1,$t0 1111 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0) 1112 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing 1113 adcs $acc2,$acc2,$t1 1114 mul $t1,$m1,$mi 1115 adcs $acc3,$acc3,$t2 1116 mul $t2,$m2,$mi 1117 adc $acc4,$acc4,$t3 // can't overflow 1118 mul $t3,$m3,$mi 1119 // (*) adds xzr,$acc0,$t0 1120 subs xzr,$acc0,#1 // (*) 1121 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0) 1122 adcs $acc0,$acc1,$t1 1123 umulh $t1,$m1,$mi 1124 adcs $acc1,$acc2,$t2 1125 umulh $t2,$m2,$mi 1126 adcs $acc2,$acc3,$t3 1127 umulh $t3,$m3,$mi 1128 adcs $acc3,$acc4,$carry 1129 adc $carry,xzr,xzr 1130 adds $acc0,$acc0,$t0 1131 sub $t0,$ap_end,$ap 1132 adcs $acc1,$acc1,$t1 1133 adcs $acc2,$acc2,$t2 1134 adcs 
$acc3,$acc3,$t3 1135 //adc $carry,$carry,xzr 1136 cbnz $cnt,.Loop_mul4x_1st_reduction 1137 1138 cbz $t0,.Lmul4x4_post_condition 1139 1140 ldp $a0,$a1,[$ap,#8*0] // a[4..7] 1141 ldp $a2,$a3,[$ap,#8*2] 1142 add $ap,$ap,#8*4 1143 ldr $mi,[sp] // a[0]*n0 1144 ldp $m0,$m1,[$np,#8*0] // n[4..7] 1145 ldp $m2,$m3,[$np,#8*2] 1146 add $np,$np,#8*4 1147 1148.Loop_mul4x_1st_tail: 1149 mul $t0,$a0,$bi // lo(a[4..7]*b[i]) 1150 adc $carry,$carry,xzr // modulo-scheduled 1151 mul $t1,$a1,$bi 1152 add $cnt,$cnt,#8 1153 mul $t2,$a2,$bi 1154 and $cnt,$cnt,#31 1155 mul $t3,$a3,$bi 1156 adds $acc0,$acc0,$t0 1157 umulh $t0,$a0,$bi // hi(a[4..7]*b[i]) 1158 adcs $acc1,$acc1,$t1 1159 umulh $t1,$a1,$bi 1160 adcs $acc2,$acc2,$t2 1161 umulh $t2,$a2,$bi 1162 adcs $acc3,$acc3,$t3 1163 umulh $t3,$a3,$bi 1164 adc $acc4,xzr,xzr 1165 ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) 1166 adds $acc1,$acc1,$t0 1167 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0) 1168 adcs $acc2,$acc2,$t1 1169 mul $t1,$m1,$mi 1170 adcs $acc3,$acc3,$t2 1171 mul $t2,$m2,$mi 1172 adc $acc4,$acc4,$t3 // can't overflow 1173 mul $t3,$m3,$mi 1174 adds $acc0,$acc0,$t0 1175 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0) 1176 adcs $acc1,$acc1,$t1 1177 umulh $t1,$m1,$mi 1178 adcs $acc2,$acc2,$t2 1179 umulh $t2,$m2,$mi 1180 adcs $acc3,$acc3,$t3 1181 adcs $acc4,$acc4,$carry 1182 umulh $t3,$m3,$mi 1183 adc $carry,xzr,xzr 1184 ldr $mi,[sp,$cnt] // next t[0]*n0 1185 str $acc0,[$tp],#8 // result!!! 1186 adds $acc0,$acc1,$t0 1187 sub $t0,$ap_end,$ap // done yet? 1188 adcs $acc1,$acc2,$t1 1189 adcs $acc2,$acc3,$t2 1190 adcs $acc3,$acc4,$t3 1191 //adc $carry,$carry,xzr 1192 cbnz $cnt,.Loop_mul4x_1st_tail 1193 1194 sub $t1,$ap_end,$num // rewinded $ap 1195 cbz $t0,.Lmul4x_proceed 1196 1197 ldp $a0,$a1,[$ap,#8*0] 1198 ldp $a2,$a3,[$ap,#8*2] 1199 add $ap,$ap,#8*4 1200 ldp $m0,$m1,[$np,#8*0] 1201 ldp $m2,$m3,[$np,#8*2] 1202 add $np,$np,#8*4 1203 b .Loop_mul4x_1st_tail 1204 1205.align 5 1206.Lmul4x_proceed: 1207 ldr $bi,[$bp,#8*4]! 
// *++b 1208 adc $topmost,$carry,xzr 1209 ldp $a0,$a1,[$t1,#8*0] // a[0..3] 1210 sub $np,$np,$num // rewind np 1211 ldp $a2,$a3,[$t1,#8*2] 1212 add $ap,$t1,#8*4 1213 1214 stp $acc0,$acc1,[$tp,#8*0] // result!!! 1215 ldp $acc0,$acc1,[sp,#8*4] // t[0..3] 1216 stp $acc2,$acc3,[$tp,#8*2] // result!!! 1217 ldp $acc2,$acc3,[sp,#8*6] 1218 1219 ldp $m0,$m1,[$np,#8*0] // n[0..3] 1220 mov $tp,sp 1221 ldp $m2,$m3,[$np,#8*2] 1222 adds $np,$np,#8*4 // clear carry bit 1223 mov $carry,xzr 1224 1225.align 4 1226.Loop_mul4x_reduction: 1227 mul $t0,$a0,$bi // lo(a[0..3]*b[4]) 1228 adc $carry,$carry,xzr // modulo-scheduled 1229 mul $t1,$a1,$bi 1230 add $cnt,$cnt,#8 1231 mul $t2,$a2,$bi 1232 and $cnt,$cnt,#31 1233 mul $t3,$a3,$bi 1234 adds $acc0,$acc0,$t0 1235 umulh $t0,$a0,$bi // hi(a[0..3]*b[4]) 1236 adcs $acc1,$acc1,$t1 1237 mul $mi,$acc0,$n0 // t[0]*n0 1238 adcs $acc2,$acc2,$t2 1239 umulh $t1,$a1,$bi 1240 adcs $acc3,$acc3,$t3 1241 umulh $t2,$a2,$bi 1242 adc $acc4,xzr,xzr 1243 umulh $t3,$a3,$bi 1244 ldr $bi,[$bp,$cnt] // next b[i] 1245 adds $acc1,$acc1,$t0 1246 // (*) mul $t0,$m0,$mi 1247 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing 1248 adcs $acc2,$acc2,$t1 1249 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0 1250 adcs $acc3,$acc3,$t2 1251 mul $t2,$m2,$mi 1252 adc $acc4,$acc4,$t3 // can't overflow 1253 mul $t3,$m3,$mi 1254 // (*) adds xzr,$acc0,$t0 1255 subs xzr,$acc0,#1 // (*) 1256 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0 1257 adcs $acc0,$acc1,$t1 1258 umulh $t1,$m1,$mi 1259 adcs $acc1,$acc2,$t2 1260 umulh $t2,$m2,$mi 1261 adcs $acc2,$acc3,$t3 1262 umulh $t3,$m3,$mi 1263 adcs $acc3,$acc4,$carry 1264 adc $carry,xzr,xzr 1265 adds $acc0,$acc0,$t0 1266 adcs $acc1,$acc1,$t1 1267 adcs $acc2,$acc2,$t2 1268 adcs $acc3,$acc3,$t3 1269 //adc $carry,$carry,xzr 1270 cbnz $cnt,.Loop_mul4x_reduction 1271 1272 adc $carry,$carry,xzr 1273 ldp $t0,$t1,[$tp,#8*4] // t[4..7] 1274 ldp $t2,$t3,[$tp,#8*6] 1275 ldp $a0,$a1,[$ap,#8*0] // a[4..7] 1276 ldp $a2,$a3,[$ap,#8*2] 1277 add $ap,$ap,#8*4 1278 
adds $acc0,$acc0,$t0 1279 adcs $acc1,$acc1,$t1 1280 adcs $acc2,$acc2,$t2 1281 adcs $acc3,$acc3,$t3 1282 //adc $carry,$carry,xzr 1283 1284 ldr $mi,[sp] // t[0]*n0 1285 ldp $m0,$m1,[$np,#8*0] // n[4..7] 1286 ldp $m2,$m3,[$np,#8*2] 1287 add $np,$np,#8*4 1288 1289.align 4 1290.Loop_mul4x_tail: 1291 mul $t0,$a0,$bi // lo(a[4..7]*b[4]) 1292 adc $carry,$carry,xzr // modulo-scheduled 1293 mul $t1,$a1,$bi 1294 add $cnt,$cnt,#8 1295 mul $t2,$a2,$bi 1296 and $cnt,$cnt,#31 1297 mul $t3,$a3,$bi 1298 adds $acc0,$acc0,$t0 1299 umulh $t0,$a0,$bi // hi(a[4..7]*b[4]) 1300 adcs $acc1,$acc1,$t1 1301 umulh $t1,$a1,$bi 1302 adcs $acc2,$acc2,$t2 1303 umulh $t2,$a2,$bi 1304 adcs $acc3,$acc3,$t3 1305 umulh $t3,$a3,$bi 1306 adc $acc4,xzr,xzr 1307 ldr $bi,[$bp,$cnt] // next b[i] 1308 adds $acc1,$acc1,$t0 1309 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0) 1310 adcs $acc2,$acc2,$t1 1311 mul $t1,$m1,$mi 1312 adcs $acc3,$acc3,$t2 1313 mul $t2,$m2,$mi 1314 adc $acc4,$acc4,$t3 // can't overflow 1315 mul $t3,$m3,$mi 1316 adds $acc0,$acc0,$t0 1317 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0) 1318 adcs $acc1,$acc1,$t1 1319 umulh $t1,$m1,$mi 1320 adcs $acc2,$acc2,$t2 1321 umulh $t2,$m2,$mi 1322 adcs $acc3,$acc3,$t3 1323 umulh $t3,$m3,$mi 1324 adcs $acc4,$acc4,$carry 1325 ldr $mi,[sp,$cnt] // next a[0]*n0 1326 adc $carry,xzr,xzr 1327 str $acc0,[$tp],#8 // result!!! 1328 adds $acc0,$acc1,$t0 1329 sub $t0,$ap_end,$ap // done yet? 1330 adcs $acc1,$acc2,$t1 1331 adcs $acc2,$acc3,$t2 1332 adcs $acc3,$acc4,$t3 1333 //adc $carry,$carry,xzr 1334 cbnz $cnt,.Loop_mul4x_tail 1335 1336 sub $t1,$np,$num // rewinded np? 
1337 adc $carry,$carry,xzr 1338 cbz $t0,.Loop_mul4x_break 1339 1340 ldp $t0,$t1,[$tp,#8*4] 1341 ldp $t2,$t3,[$tp,#8*6] 1342 ldp $a0,$a1,[$ap,#8*0] 1343 ldp $a2,$a3,[$ap,#8*2] 1344 add $ap,$ap,#8*4 1345 adds $acc0,$acc0,$t0 1346 adcs $acc1,$acc1,$t1 1347 adcs $acc2,$acc2,$t2 1348 adcs $acc3,$acc3,$t3 1349 //adc $carry,$carry,xzr 1350 ldp $m0,$m1,[$np,#8*0] 1351 ldp $m2,$m3,[$np,#8*2] 1352 add $np,$np,#8*4 1353 b .Loop_mul4x_tail 1354 1355.align 4 1356.Loop_mul4x_break: 1357 ldp $t2,$t3,[x29,#96] // pull rp and &b[num] 1358 adds $acc0,$acc0,$topmost 1359 add $bp,$bp,#8*4 // bp++ 1360 adcs $acc1,$acc1,xzr 1361 sub $ap,$ap,$num // rewind ap 1362 adcs $acc2,$acc2,xzr 1363 stp $acc0,$acc1,[$tp,#8*0] // result!!! 1364 adcs $acc3,$acc3,xzr 1365 ldp $acc0,$acc1,[sp,#8*4] // t[0..3] 1366 adc $topmost,$carry,xzr 1367 stp $acc2,$acc3,[$tp,#8*2] // result!!! 1368 cmp $bp,$t3 // done yet? 1369 ldp $acc2,$acc3,[sp,#8*6] 1370 ldp $m0,$m1,[$t1,#8*0] // n[0..3] 1371 ldp $m2,$m3,[$t1,#8*2] 1372 add $np,$t1,#8*4 1373 b.eq .Lmul4x_post 1374 1375 ldr $bi,[$bp] 1376 ldp $a0,$a1,[$ap,#8*0] // a[0..3] 1377 ldp $a2,$a3,[$ap,#8*2] 1378 adds $ap,$ap,#8*4 // clear carry bit 1379 mov $carry,xzr 1380 mov $tp,sp 1381 b .Loop_mul4x_reduction 1382 1383.align 4 1384.Lmul4x_post: 1385 // Final step. We see if result is larger than modulus, and 1386 // if it is, subtract the modulus. But comparison implies 1387 // subtraction. So we subtract modulus, see if it borrowed, 1388 // and conditionally copy original value. 
1389 mov $rp,$t2 1390 mov $ap_end,$t2 // $rp copy 1391 subs $t0,$acc0,$m0 1392 add $tp,sp,#8*8 1393 sbcs $t1,$acc1,$m1 1394 sub $cnt,$num,#8*4 1395 1396.Lmul4x_sub: 1397 sbcs $t2,$acc2,$m2 1398 ldp $m0,$m1,[$np,#8*0] 1399 sub $cnt,$cnt,#8*4 1400 ldp $acc0,$acc1,[$tp,#8*0] 1401 sbcs $t3,$acc3,$m3 1402 ldp $m2,$m3,[$np,#8*2] 1403 add $np,$np,#8*4 1404 ldp $acc2,$acc3,[$tp,#8*2] 1405 add $tp,$tp,#8*4 1406 stp $t0,$t1,[$rp,#8*0] 1407 sbcs $t0,$acc0,$m0 1408 stp $t2,$t3,[$rp,#8*2] 1409 add $rp,$rp,#8*4 1410 sbcs $t1,$acc1,$m1 1411 cbnz $cnt,.Lmul4x_sub 1412 1413 sbcs $t2,$acc2,$m2 1414 mov $tp,sp 1415 add $ap,sp,#8*4 1416 ldp $a0,$a1,[$ap_end,#8*0] 1417 sbcs $t3,$acc3,$m3 1418 stp $t0,$t1,[$rp,#8*0] 1419 ldp $a2,$a3,[$ap_end,#8*2] 1420 stp $t2,$t3,[$rp,#8*2] 1421 ldp $acc0,$acc1,[$ap,#8*0] 1422 ldp $acc2,$acc3,[$ap,#8*2] 1423 sbcs xzr,$topmost,xzr // did it borrow? 1424 ldr x30,[x29,#8] // pull return address 1425 1426 sub $cnt,$num,#8*4 1427.Lmul4x_cond_copy: 1428 sub $cnt,$cnt,#8*4 1429 csel $t0,$acc0,$a0,lo 1430 stp xzr,xzr,[$tp,#8*0] 1431 csel $t1,$acc1,$a1,lo 1432 ldp $a0,$a1,[$ap_end,#8*4] 1433 ldp $acc0,$acc1,[$ap,#8*4] 1434 csel $t2,$acc2,$a2,lo 1435 stp xzr,xzr,[$tp,#8*2] 1436 add $tp,$tp,#8*4 1437 csel $t3,$acc3,$a3,lo 1438 ldp $a2,$a3,[$ap_end,#8*6] 1439 ldp $acc2,$acc3,[$ap,#8*6] 1440 add $ap,$ap,#8*4 1441 stp $t0,$t1,[$ap_end,#8*0] 1442 stp $t2,$t3,[$ap_end,#8*2] 1443 add $ap_end,$ap_end,#8*4 1444 cbnz $cnt,.Lmul4x_cond_copy 1445 1446 csel $t0,$acc0,$a0,lo 1447 stp xzr,xzr,[$tp,#8*0] 1448 csel $t1,$acc1,$a1,lo 1449 stp xzr,xzr,[$tp,#8*2] 1450 csel $t2,$acc2,$a2,lo 1451 stp xzr,xzr,[$tp,#8*3] 1452 csel $t3,$acc3,$a3,lo 1453 stp xzr,xzr,[$tp,#8*4] 1454 stp $t0,$t1,[$ap_end,#8*0] 1455 stp $t2,$t3,[$ap_end,#8*2] 1456 1457 b .Lmul4x_done 1458 1459.align 4 1460.Lmul4x4_post_condition: 1461 adc $carry,$carry,xzr 1462 ldr $ap,[x29,#96] // pull rp 1463 // $acc0-3,$carry hold result, $m0-7 hold modulus 1464 subs $a0,$acc0,$m0 1465 ldr x30,[x29,#8] // pull return 
address 1466 sbcs $a1,$acc1,$m1 1467 stp xzr,xzr,[sp,#8*0] 1468 sbcs $a2,$acc2,$m2 1469 stp xzr,xzr,[sp,#8*2] 1470 sbcs $a3,$acc3,$m3 1471 stp xzr,xzr,[sp,#8*4] 1472 sbcs xzr,$carry,xzr // did it borrow? 1473 stp xzr,xzr,[sp,#8*6] 1474 1475 // $a0-3 hold result-modulus 1476 csel $a0,$acc0,$a0,lo 1477 csel $a1,$acc1,$a1,lo 1478 csel $a2,$acc2,$a2,lo 1479 csel $a3,$acc3,$a3,lo 1480 stp $a0,$a1,[$ap,#8*0] 1481 stp $a2,$a3,[$ap,#8*2] 1482 1483.Lmul4x_done: 1484 ldp x19,x20,[x29,#16] 1485 mov sp,x29 1486 ldp x21,x22,[x29,#32] 1487 mov x0,#1 1488 ldp x23,x24,[x29,#48] 1489 ldp x25,x26,[x29,#64] 1490 ldp x27,x28,[x29,#80] 1491 ldr x29,[sp],#128 1492 ret 1493.size __bn_mul4x_mont,.-__bn_mul4x_mont 1494___ 1495} 1496$code.=<<___; 1497.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" 1498.align 4 1499___ 1500 1501print $code; 1502 1503close STDOUT; 1504