/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to.  The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed.  i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.]
 */

#include <openssl/bn.h>

#include <assert.h>

#include "internal.h"


#if defined(OPENSSL_WINDOWS) || defined(OPENSSL_NO_ASM) || \
    (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86))

#if defined(OPENSSL_WINDOWS)
#define alloca _alloca
#else
#include <alloca.h>
#endif

#ifdef BN_LLONG
#define mul_add(r, a, w, c) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)w * (a) + (r) + (c); \
    (r) = Lw(t); \
    (c) = Hw(t); \
  }

#define mul(r, a, w, c) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)w * (a) + (c); \
    (r) = Lw(t); \
    (c) = Hw(t); \
  }

#define sqr(r0, r1, a) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)(a) * (a); \
    (r0) = Lw(t); \
    (r1) = Hw(t); \
  }

#elif defined(BN_UMULT_LOHI)
#define mul_add(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, tmp = (a); \
    ret = (r); \
    BN_UMULT_LOHI(low, high, w, tmp); \
    ret += (c); \
    (c) = (ret < (c)) ? 1 : 0; \
    (c) += high; \
    ret += low; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define mul(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, ta = (a); \
    BN_UMULT_LOHI(low, high, w, ta); \
    ret = low + (c); \
    (c) = high; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define sqr(r0, r1, a) \
  { \
    BN_ULONG tmp = (a); \
    BN_UMULT_LOHI(r0, r1, tmp, tmp); \
  }

#elif defined(BN_UMULT_HIGH)
#define mul_add(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, tmp = (a); \
    ret = (r); \
    high = BN_UMULT_HIGH(w, tmp); \
    ret += (c); \
    low = (w) * tmp; \
    (c) = (ret < (c)) ? 1 : 0; \
    (c) += high; \
    ret += low; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define mul(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, ta = (a); \
    low = (w) * ta; \
    high = BN_UMULT_HIGH(w, ta); \
    ret = low + (c); \
    (c) = high; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define sqr(r0, r1, a) \
  { \
    BN_ULONG tmp = (a); \
    (r0) = tmp * tmp; \
    (r1) = BN_UMULT_HIGH(tmp, tmp); \
  }

#else
/*************************************************************
 * No long long type
 */

#define LBITS(a) ((a) & BN_MASK2l)
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)

#define LLBITS(a) ((a) & BN_MASKl)
#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)

#define mul64(l, h, bl, bh) \
  { \
    BN_ULONG m, m1, lt, ht; \
 \
    lt = l; \
    ht = h; \
    m = (bh) * (lt); \
    lt = (bl) * (lt); \
    m1 = (bl) * (ht); \
    ht = (bh) * (ht); \
    m = (m + m1) & BN_MASK2; \
    if (m < m1) \
      ht += L2HBITS((BN_ULONG)1); \
    ht += HBITS(m); \
    m1 = L2HBITS(m); \
    lt = (lt + m1) & BN_MASK2; \
    if (lt < m1) \
      ht++; \
    (l) = lt; \
    (h) = ht; \
  }

#define sqr64(lo, ho, in) \
  { \
    BN_ULONG l, h, m; \
 \
    h = (in); \
    l = LBITS(h); \
    h = HBITS(h); \
    m = (l) * (h); \
    l *= l; \
    h *= h; \
    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
    m = (m & BN_MASK2l) << (BN_BITS4 + 1); \
    l = (l + m) & BN_MASK2; \
    if (l < m) \
      h++; \
    (lo) = l; \
    (ho) = h; \
  }

#define mul_add(r, a, bl, bh, c) \
  { \
    BN_ULONG l, h; \
 \
    h = (a); \
    l = LBITS(h); \
    h = HBITS(h); \
    mul64(l, h, (bl), (bh)); \
 \
    /* non-multiply part */ \
    l = (l + (c)) & BN_MASK2; \
    if (l < (c)) \
      h++; \
    (c) = (r); \
    l = (l + (c)) & BN_MASK2; \
    if (l < (c)) \
      h++; \
    (c) = h & BN_MASK2; \
    (r) = l; \
  }

#define mul(r, a, bl, bh, c) \
  { \
    BN_ULONG l, h; \
 \
    h = (a); \
    l = LBITS(h); \
    h = HBITS(h); \
    mul64(l, h, (bl), (bh)); \
 \
    /* non-multiply part */ \
    l += (c); \
    if ((l & BN_MASK2) < (c)) \
      h++; \
    (c) = h & BN_MASK2; \
    (r) = l & BN_MASK2; \
  }
#endif /* !BN_LLONG */

#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  assert(num >= 0);
  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  while (num) {
    mul_add(rp[0], ap[0], w, c1);
    ap++;
    rp++;
    num--;
  }

  return c1;
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG c1 = 0;

  assert(num >= 0);
  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul(rp[0], ap[0], w, c1);
    ap++;
    rp++;
    num--;
  }
  return c1;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  assert(n >= 0);
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  while (n) {
    sqr(r[0], r[1], a[0]);
    a++;
    r += 2;
    n--;
  }
}

#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
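
/* These fallbacks are used when neither a double-width integer type nor a
 * high-half multiply intrinsic is available: the word-sized multiplier w is
 * split once into half-words (bl, bh) and the mul64/sqr64 macros above carry
 * out the widening arithmetic. */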

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c = 0;
  BN_ULONG bl, bh;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  bl = LBITS(w);
  bh = HBITS(w);

  while (num & ~3) {
    mul_add(rp[0], ap[0], bl, bh, c);
    mul_add(rp[1], ap[1], bl, bh, c);
    mul_add(rp[2], ap[2], bl, bh, c);
    mul_add(rp[3], ap[3], bl, bh, c);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul_add(rp[0], ap[0], bl, bh, c);
    ap++;
    rp++;
    num--;
  }
  return c;
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG carry = 0;
  BN_ULONG bl, bh;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  bl = LBITS(w);
  bh = HBITS(w);

  while (num & ~3) {
    mul(rp[0], ap[0], bl, bh, carry);
    mul(rp[1], ap[1], bl, bh, carry);
    mul(rp[2], ap[2], bl, bh, carry);
    mul(rp[3], ap[3], bl, bh, carry);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul(rp[0], ap[0], bl, bh, carry);
    ap++;
    rp++;
    num--;
  }
  return carry;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  assert(n >= 0);
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr64(r[0], r[1], a[0]);
    sqr64(r[2], r[3], a[1]);
    sqr64(r[4], r[5], a[2]);
    sqr64(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  while (n) {
    sqr64(r[0], r[1], a[0]);
    a++;
    r += 2;
    n--;
  }
}

#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */

#if defined(BN_LLONG) && defined(BN_DIV2W)

BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
}

#else

/* Divide h,l by d and return the result. */
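/* This generic version has no double-width type to lean on: it first
 * normalizes d so that its top bit is set, then performs two rounds of
 * schoolbook long division with half-word digits, each round producing
 * BN_BITS4 bits of the quotient. */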
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;

  if (d == 0) {
    return BN_MASK2;
  }

  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d;
  }

  if (i) {
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  dh = (d & BN_MASK2h) >> BN_BITS4;
  dl = (d & BN_MASK2l);
  for (;;) {
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }

    th = q * dh;
    tl = dl * q;
    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }
    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;

    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d;
      q--;
    }
    h -= th;

    if (--count == 0) {
      break;
    }

    ret = q << BN_BITS4;
    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    l = (l & BN_MASK2l) << BN_BITS4;
  }

  ret |= q;
  return ret;
}

#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */

#ifdef BN_LLONG
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULLONG ll = 0;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  while (n & ~3) {
    ll += (BN_ULLONG)a[0] + b[0];
    r[0] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[1] + b[1];
    r[1] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[2] + b[2];
    r[2] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[3] + b[3];
    r[3] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    ll += (BN_ULLONG)a[0] + b[0];
    r[0] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    a++;
    b++;
    r++;
    n--;
  }
  return (BN_ULONG)ll;
}

#else /* !BN_LLONG */

BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG c, l, t;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  c = 0;
  while (n & ~3) {
    t = a[0];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[0]) & BN_MASK2;
    c += (l < t);
    r[0] = l;
    t = a[1];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[1]) & BN_MASK2;
    c += (l < t);
    r[1] = l;
    t = a[2];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[2]) & BN_MASK2;
    c += (l < t);
    r[2] = l;
    t = a[3];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[3]) & BN_MASK2;
    c += (l < t);
    r[3] = l;
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    t = a[0];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[0]) & BN_MASK2;
    c += (l < t);
    r[0] = l;
    a++;
    b++;
    r++;
    n--;
  }
  return (BN_ULONG)c;
}

#endif /* !BN_LLONG */

BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG t1, t2;
  int c = 0;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  while (n & ~3) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    t1 = a[1];
    t2 = b[1];
    r[1] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    t1 = a[2];
    t2 = b[2];
    r[2] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    t1 = a[3];
    t2 = b[3];
    r[3] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    a++;
    b++;
    r++;
    n--;
  }
  return c;
}

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0) */

#ifdef BN_LLONG
#define mul_add_c(a, b, c0, c1, c2) \
  t = (BN_ULLONG)a * b; \
  t1 = (BN_ULONG)Lw(t); \
  t2 = (BN_ULONG)Hw(t); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0) < t1) \
    t2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define mul_add_c2(a, b, c0, c1, c2) \
  t = (BN_ULLONG)a * b; \
  tt = (t + t) & BN_MASK; \
  if (tt < t) \
    c2++; \
  t1 = (BN_ULONG)Lw(tt); \
  t2 = (BN_ULONG)Hw(tt); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
    c2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define sqr_add_c(a, i, c0, c1, c2) \
  t = (BN_ULLONG)a[i] * a[i]; \
  t1 = (BN_ULONG)Lw(t); \
  t2 = (BN_ULONG)Hw(t); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0) < t1) \
    t2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#elif defined(BN_UMULT_LOHI)

#define mul_add_c(a, b, c0, c1, c2) \
  { \
    BN_ULONG ta = (a), tb = (b); \
    BN_UMULT_LOHI(t1, t2, ta, tb); \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define mul_add_c2(a, b, c0, c1, c2) \
  { \
    BN_ULONG ta = (a), tb = (b), t0; \
    BN_UMULT_LOHI(t0, t1, ta, tb); \
    t2 = t1 + t1; \
    c2 += (t2 < t1) ? 1 : 0; \
    t1 = t0 + t0; \
    t2 += (t1 < t0) ? 1 : 0; \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define sqr_add_c(a, i, c0, c1, c2) \
  { \
    BN_ULONG ta = (a)[i]; \
    BN_UMULT_LOHI(t1, t2, ta, ta); \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#elif defined(BN_UMULT_HIGH)

#define mul_add_c(a, b, c0, c1, c2) \
  { \
    BN_ULONG ta = (a), tb = (b); \
    t1 = ta * tb; \
    t2 = BN_UMULT_HIGH(ta, tb); \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define mul_add_c2(a, b, c0, c1, c2) \
  { \
    BN_ULONG ta = (a), tb = (b), t0; \
    t1 = BN_UMULT_HIGH(ta, tb); \
    t0 = ta * tb; \
    t2 = t1 + t1; \
    c2 += (t2 < t1) ? 1 : 0; \
    t1 = t0 + t0; \
    t2 += (t1 < t0) ? 1 : 0; \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define sqr_add_c(a, i, c0, c1, c2) \
  { \
    BN_ULONG ta = (a)[i]; \
    t1 = ta * ta; \
    t2 = BN_UMULT_HIGH(ta, ta); \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#else /* !BN_LLONG */
#define mul_add_c(a, b, c0, c1, c2) \
  t1 = LBITS(a); \
  t2 = HBITS(a); \
  bl = LBITS(b); \
  bh = HBITS(b); \
  mul64(t1, t2, bl, bh); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0) < t1) \
    t2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define mul_add_c2(a, b, c0, c1, c2) \
  t1 = LBITS(a); \
  t2 = HBITS(a); \
  bl = LBITS(b); \
  bh = HBITS(b); \
  mul64(t1, t2, bl, bh); \
  if (t2 & BN_TBIT) \
    c2++; \
  t2 = (t2 + t2) & BN_MASK2; \
  if (t1 & BN_TBIT) \
    t2++; \
  t1 = (t1 + t1) & BN_MASK2; \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
    c2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define sqr_add_c(a, i, c0, c1, c2) \
  sqr64(t1, t2, (a)[i]); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0) < t1) \
    t2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#endif /* !BN_LLONG */

void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
#ifdef BN_LLONG
  BN_ULLONG t;
#else
  BN_ULONG bl, bh;
#endif
  BN_ULONG t1, t2;
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
#ifdef BN_LLONG
  BN_ULLONG t;
#else
  BN_ULONG bl, bh;
#endif
  BN_ULONG t1, t2;
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
#ifdef BN_LLONG
  BN_ULLONG t, tt;
#else
  BN_ULONG bl, bh;
#endif
  BN_ULONG t1, t2;
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
#ifdef BN_LLONG
  BN_ULLONG t, tt;
#else
  BN_ULONG bl, bh;
#endif
  BN_ULONG t1, t2;
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

#if defined(OPENSSL_NO_ASM) || \
    (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
/* Montgomery multiplication: computes rp[] = ap[] * bp[] / R mod np[], where
 * R = 2^(num * BN_BITS2) and *n0p holds -np^-1 mod 2^BN_BITS2.
 *
 * This is essentially a reference implementation, which may or may not
 * result in a performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private-key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 signing by only
 * 10% and *worsens* rsa4096 signing by 15%. Once again, it is a reference
 * implementation, one to be used as a starting point for platform-specific
 * assembly. The numbers above apply to compiler-generated code built with
 * and without -DOPENSSL_BN_ASM_MONT and can vary not only from platform to
 * platform but also across compiler versions. Assembly-vs-assembly
 * improvement coefficients can [and are known to] differ and are to be
 * documented elsewhere. */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
  BN_ULONG c0, c1, ml, *tp, n0;
#ifdef mul64
  BN_ULONG mh;
#endif
  volatile BN_ULONG *vp;
  int i = 0, j;

#if 0 /* template for platform-specific implementation */
  if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

  n0 = *n0p;

  c0 = 0;
  ml = bp[0];
#ifdef mul64
  mh = HBITS(ml);
  ml = LBITS(ml);
  for (j = 0; j < num; ++j)
    mul(tp[j], ap[j], ml, mh, c0);
#else
  for (j = 0; j < num; ++j)
    mul(tp[j], ap[j], ml, c0);
#endif

  tp[num] = c0;
  tp[num + 1] = 0;
  goto enter;

  for (i = 0; i < num; i++) {
    c0 = 0;
    ml = bp[i];
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
      mul_add(tp[j], ap[j], ml, mh, c0);
#else
    for (j = 0; j < num; ++j)
      mul_add(tp[j], ap[j], ml, c0);
#endif
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num] = c1;
    tp[num + 1] = (c1 < c0 ? 1 : 0);
  enter:
    c1 = tp[0];
    ml = (c1 * n0) & BN_MASK2;
    c0 = 0;
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    mul_add(c1, np[0], ml, mh, c0);
#else
    mul_add(c1, ml, np[0], c0);
#endif
    for (j = 1; j < num; j++) {
      c1 = tp[j];
#ifdef mul64
      mul_add(c1, np[j], ml, mh, c0);
#else
      mul_add(c1, ml, np[j], c0);
#endif
      tp[j - 1] = c1 & BN_MASK2;
    }
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num - 1] = c1;
    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
  }

  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
    c0 = bn_sub_words(rp, tp, np, num);
    if (tp[num] != 0 || c0 == 0) {
      for (i = 0; i < num + 2; i++)
        vp[i] = 0;
      return 1;
    }
  }
  for (i = 0; i < num; i++)
    rp[i] = tp[i], vp[i] = 0;
  vp[num] = 0;
  vp[num + 1] = 0;
  return 1;
}
#endif

#endif