/* x86_64-gcc.c revision 53b609c9db503488fc065728d214c9db67ad5740 */
#include <openssl/bn.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS)

#include "../internal.h"

/* x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features own ABI which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference.
 *    Win64 implements different ABI for AMD64, different from Linux.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 * For the reference. IA-32 assembler implementation performs
 * very much like 64-bit code compiled with no-asm on the same
 * machine.
 */

/* TODO(davidben): Get this file working on Windows x64. */

#undef mul
#undef mul_add

#define asm __asm__

/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */

/* mul_add(r, a, word, carry): computes r += a * word + carry, keeping the
 * low 64 bits of the result in r and leaving the high 64 bits (the new
 * carry word) in carry.  The two add/adc pairs propagate the carry from
 * the low-word additions into the high word of the 128-bit product. */
#define mul_add(r, a, word, carry)                                     \
  do {                                                                 \
    register BN_ULONG high, low;                                       \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+r"(carry), "+d"(high)                                      \
        : "a"(low), "g"(0)                                             \
        : "cc");                                                       \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+m"(r), "+d"(high)                                          \
        : "r"(carry), "g"(0)                                           \
        : "cc");                                                       \
    carry = high;                                                      \
  } while (0)

/* mul(r, a, word, carry): computes r = low64(a * word + carry) and sets
 * carry to the high 64 bits.  Unlike mul_add, r is overwritten rather
 * than accumulated into. */
#define mul(r, a, word, carry)                                         \
  do {                                                                 \
    register BN_ULONG high, low;                                       \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+r"(carry), "+d"(high)                                      \
        : "a"(low), "g"(0)                                             \
        : "cc");                                                       \
    (r) = carry, carry = high;                                         \
  } while (0)
#undef sqr
/* sqr(r0, r1, a): computes the 128-bit square a * a, with the low word
 * in r0 and the high word in r1 (mulq with rax as both factors). */
#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");

/* bn_mul_add_words computes rp[i] += ap[i] * w for num words, propagating
 * carries, and returns the final carry word.  The main loop is unrolled
 * four words at a time; the tail handles the remaining 1-3 words. */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num <= 0)
    return (c1);

  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul_add(rp[0], ap[0], w, c1);
    if (--num == 0)
      return c1;
    mul_add(rp[1], ap[1], w, c1);
    if (--num == 0)
      return c1;
    mul_add(rp[2], ap[2], w, c1);
    return c1;
  }

  return (c1);
}

/* bn_mul_words computes rp[i] = ap[i] * w (plus the carry from the
 * previous word) for num words and returns the final carry word.
 * Unrolled four words at a time like bn_mul_add_words. */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num <= 0)
    return (c1);

  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul(rp[0], ap[0], w, c1);
    if (--num == 0)
      return c1;
    mul(rp[1], ap[1], w, c1);
    if (--num == 0)
      return c1;
    mul(rp[2], ap[2], w, c1);
  }
  return (c1);
}

/* bn_sqr_words squares each of the n input words: for each i,
 * (r[2i+1], r[2i]) = a[i]^2 as a 128-bit value.  Note r must have room
 * for 2*n words. */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  if (n <= 0)
    return;

  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  if (n) {
    sqr(r[0], r[1], a[0]);
    if (--n == 0)
      return;
    sqr(r[2], r[3], a[1]);
    if (--n == 0)
      return;
    sqr(r[4], r[5], a[2]);
  }
}

/* bn_div_words returns the quotient of the 128-bit value (h:l) divided by
 * d, using the hardware divq instruction.  NOTE(review): divq raises #DE
 * if the quotient does not fit in 64 bits (or d == 0); callers are
 * presumably required to guarantee h < d — confirm against callers. */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG ret, waste;

  asm("divq %4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");

  return ret;
}

/* bn_add_words computes rp[i] = ap[i] + bp[i] for n words with carry
 * propagation (adcq) and returns the final carry bit (0 or 1).  The loop
 * keeps the count in rcx for the `loop` instruction and uses i as the
 * word index; `sbbq %0,%0` materializes the carry flag as 0/-1, masked
 * to 1 bit on return. */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0)
    return 0;

  asm volatile (
      "	subq	%0,%0		\n" /* clear carry */
      "	jmp	1f		\n"
      ".p2align 4			\n"
      "1:	movq	(%4,%2,8),%0	\n"
      "	adcq	(%5,%2,8),%0	\n"
      "	movq	%0,(%3,%2,8)	\n"
      "	lea	1(%2),%2	\n"
      "	loop	1b		\n"
      "	sbbq	%0,%0		\n"
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}

#ifndef SIMICS
/* bn_sub_words computes rp[i] = ap[i] - bp[i] for n words with borrow
 * propagation (sbbq) and returns the final borrow bit (0 or 1).
 * Structurally identical to bn_add_words with adcq replaced by sbbq. */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0)
    return 0;

  asm volatile (
      "	subq	%0,%0		\n" /* clear borrow */
      "	jmp	1f		\n"
      ".p2align 4			\n"
      "1:	movq	(%4,%2,8),%0	\n"
      "	sbbq	(%5,%2,8),%0	\n"
      "	movq	%0,(%3,%2,8)	\n"
      "	lea	1(%2),%2	\n"
      "	loop	1b		\n"
      "	sbbq	%0,%0		\n"
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
/* Portable C fallback for bn_sub_words: word-wise subtraction with an
 * explicit borrow variable c, unrolled four words per iteration.  The
 * `if (t1 != t2)` guard leaves the previous borrow in place when the
 * words are equal (the subtraction t1 - t2 - c then decides the borrow). */
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
  BN_ULONG t1, t2;
  int c = 0;

  if (n <= 0)
    return ((BN_ULONG)0);

  for (;;) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    if (--n <= 0)
      break;

    t1 = a[1];
    t2 = b[1];
    r[1] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    if (--n <= 0)
      break;

    t1 = a[2];
    t2 = b[2];
    r[2] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    if (--n <= 0)
      break;

    t1 = a[3];
    t2 = b[3];
    r[3] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    if (--n <= 0)
      break;

    a += 4;
    b += 4;
    r += 4;
  }
  return (c);
}
#endif

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
 */

/* Keep in mind that carrying into high part of multiplication result can not
 * overflow, because it cannot be all-ones. */
#define mul_add_c(a, b, c0, c1, c2)                                  \
  do {                                                               \
    BN_ULONG t1, t2;                                                 \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");     \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                         \
        : "+r"(c0), "+r"(c1), "+r"(c2)                               \
        : "r"(t1), "r"(t2), "g"(0)                                   \
        : "cc");                                                     \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2)                               \
  do {                                                            \
    BN_ULONG t1, t2;                                              \
    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"(a[i]) : "cc");       \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
        : "+r"(c0), "+r"(c1), "+r"(c2)                            \
        : "r"(t1), "r"(t2), "g"(0)                                \
        : "cc");                                                  \
  } while (0)

/* The product is added twice (two identical add/adc/adc chains), which
 * implements the c += 2*a*b accumulation without shifting. */
#define mul_add_c2(a, b, c0, c1, c2)                             \
  do {                                                           \
    BN_ULONG t1, t2;                                             \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                     \
        : "+r"(c0), "+r"(c1), "+r"(c2)                           \
        : "r"(t1), "r"(t2), "g"(0)                               \
        : "cc");                                                 \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                     \
        : "+r"(c0), "+r"(c1), "+r"(c2)                           \
        : "r"(t1), "r"(t2), "g"(0)                               \
        : "cc");                                                 \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

/* bn_mul_comba8: full 8x8-word comba multiplication, r[0..15] = a[0..7] *
 * b[0..7].  Each column k accumulates all partial products a[i]*b[j] with
 * i + j == k into the rotating three-word accumulator (c1,c2,c3). */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

/* bn_mul_comba4: 4x4-word comba multiplication, r[0..7] = a[0..3] *
 * b[0..3].  Same column-accumulation scheme as bn_mul_comba8. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

/* bn_sqr_comba8: 8-word comba squaring, r[0..15] = a[0..7]^2.  Diagonal
 * terms a[i]^2 use sqr_add_c; off-diagonal terms a[i]*a[j] (i > j) appear
 * twice in the square and use sqr_add_c2 (which adds the product twice). */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

/* bn_sqr_comba4: 4-word comba squaring, r[0..7] = a[0..3]^2.  Same
 * diagonal/off-diagonal scheme as bn_sqr_comba8. */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

#endif /* !NO_ASM && X86_64 && !WINDOWS */