1/* x86_64 BIGNUM accelerator version 0.1, December 2002.
2 *
3 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4 * project.
5 *
6 * Rights for redistribution and usage in source and binary forms are
7 * granted according to the OpenSSL license. Warranty of any kind is
8 * disclaimed.
9 *
10 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
11 *    versions, like 1.0...
12 * A. Well, that's because this code is basically a quick-n-dirty
13 *    proof-of-concept hack. As you can see it's implemented with
14 *    inline assembler, which means that you're bound to GCC and that
15 *    there might be enough room for further improvement.
16 *
17 * Q. Why inline assembler?
18 * A. x86_64 features own ABI which I'm not familiar with. This is
19 *    why I decided to let the compiler take care of subroutine
20 *    prologue/epilogue as well as register allocation. For reference.
21 *    Win64 implements different ABI for AMD64, different from Linux.
22 *
23 * Q. How much faster does it get?
24 * A. 'apps/openssl speed rsa dsa' output with no-asm:
25 *
26 *	                  sign    verify    sign/s verify/s
27 *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
28 *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
29 *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
30 *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
31 *	                  sign    verify    sign/s verify/s
32 *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
33 *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
34 *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
35 *
36 *    'apps/openssl speed rsa dsa' output with this module:
37 *
38 *	                  sign    verify    sign/s verify/s
39 *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
40 *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
41 *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
42 *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
43 *	                  sign    verify    sign/s verify/s
44 *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
45 *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
46 *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
47 *
48 *    For the reference. IA-32 assembler implementation performs
49 *    very much like 64-bit code compiled with no-asm on the same
50 *    machine.
51 */
52
53#include <openssl/bn.h>
54
55/* TODO(davidben): Get this file working on Windows x64. */
56#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
57
58#include "../internal.h"
59
60
61#undef mul
62#undef mul_add
63
64#define asm __asm__
65
66/*
67 * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
68 * "g"(0)		let the compiler to decide where does it
69 *			want to keep the value of zero;
70 */
71#define mul_add(r, a, word, carry)                                     \
72  do {                                                                 \
73    register BN_ULONG high, low;                                       \
74    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
75    asm("addq %2,%0; adcq %3,%1"                                       \
76        : "+r"(carry), "+d"(high)                                      \
77        : "a"(low), "g"(0)                                             \
78        : "cc");                                                       \
79    asm("addq %2,%0; adcq %3,%1"                                       \
80        : "+m"(r), "+d"(high)                                          \
81        : "r"(carry), "g"(0)                                           \
82        : "cc");                                                       \
83    (carry) = high;                                                    \
84  } while (0)
85
86#define mul(r, a, word, carry)                                         \
87  do {                                                                 \
88    register BN_ULONG high, low;                                       \
89    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
90    asm("addq %2,%0; adcq %3,%1"                                       \
91        : "+r"(carry), "+d"(high)                                      \
92        : "a"(low), "g"(0)                                             \
93        : "cc");                                                       \
94    (r) = (carry);                                                     \
95    (carry) = high;                                                    \
96  } while (0)
97#undef sqr
98#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
99
100BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
101                          BN_ULONG w) {
102  BN_ULONG c1 = 0;
103
104  if (num <= 0) {
105    return (c1);
106  }
107
108  while (num & ~3) {
109    mul_add(rp[0], ap[0], w, c1);
110    mul_add(rp[1], ap[1], w, c1);
111    mul_add(rp[2], ap[2], w, c1);
112    mul_add(rp[3], ap[3], w, c1);
113    ap += 4;
114    rp += 4;
115    num -= 4;
116  }
117  if (num) {
118    mul_add(rp[0], ap[0], w, c1);
119    if (--num == 0) {
120      return c1;
121    }
122    mul_add(rp[1], ap[1], w, c1);
123    if (--num == 0) {
124      return c1;
125    }
126    mul_add(rp[2], ap[2], w, c1);
127    return c1;
128  }
129
130  return c1;
131}
132
133BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
134  BN_ULONG c1 = 0;
135
136  if (num <= 0) {
137    return c1;
138  }
139
140  while (num & ~3) {
141    mul(rp[0], ap[0], w, c1);
142    mul(rp[1], ap[1], w, c1);
143    mul(rp[2], ap[2], w, c1);
144    mul(rp[3], ap[3], w, c1);
145    ap += 4;
146    rp += 4;
147    num -= 4;
148  }
149  if (num) {
150    mul(rp[0], ap[0], w, c1);
151    if (--num == 0) {
152      return c1;
153    }
154    mul(rp[1], ap[1], w, c1);
155    if (--num == 0) {
156      return c1;
157    }
158    mul(rp[2], ap[2], w, c1);
159  }
160  return c1;
161}
162
163void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
164  if (n <= 0) {
165    return;
166  }
167
168  while (n & ~3) {
169    sqr(r[0], r[1], a[0]);
170    sqr(r[2], r[3], a[1]);
171    sqr(r[4], r[5], a[2]);
172    sqr(r[6], r[7], a[3]);
173    a += 4;
174    r += 8;
175    n -= 4;
176  }
177  if (n) {
178    sqr(r[0], r[1], a[0]);
179    if (--n == 0) {
180      return;
181    }
182    sqr(r[2], r[3], a[1]);
183    if (--n == 0) {
184      return;
185    }
186    sqr(r[4], r[5], a[2]);
187  }
188}
189
190BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
191                      int n) {
192  BN_ULONG ret;
193  size_t i = 0;
194
195  if (n <= 0) {
196    return 0;
197  }
198
199  asm volatile (
200      "	subq	%0,%0		\n" /* clear carry */
201      "	jmp	1f		\n"
202      ".p2align 4			\n"
203      "1:	movq	(%4,%2,8),%0	\n"
204      "	adcq	(%5,%2,8),%0	\n"
205      "	movq	%0,(%3,%2,8)	\n"
206      "	lea	1(%2),%2	\n"
207      "	loop	1b		\n"
208      "	sbbq	%0,%0		\n"
209      : "=&r"(ret), "+c"(n), "+r"(i)
210      : "r"(rp), "r"(ap), "r"(bp)
211      : "cc", "memory");
212
213  return ret & 1;
214}
215
216BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
217                      int n) {
218  BN_ULONG ret;
219  size_t i = 0;
220
221  if (n <= 0) {
222    return 0;
223  }
224
225  asm volatile (
226      "	subq	%0,%0		\n" /* clear borrow */
227      "	jmp	1f		\n"
228      ".p2align 4			\n"
229      "1:	movq	(%4,%2,8),%0	\n"
230      "	sbbq	(%5,%2,8),%0	\n"
231      "	movq	%0,(%3,%2,8)	\n"
232      "	lea	1(%2),%2	\n"
233      "	loop	1b		\n"
234      "	sbbq	%0,%0		\n"
235      : "=&r"(ret), "+c"(n), "+r"(i)
236      : "r"(rp), "r"(ap), "r"(bp)
237      : "cc", "memory");
238
239  return ret & 1;
240}
241
242/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
243/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
244/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
245/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
246 */
247
248/* Keep in mind that carrying into high part of multiplication result can not
249 * overflow, because it cannot be all-ones. */
250#define mul_add_c(a, b, c0, c1, c2)          \
251  do {                                       \
252    BN_ULONG t1, t2;                \
253    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
254    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
255        : "+r"(c0), "+r"(c1), "+r"(c2)       \
256        : "r"(t1), "r"(t2), "g"(0)           \
257        : "cc");                             \
258  } while (0)
259
260#define sqr_add_c(a, i, c0, c1, c2)                           \
261  do {                                                        \
262    BN_ULONG t1, t2;                                          \
263    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
264    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                  \
265        : "+r"(c0), "+r"(c1), "+r"(c2)                        \
266        : "r"(t1), "r"(t2), "g"(0)                            \
267        : "cc");                                              \
268  } while (0)
269
270#define mul_add_c2(a, b, c0, c1, c2)         \
271  do {                                       \
272    BN_ULONG t1, t2;                                                    \
273    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");        \
274    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
275        : "+r"(c0), "+r"(c1), "+r"(c2)       \
276        : "r"(t1), "r"(t2), "g"(0)           \
277        : "cc");                             \
278    asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
279        : "+r"(c0), "+r"(c1), "+r"(c2)       \
280        : "r"(t1), "r"(t2), "g"(0)           \
281        : "cc");                             \
282  } while (0)
283
284#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
285
286void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
287  BN_ULONG c1, c2, c3;
288
289  c1 = 0;
290  c2 = 0;
291  c3 = 0;
292  mul_add_c(a[0], b[0], c1, c2, c3);
293  r[0] = c1;
294  c1 = 0;
295  mul_add_c(a[0], b[1], c2, c3, c1);
296  mul_add_c(a[1], b[0], c2, c3, c1);
297  r[1] = c2;
298  c2 = 0;
299  mul_add_c(a[2], b[0], c3, c1, c2);
300  mul_add_c(a[1], b[1], c3, c1, c2);
301  mul_add_c(a[0], b[2], c3, c1, c2);
302  r[2] = c3;
303  c3 = 0;
304  mul_add_c(a[0], b[3], c1, c2, c3);
305  mul_add_c(a[1], b[2], c1, c2, c3);
306  mul_add_c(a[2], b[1], c1, c2, c3);
307  mul_add_c(a[3], b[0], c1, c2, c3);
308  r[3] = c1;
309  c1 = 0;
310  mul_add_c(a[4], b[0], c2, c3, c1);
311  mul_add_c(a[3], b[1], c2, c3, c1);
312  mul_add_c(a[2], b[2], c2, c3, c1);
313  mul_add_c(a[1], b[3], c2, c3, c1);
314  mul_add_c(a[0], b[4], c2, c3, c1);
315  r[4] = c2;
316  c2 = 0;
317  mul_add_c(a[0], b[5], c3, c1, c2);
318  mul_add_c(a[1], b[4], c3, c1, c2);
319  mul_add_c(a[2], b[3], c3, c1, c2);
320  mul_add_c(a[3], b[2], c3, c1, c2);
321  mul_add_c(a[4], b[1], c3, c1, c2);
322  mul_add_c(a[5], b[0], c3, c1, c2);
323  r[5] = c3;
324  c3 = 0;
325  mul_add_c(a[6], b[0], c1, c2, c3);
326  mul_add_c(a[5], b[1], c1, c2, c3);
327  mul_add_c(a[4], b[2], c1, c2, c3);
328  mul_add_c(a[3], b[3], c1, c2, c3);
329  mul_add_c(a[2], b[4], c1, c2, c3);
330  mul_add_c(a[1], b[5], c1, c2, c3);
331  mul_add_c(a[0], b[6], c1, c2, c3);
332  r[6] = c1;
333  c1 = 0;
334  mul_add_c(a[0], b[7], c2, c3, c1);
335  mul_add_c(a[1], b[6], c2, c3, c1);
336  mul_add_c(a[2], b[5], c2, c3, c1);
337  mul_add_c(a[3], b[4], c2, c3, c1);
338  mul_add_c(a[4], b[3], c2, c3, c1);
339  mul_add_c(a[5], b[2], c2, c3, c1);
340  mul_add_c(a[6], b[1], c2, c3, c1);
341  mul_add_c(a[7], b[0], c2, c3, c1);
342  r[7] = c2;
343  c2 = 0;
344  mul_add_c(a[7], b[1], c3, c1, c2);
345  mul_add_c(a[6], b[2], c3, c1, c2);
346  mul_add_c(a[5], b[3], c3, c1, c2);
347  mul_add_c(a[4], b[4], c3, c1, c2);
348  mul_add_c(a[3], b[5], c3, c1, c2);
349  mul_add_c(a[2], b[6], c3, c1, c2);
350  mul_add_c(a[1], b[7], c3, c1, c2);
351  r[8] = c3;
352  c3 = 0;
353  mul_add_c(a[2], b[7], c1, c2, c3);
354  mul_add_c(a[3], b[6], c1, c2, c3);
355  mul_add_c(a[4], b[5], c1, c2, c3);
356  mul_add_c(a[5], b[4], c1, c2, c3);
357  mul_add_c(a[6], b[3], c1, c2, c3);
358  mul_add_c(a[7], b[2], c1, c2, c3);
359  r[9] = c1;
360  c1 = 0;
361  mul_add_c(a[7], b[3], c2, c3, c1);
362  mul_add_c(a[6], b[4], c2, c3, c1);
363  mul_add_c(a[5], b[5], c2, c3, c1);
364  mul_add_c(a[4], b[6], c2, c3, c1);
365  mul_add_c(a[3], b[7], c2, c3, c1);
366  r[10] = c2;
367  c2 = 0;
368  mul_add_c(a[4], b[7], c3, c1, c2);
369  mul_add_c(a[5], b[6], c3, c1, c2);
370  mul_add_c(a[6], b[5], c3, c1, c2);
371  mul_add_c(a[7], b[4], c3, c1, c2);
372  r[11] = c3;
373  c3 = 0;
374  mul_add_c(a[7], b[5], c1, c2, c3);
375  mul_add_c(a[6], b[6], c1, c2, c3);
376  mul_add_c(a[5], b[7], c1, c2, c3);
377  r[12] = c1;
378  c1 = 0;
379  mul_add_c(a[6], b[7], c2, c3, c1);
380  mul_add_c(a[7], b[6], c2, c3, c1);
381  r[13] = c2;
382  c2 = 0;
383  mul_add_c(a[7], b[7], c3, c1, c2);
384  r[14] = c3;
385  r[15] = c1;
386}
387
388void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
389  BN_ULONG c1, c2, c3;
390
391  c1 = 0;
392  c2 = 0;
393  c3 = 0;
394  mul_add_c(a[0], b[0], c1, c2, c3);
395  r[0] = c1;
396  c1 = 0;
397  mul_add_c(a[0], b[1], c2, c3, c1);
398  mul_add_c(a[1], b[0], c2, c3, c1);
399  r[1] = c2;
400  c2 = 0;
401  mul_add_c(a[2], b[0], c3, c1, c2);
402  mul_add_c(a[1], b[1], c3, c1, c2);
403  mul_add_c(a[0], b[2], c3, c1, c2);
404  r[2] = c3;
405  c3 = 0;
406  mul_add_c(a[0], b[3], c1, c2, c3);
407  mul_add_c(a[1], b[2], c1, c2, c3);
408  mul_add_c(a[2], b[1], c1, c2, c3);
409  mul_add_c(a[3], b[0], c1, c2, c3);
410  r[3] = c1;
411  c1 = 0;
412  mul_add_c(a[3], b[1], c2, c3, c1);
413  mul_add_c(a[2], b[2], c2, c3, c1);
414  mul_add_c(a[1], b[3], c2, c3, c1);
415  r[4] = c2;
416  c2 = 0;
417  mul_add_c(a[2], b[3], c3, c1, c2);
418  mul_add_c(a[3], b[2], c3, c1, c2);
419  r[5] = c3;
420  c3 = 0;
421  mul_add_c(a[3], b[3], c1, c2, c3);
422  r[6] = c1;
423  r[7] = c2;
424}
425
426void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
427  BN_ULONG c1, c2, c3;
428
429  c1 = 0;
430  c2 = 0;
431  c3 = 0;
432  sqr_add_c(a, 0, c1, c2, c3);
433  r[0] = c1;
434  c1 = 0;
435  sqr_add_c2(a, 1, 0, c2, c3, c1);
436  r[1] = c2;
437  c2 = 0;
438  sqr_add_c(a, 1, c3, c1, c2);
439  sqr_add_c2(a, 2, 0, c3, c1, c2);
440  r[2] = c3;
441  c3 = 0;
442  sqr_add_c2(a, 3, 0, c1, c2, c3);
443  sqr_add_c2(a, 2, 1, c1, c2, c3);
444  r[3] = c1;
445  c1 = 0;
446  sqr_add_c(a, 2, c2, c3, c1);
447  sqr_add_c2(a, 3, 1, c2, c3, c1);
448  sqr_add_c2(a, 4, 0, c2, c3, c1);
449  r[4] = c2;
450  c2 = 0;
451  sqr_add_c2(a, 5, 0, c3, c1, c2);
452  sqr_add_c2(a, 4, 1, c3, c1, c2);
453  sqr_add_c2(a, 3, 2, c3, c1, c2);
454  r[5] = c3;
455  c3 = 0;
456  sqr_add_c(a, 3, c1, c2, c3);
457  sqr_add_c2(a, 4, 2, c1, c2, c3);
458  sqr_add_c2(a, 5, 1, c1, c2, c3);
459  sqr_add_c2(a, 6, 0, c1, c2, c3);
460  r[6] = c1;
461  c1 = 0;
462  sqr_add_c2(a, 7, 0, c2, c3, c1);
463  sqr_add_c2(a, 6, 1, c2, c3, c1);
464  sqr_add_c2(a, 5, 2, c2, c3, c1);
465  sqr_add_c2(a, 4, 3, c2, c3, c1);
466  r[7] = c2;
467  c2 = 0;
468  sqr_add_c(a, 4, c3, c1, c2);
469  sqr_add_c2(a, 5, 3, c3, c1, c2);
470  sqr_add_c2(a, 6, 2, c3, c1, c2);
471  sqr_add_c2(a, 7, 1, c3, c1, c2);
472  r[8] = c3;
473  c3 = 0;
474  sqr_add_c2(a, 7, 2, c1, c2, c3);
475  sqr_add_c2(a, 6, 3, c1, c2, c3);
476  sqr_add_c2(a, 5, 4, c1, c2, c3);
477  r[9] = c1;
478  c1 = 0;
479  sqr_add_c(a, 5, c2, c3, c1);
480  sqr_add_c2(a, 6, 4, c2, c3, c1);
481  sqr_add_c2(a, 7, 3, c2, c3, c1);
482  r[10] = c2;
483  c2 = 0;
484  sqr_add_c2(a, 7, 4, c3, c1, c2);
485  sqr_add_c2(a, 6, 5, c3, c1, c2);
486  r[11] = c3;
487  c3 = 0;
488  sqr_add_c(a, 6, c1, c2, c3);
489  sqr_add_c2(a, 7, 5, c1, c2, c3);
490  r[12] = c1;
491  c1 = 0;
492  sqr_add_c2(a, 7, 6, c2, c3, c1);
493  r[13] = c2;
494  c2 = 0;
495  sqr_add_c(a, 7, c3, c1, c2);
496  r[14] = c3;
497  r[15] = c1;
498}
499
500void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
501  BN_ULONG c1, c2, c3;
502
503  c1 = 0;
504  c2 = 0;
505  c3 = 0;
506  sqr_add_c(a, 0, c1, c2, c3);
507  r[0] = c1;
508  c1 = 0;
509  sqr_add_c2(a, 1, 0, c2, c3, c1);
510  r[1] = c2;
511  c2 = 0;
512  sqr_add_c(a, 1, c3, c1, c2);
513  sqr_add_c2(a, 2, 0, c3, c1, c2);
514  r[2] = c3;
515  c3 = 0;
516  sqr_add_c2(a, 3, 0, c1, c2, c3);
517  sqr_add_c2(a, 2, 1, c1, c2, c3);
518  r[3] = c1;
519  c1 = 0;
520  sqr_add_c(a, 2, c2, c3, c1);
521  sqr_add_c2(a, 3, 1, c2, c3, c1);
522  r[4] = c2;
523  c2 = 0;
524  sqr_add_c2(a, 3, 2, c3, c1, c2);
525  r[5] = c3;
526  c3 = 0;
527  sqr_add_c(a, 3, c1, c2, c3);
528  r[6] = c1;
529  r[7] = c2;
530}
531
532#endif  /* !NO_ASM && X86_64 && __GNUC__ */
533