/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to.  The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed.  i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.] */

#include <openssl/bn.h>

#include <assert.h>

#include "internal.h"


/* Generic implementations of most operations are needed for:
 * - Configurations without inline assembly.
 * - Architectures other than x86 or x86_64.
 * - Windows x86_64; x86_64-gcc.c does not build on MSVC. */
#if defined(OPENSSL_NO_ASM) || \
    (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
    (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))

#ifdef BN_ULLONG
#define mul_add(r, a, w, c)             \
  {                                     \
    BN_ULLONG t;                        \
    t = (BN_ULLONG)w * (a) + (r) + (c); \
    (r) = Lw(t);                        \
    (c) = Hw(t);                        \
  }

#define mul(r, a, w, c)           \
  {                               \
    BN_ULLONG t;                  \
    t = (BN_ULLONG)w * (a) + (c); \
    (r) = Lw(t);                  \
    (c) = Hw(t);                  \
  }

#define sqr(r0, r1, a)        \
  {                           \
    BN_ULLONG t;              \
    t = (BN_ULLONG)(a) * (a); \
    (r0) = Lw(t);             \
    (r1) = Hw(t);             \
  }
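
/* For illustration only (assuming 32-bit BN_ULONG and 64-bit BN_ULLONG):
 * mul_add computes the double-width value t = w*a + r + c, stores its low
 * word back into r and its high word into the carry c. The sum cannot
 * overflow 64 bits, since even with maximal inputs
 * (2^32 - 1)^2 + 2*(2^32 - 1) = 2^64 - 1 exactly. */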

#elif defined(BN_UMULT_LOHI)
#define mul_add(r, a, w, c)             \
  {                                     \
    BN_ULONG high, low, ret, tmp = (a); \
    ret = (r);                          \
    BN_UMULT_LOHI(low, high, w, tmp);   \
    ret += (c);                         \
    (c) = (ret < (c)) ? 1 : 0;          \
    (c) += high;                        \
    ret += low;                         \
    (c) += (ret < low) ? 1 : 0;         \
    (r) = ret;                          \
  }

#define mul(r, a, w, c)                \
  {                                    \
    BN_ULONG high, low, ret, ta = (a); \
    BN_UMULT_LOHI(low, high, w, ta);   \
    ret = low + (c);                   \
    (c) = high;                        \
    (c) += (ret < low) ? 1 : 0;        \
    (r) = ret;                         \
  }

#define sqr(r0, r1, a)               \
  {                                  \
    BN_ULONG tmp = (a);              \
    BN_UMULT_LOHI(r0, r1, tmp, tmp); \
  }
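
/* The comparisons above are the usual unsigned carry test: after ret += x,
 * the addition wrapped modulo 2^BN_BITS2 exactly when ret < x. Illustrative
 * example with 32-bit words: 0xFFFFFFFF + 2 == 1, and 1 < 2, so the carry
 * bit is 1. */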

#else

/*************************************************************
 * No long long type
 */

#define LBITS(a) ((a) & BN_MASK2l)
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)

#define LLBITS(a) ((a) & BN_MASKl)
#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
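
/* Illustrative example (assuming 64-bit BN_ULONG, so BN_BITS4 == 32):
 *   LBITS(0x0123456789ABCDEF)   == 0x89ABCDEF          (low half-word)
 *   HBITS(0x0123456789ABCDEF)   == 0x01234567          (high half-word)
 *   L2HBITS(0x0000000089ABCDEF) == 0x89ABCDEF00000000  (low half moved up) */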

#define mul64(l, h, bl, bh)       \
  {                               \
    BN_ULONG m, m1, lt, ht;       \
                                  \
    lt = l;                       \
    ht = h;                       \
    m = (bh) * (lt);              \
    lt = (bl) * (lt);             \
    m1 = (bl) * (ht);             \
    ht = (bh) * (ht);             \
    m = (m + m1) & BN_MASK2;      \
    if (m < m1)                   \
      ht += L2HBITS((BN_ULONG)1); \
    ht += HBITS(m);               \
    m1 = L2HBITS(m);              \
    lt = (lt + m1) & BN_MASK2;    \
    if (lt < m1)                  \
      ht++;                       \
    (l) = lt;                     \
    (h) = ht;                     \
  }
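
/* mul64 is schoolbook multiplication on half-words. Writing B = 2^BN_BITS4,
 * with a = h*B + l and b = bh*B + bl, it computes
 *   a * b = (h*bh)*B^2 + (h*bl + l*bh)*B + (l*bl),
 * returning the low word of the product in l and the high word in h. A carry
 * out of the cross-term sum is folded into the high word as L2HBITS(1). */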

#define sqr64(lo, ho, in)                    \
  {                                          \
    BN_ULONG l, h, m;                        \
                                             \
    h = (in);                                \
    l = LBITS(h);                            \
    h = HBITS(h);                            \
    m = (l) * (h);                           \
    l *= l;                                  \
    h *= h;                                  \
    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
    m = (m & BN_MASK2l) << (BN_BITS4 + 1);   \
    l = (l + m) & BN_MASK2;                  \
    if (l < m)                               \
      h++;                                   \
    (lo) = l;                                \
    (ho) = h;                                \
  }

#define mul_add(r, a, bl, bh, c) \
  {                              \
    BN_ULONG l, h;               \
                                 \
    h = (a);                     \
    l = LBITS(h);                \
    h = HBITS(h);                \
    mul64(l, h, (bl), (bh));     \
                                 \
    /* non-multiply part */      \
    l = (l + (c)) & BN_MASK2;    \
    if (l < (c))                 \
      h++;                       \
    (c) = (r);                   \
    l = (l + (c)) & BN_MASK2;    \
    if (l < (c))                 \
      h++;                       \
    (c) = h & BN_MASK2;          \
    (r) = l;                     \
  }

#define mul(r, a, bl, bh, c)  \
  {                           \
    BN_ULONG l, h;            \
                              \
    h = (a);                  \
    l = LBITS(h);             \
    h = HBITS(h);             \
    mul64(l, h, (bl), (bh));  \
                              \
    /* non-multiply part */   \
    l += (c);                 \
    if ((l & BN_MASK2) < (c)) \
      h++;                    \
    (c) = h & BN_MASK2;       \
    (r) = l & BN_MASK2;       \
  }
#endif /* !BN_ULLONG */

#if defined(BN_ULLONG) || defined(BN_UMULT_LOHI)

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  assert(num >= 0);
  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  while (num) {
    mul_add(rp[0], ap[0], w, c1);
    ap++;
    rp++;
    num--;
  }

  return c1;
}
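
/* Illustrative usage sketch (not code from this file): multiply the four-word
 * number |a| by a single word |w| and accumulate the product into |r|, keeping
 * the final carry as a fifth word.
 *
 *   BN_ULONG r[5] = {0};
 *   BN_ULONG a[4] = {1, 2, 3, 4};
 *   BN_ULONG w = 10;
 *   r[4] = bn_mul_add_words(r, a, 4, w);  // r now holds a * w in five words
 */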

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG c1 = 0;

  assert(num >= 0);
  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul(rp[0], ap[0], w, c1);
    ap++;
    rp++;
    num--;
  }
  return c1;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  assert(n >= 0);
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  while (n) {
    sqr(r[0], r[1], a[0]);
    a++;
    r += 2;
    n--;
  }
}

#else /* !(defined(BN_ULLONG) || defined(BN_UMULT_LOHI)) */

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c = 0;
  BN_ULONG bl, bh;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  bl = LBITS(w);
  bh = HBITS(w);

  while (num & ~3) {
    mul_add(rp[0], ap[0], bl, bh, c);
    mul_add(rp[1], ap[1], bl, bh, c);
    mul_add(rp[2], ap[2], bl, bh, c);
    mul_add(rp[3], ap[3], bl, bh, c);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul_add(rp[0], ap[0], bl, bh, c);
    ap++;
    rp++;
    num--;
  }
  return c;
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG carry = 0;
  BN_ULONG bl, bh;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  bl = LBITS(w);
  bh = HBITS(w);

  while (num & ~3) {
    mul(rp[0], ap[0], bl, bh, carry);
    mul(rp[1], ap[1], bl, bh, carry);
    mul(rp[2], ap[2], bl, bh, carry);
    mul(rp[3], ap[3], bl, bh, carry);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul(rp[0], ap[0], bl, bh, carry);
    ap++;
    rp++;
    num--;
  }
  return carry;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  assert(n >= 0);
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr64(r[0], r[1], a[0]);
    sqr64(r[2], r[3], a[1]);
    sqr64(r[4], r[5], a[2]);
    sqr64(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  while (n) {
    sqr64(r[0], r[1], a[0]);
    a++;
    r += 2;
    n--;
  }
}

#endif /* !(defined(BN_ULLONG) || defined(BN_UMULT_LOHI)) */

#if defined(BN_ULLONG)

BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
}

#else

/* Divide h,l by d and return the result. */
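/* Without a double-width type, the quotient is formed by schoolbook long
 * division in base 2^BN_BITS4 (summary of the code below): normalize so that
 * the top bit of d is set, then for each of the two half-word quotient digits
 * estimate q from the high halves as h / dh, correct the estimate downward
 * until q*d no longer exceeds the current remainder, subtract, and shift the
 * next half-word of l into h. */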
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;

  if (d == 0) {
    return BN_MASK2;
  }

  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d;
  }

  if (i) {
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  dh = (d & BN_MASK2h) >> BN_BITS4;
  dl = (d & BN_MASK2l);
  for (;;) {
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }

    th = q * dh;
    tl = dl * q;
    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }
    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;

    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d;
      q--;
    }
    h -= th;

    if (--count == 0) {
      break;
    }

    ret = q << BN_BITS4;
    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    l = (l & BN_MASK2l) << BN_BITS4;
  }

  ret |= q;
  return ret;
}

#endif /* !defined(BN_ULLONG) */

#ifdef BN_ULLONG
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULLONG ll = 0;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  while (n & ~3) {
    ll += (BN_ULLONG)a[0] + b[0];
    r[0] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[1] + b[1];
    r[1] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[2] + b[2];
    r[2] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[3] + b[3];
    r[3] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    ll += (BN_ULLONG)a[0] + b[0];
    r[0] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    a++;
    b++;
    r++;
    n--;
  }
  return (BN_ULONG)ll;
}

#else /* !BN_ULLONG */

BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG c, l, t;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  c = 0;
  while (n & ~3) {
    t = a[0];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[0]) & BN_MASK2;
    c += (l < t);
    r[0] = l;
    t = a[1];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[1]) & BN_MASK2;
    c += (l < t);
    r[1] = l;
    t = a[2];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[2]) & BN_MASK2;
    c += (l < t);
    r[2] = l;
    t = a[3];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[3]) & BN_MASK2;
    c += (l < t);
    r[3] = l;
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    t = a[0];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[0]) & BN_MASK2;
    c += (l < t);
    r[0] = l;
    a++;
    b++;
    r++;
    n--;
  }
  return (BN_ULONG)c;
}

#endif /* !BN_ULLONG */

BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG t1, t2;
  int c = 0;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  while (n & ~3) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    t1 = a[1];
    t2 = b[1];
    r[1] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    t1 = a[2];
    t2 = b[2];
    r[2] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    t1 = a[3];
    t2 = b[3];
    r[3] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    a++;
    b++;
    r++;
    n--;
  }
  return c;
}

/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
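
/* (c2,c1,c0) is a three-word accumulator with value
 * c2*2^(2*BN_BITS2) + c1*2^BN_BITS2 + c0; each of the macros defined below
 * adds one double-width product into it. The comba routines further down emit
 * one result word per column and rotate the roles of the accumulator words. */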

#ifdef BN_ULLONG

/* Keep in mind that the carry additions into a multiplication result cannot
 * overflow: for w-bit words, a*b <= (2^w - 1)^2 = 2^(2w) - 2^(w+1) + 1, so the
 * high word of the product is at most 2^w - 2 and is never all-ones. */
#define mul_add_c(a, b, c0, c1, c2)     \
  do {                                  \
    BN_ULONG hi;                        \
    BN_ULLONG t = (BN_ULLONG)(a) * (b); \
    t += c0; /* no carry */             \
    c0 = (BN_ULONG)Lw(t);               \
    hi = (BN_ULONG)Hw(t);               \
    c1 = (c1 + hi) & BN_MASK2;          \
    if (c1 < hi)                        \
      c2++;                             \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2)      \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
    BN_ULLONG tt = t + c0; /* no carry */ \
    c0 = (BN_ULONG)Lw(tt);                \
    hi = (BN_ULONG)Hw(tt);                \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2)       \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#elif defined(BN_UMULT_LOHI)

/* Keep in mind that additions to hi cannot overflow, because the high word of
 * a multiplication result cannot be all-ones. */
#define mul_add_c(a, b, c0, c1, c2) \
  do {                              \
    BN_ULONG ta = (a), tb = (b);    \
    BN_ULONG lo, hi;                \
    BN_UMULT_LOHI(lo, hi, ta, tb);  \
    c0 += lo;                       \
    hi += (c0 < lo) ? 1 : 0;        \
    c1 += hi;                       \
    c2 += (c1 < hi) ? 1 : 0;        \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2) \
  do {                               \
    BN_ULONG ta = (a), tb = (b);     \
    BN_ULONG lo, hi, tt;             \
    BN_UMULT_LOHI(lo, hi, ta, tb);   \
    c0 += lo;                        \
    tt = hi + ((c0 < lo) ? 1 : 0);   \
    c1 += tt;                        \
    c2 += (c1 < tt) ? 1 : 0;         \
    c0 += lo;                        \
    hi += (c0 < lo) ? 1 : 0;         \
    c1 += hi;                        \
    c2 += (c1 < hi) ? 1 : 0;         \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2) \
  do {                              \
    BN_ULONG ta = (a)[i];           \
    BN_ULONG lo, hi;                \
    BN_UMULT_LOHI(lo, hi, ta, ta);  \
    c0 += lo;                       \
    hi += (c0 < lo) ? 1 : 0;        \
    c1 += hi;                       \
    c2 += (c1 < hi) ? 1 : 0;        \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#else /* !BN_ULLONG */

/* Keep in mind that additions to hi cannot overflow, because the high word of
 * a multiplication result cannot be all-ones. */

#define mul_add_c(a, b, c0, c1, c2)        \
  do {                                     \
    BN_ULONG lo = LBITS(a), hi = HBITS(a); \
    BN_ULONG bl = LBITS(b), bh = HBITS(b); \
    mul64(lo, hi, bl, bh);                 \
    c0 = (c0 + lo) & BN_MASK2;             \
    if (c0 < lo)                           \
      hi++;                                \
    c1 = (c1 + hi) & BN_MASK2;             \
    if (c1 < hi)                           \
      c2++;                                \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2)       \
  do {                                     \
    BN_ULONG tt;                           \
    BN_ULONG lo = LBITS(a), hi = HBITS(a); \
    BN_ULONG bl = LBITS(b), bh = HBITS(b); \
    mul64(lo, hi, bl, bh);                 \
    tt = hi;                               \
    c0 = (c0 + lo) & BN_MASK2;             \
    if (c0 < lo)                           \
      tt++;                                \
    c1 = (c1 + tt) & BN_MASK2;             \
    if (c1 < tt)                           \
      c2++;                                \
    c0 = (c0 + lo) & BN_MASK2;             \
    if (c0 < lo)                           \
      hi++;                                \
    c1 = (c1 + hi) & BN_MASK2;             \
    if (c1 < hi)                           \
      c2++;                                \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2) \
  do {                              \
    BN_ULONG lo, hi;                \
    sqr64(lo, hi, (a)[i]);          \
    c0 = (c0 + lo) & BN_MASK2;      \
    if (c0 < lo)                    \
      hi++;                         \
    c1 = (c1 + hi) & BN_MASK2;      \
    if (c1 < hi)                    \
      c2++;                         \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#endif /* !BN_ULLONG */

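/* The comba routines below produce the result column by column: r[k] receives
 * the sum of all products a[i]*b[j] with i + j == k, plus the carries from the
 * previous column, while the three accumulator words rotate roles. A loop-based
 * sketch of the same computation (illustrative only; the real code is fully
 * unrolled):
 *
 *   for (k = 0; k < 15; k++) {
 *     for (i = 0; i <= k; i++) {
 *       if (i < 8 && k - i < 8) {
 *         mul_add_c(a[i], b[k - i], c1, c2, c3);
 *       }
 *     }
 *     r[k] = c1;
 *     c1 = c2;
 *     c2 = c3;
 *     c3 = 0;
 *   }
 *   r[15] = c1;
 */
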
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
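
/* Illustrative note: bn_sqr_comba4 writes the 8-word square of a 4-word input
 * and is equivalent to bn_mul_comba4(r, a, a), except that each pair of equal
 * cross products a[i]*a[j] and a[j]*a[i] is computed once and doubled via
 * sqr_add_c2. A usage sketch (not code from this file):
 *
 *   BN_ULONG a[4] = {1, 2, 3, 4};
 *   BN_ULONG r[8];
 *   bn_sqr_comba4(r, a);  // r holds a*a as eight little-endian words
 */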

#endif