1/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2 * All rights reserved.
3 *
4 * This package is an SSL implementation written
5 * by Eric Young (eay@cryptsoft.com).
6 * The implementation was written so as to conform with Netscapes SSL.
7 *
8 * This library is free for commercial and non-commercial use as long as
9 * the following conditions are aheared to.  The following conditions
10 * apply to all code found in this distribution, be it the RC4, RSA,
11 * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
12 * included with this distribution is covered by the same copyright terms
13 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
14 *
15 * Copyright remains Eric Young's, and as such any Copyright notices in
16 * the code are not to be removed.
17 * If this package is used in a product, Eric Young should be given attribution
18 * as the author of the parts of the library used.
19 * This can be in the form of a textual message at program startup or
20 * in documentation (online or textual) provided with the package.
21 *
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions
24 * are met:
25 * 1. Redistributions of source code must retain the copyright
26 *    notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 *    notice, this list of conditions and the following disclaimer in the
29 *    documentation and/or other materials provided with the distribution.
30 * 3. All advertising materials mentioning features or use of this software
31 *    must display the following acknowledgement:
32 *    "This product includes cryptographic software written by
33 *     Eric Young (eay@cryptsoft.com)"
34 *    The word 'cryptographic' can be left out if the rouines from the library
35 *    being used are not cryptographic related :-).
36 * 4. If you include any Windows specific code (or a derivative thereof) from
37 *    the apps directory (application code) you must include an acknowledgement:
38 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39 *
40 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * The licence and distribution terms for any publically available version or
53 * derivative of this code cannot be changed.  i.e. this code cannot simply be
54 * copied and put under another distribution licence
55 * [including the GNU Public Licence.] */
56
57#include <openssl/bn.h>
58
59#include <assert.h>
60
61#include "internal.h"
62
63
64#if defined(OPENSSL_WINDOWS) || defined(OPENSSL_NO_ASM) || \
65    (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86))
66
67#if defined(OPENSSL_WINDOWS)
68#define alloca _alloca
69#else
70#include <alloca.h>
71#endif
72
73#ifdef BN_LLONG
74#define mul_add(r, a, w, c)             \
75  {                                     \
76    BN_ULLONG t;                        \
77    t = (BN_ULLONG)w * (a) + (r) + (c); \
78    (r) = Lw(t);                        \
79    (c) = Hw(t);                        \
80  }
81
82#define mul(r, a, w, c)           \
83  {                               \
84    BN_ULLONG t;                  \
85    t = (BN_ULLONG)w * (a) + (c); \
86    (r) = Lw(t);                  \
87    (c) = Hw(t);                  \
88  }
89
90#define sqr(r0, r1, a)        \
91  {                           \
92    BN_ULLONG t;              \
93    t = (BN_ULLONG)(a) * (a); \
94    (r0) = Lw(t);             \
95    (r1) = Hw(t);             \
96  }
97
98#elif defined(BN_UMULT_LOHI)
99#define mul_add(r, a, w, c)             \
100  {                                     \
101    BN_ULONG high, low, ret, tmp = (a); \
102    ret = (r);                          \
103    BN_UMULT_LOHI(low, high, w, tmp);   \
104    ret += (c);                         \
105    (c) = (ret < (c)) ? 1 : 0;          \
106    (c) += high;                        \
107    ret += low;                         \
108    (c) += (ret < low) ? 1 : 0;         \
109    (r) = ret;                          \
110  }
111
112#define mul(r, a, w, c)                \
113  {                                    \
114    BN_ULONG high, low, ret, ta = (a); \
115    BN_UMULT_LOHI(low, high, w, ta);   \
116    ret = low + (c);                   \
117    (c) = high;                        \
118    (c) += (ret < low) ? 1 : 0;        \
119    (r) = ret;                         \
120  }
121
122#define sqr(r0, r1, a)               \
123  {                                  \
124    BN_ULONG tmp = (a);              \
125    BN_UMULT_LOHI(r0, r1, tmp, tmp); \
126  }
127
128#elif defined(BN_UMULT_HIGH)
129#define mul_add(r, a, w, c)             \
130  {                                     \
131    BN_ULONG high, low, ret, tmp = (a); \
132    ret = (r);                          \
133    high = BN_UMULT_HIGH(w, tmp);       \
134    ret += (c);                         \
135    low = (w) * tmp;                    \
136    (c) = (ret < (c)) ? 1 : 0;          \
137    (c) += high;                        \
138    ret += low;                         \
139    (c) += (ret < low) ? 1 : 0;         \
140    (r) = ret;                          \
141  }
142
143#define mul(r, a, w, c)                \
144  {                                    \
145    BN_ULONG high, low, ret, ta = (a); \
146    low = (w) * ta;                    \
147    high = BN_UMULT_HIGH(w, ta);       \
148    ret = low + (c);                   \
149    (c) = high;                        \
150    (c) += (ret < low) ? 1 : 0;        \
151    (r) = ret;                         \
152  }
153
154#define sqr(r0, r1, a)              \
155  {                                 \
156    BN_ULONG tmp = (a);             \
157    (r0) = tmp * tmp;               \
158    (r1) = BN_UMULT_HIGH(tmp, tmp); \
159  }
160
161#else
162/*************************************************************
163 * No long long type
164 */
165
166#define LBITS(a) ((a) & BN_MASK2l)
167#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
168#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
169
170#define LLBITS(a) ((a) & BN_MASKl)
171#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
172#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
173
174#define mul64(l, h, bl, bh)       \
175  {                               \
176    BN_ULONG m, m1, lt, ht;       \
177                                  \
178    lt = l;                       \
179    ht = h;                       \
180    m = (bh) * (lt);              \
181    lt = (bl) * (lt);             \
182    m1 = (bl) * (ht);             \
183    ht = (bh) * (ht);             \
184    m = (m + m1) & BN_MASK2;      \
185    if (m < m1)                   \
186      ht += L2HBITS((BN_ULONG)1); \
187    ht += HBITS(m);               \
188    m1 = L2HBITS(m);              \
189    lt = (lt + m1) & BN_MASK2;    \
190    if (lt < m1)                  \
191      ht++;                       \
192    (l) = lt;                     \
193    (h) = ht;                     \
194  }
195
196#define sqr64(lo, ho, in)                    \
197  {                                          \
198    BN_ULONG l, h, m;                        \
199                                             \
200    h = (in);                                \
201    l = LBITS(h);                            \
202    h = HBITS(h);                            \
203    m = (l) * (h);                           \
204    l *= l;                                  \
205    h *= h;                                  \
206    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
207    m = (m & BN_MASK2l) << (BN_BITS4 + 1);   \
208    l = (l + m) & BN_MASK2;                  \
209    if (l < m)                               \
210      h++;                                   \
211    (lo) = l;                                \
212    (ho) = h;                                \
213  }
214
215#define mul_add(r, a, bl, bh, c) \
216  {                              \
217    BN_ULONG l, h;               \
218                                 \
219    h = (a);                     \
220    l = LBITS(h);                \
221    h = HBITS(h);                \
222    mul64(l, h, (bl), (bh));     \
223                                 \
224    /* non-multiply part */      \
225    l = (l + (c)) & BN_MASK2;    \
226    if (l < (c))                 \
227      h++;                       \
228    (c) = (r);                   \
229    l = (l + (c)) & BN_MASK2;    \
230    if (l < (c))                 \
231      h++;                       \
232    (c) = h & BN_MASK2;          \
233    (r) = l;                     \
234  }
235
236#define mul(r, a, bl, bh, c)  \
237  {                           \
238    BN_ULONG l, h;            \
239                              \
240    h = (a);                  \
241    l = LBITS(h);             \
242    h = HBITS(h);             \
243    mul64(l, h, (bl), (bh));  \
244                              \
245    /* non-multiply part */   \
246    l += (c);                 \
247    if ((l & BN_MASK2) < (c)) \
248      h++;                    \
249    (c) = h & BN_MASK2;       \
250    (r) = l & BN_MASK2;       \
251  }
252#endif /* !BN_LLONG */
253
254#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
255
256BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
257                          BN_ULONG w) {
258  BN_ULONG c1 = 0;
259
260  assert(num >= 0);
261  if (num <= 0) {
262    return c1;
263  }
264
265  while (num & ~3) {
266    mul_add(rp[0], ap[0], w, c1);
267    mul_add(rp[1], ap[1], w, c1);
268    mul_add(rp[2], ap[2], w, c1);
269    mul_add(rp[3], ap[3], w, c1);
270    ap += 4;
271    rp += 4;
272    num -= 4;
273  }
274
275  while (num) {
276    mul_add(rp[0], ap[0], w, c1);
277    ap++;
278    rp++;
279    num--;
280  }
281
282  return c1;
283}
284
285BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
286  BN_ULONG c1 = 0;
287
288  assert(num >= 0);
289  if (num <= 0) {
290    return c1;
291  }
292
293  while (num & ~3) {
294    mul(rp[0], ap[0], w, c1);
295    mul(rp[1], ap[1], w, c1);
296    mul(rp[2], ap[2], w, c1);
297    mul(rp[3], ap[3], w, c1);
298    ap += 4;
299    rp += 4;
300    num -= 4;
301  }
302  while (num) {
303    mul(rp[0], ap[0], w, c1);
304    ap++;
305    rp++;
306    num--;
307  }
308  return c1;
309}
310
311void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
312  assert(n >= 0);
313  if (n <= 0) {
314    return;
315  }
316
317  while (n & ~3) {
318    sqr(r[0], r[1], a[0]);
319    sqr(r[2], r[3], a[1]);
320    sqr(r[4], r[5], a[2]);
321    sqr(r[6], r[7], a[3]);
322    a += 4;
323    r += 8;
324    n -= 4;
325  }
326  while (n) {
327    sqr(r[0], r[1], a[0]);
328    a++;
329    r += 2;
330    n--;
331  }
332}
333
334#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
335
336BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
337                          BN_ULONG w) {
338  BN_ULONG c = 0;
339  BN_ULONG bl, bh;
340
341  assert(num >= 0);
342  if (num <= 0) {
343    return (BN_ULONG)0;
344  }
345
346  bl = LBITS(w);
347  bh = HBITS(w);
348
349  while (num & ~3) {
350    mul_add(rp[0], ap[0], bl, bh, c);
351    mul_add(rp[1], ap[1], bl, bh, c);
352    mul_add(rp[2], ap[2], bl, bh, c);
353    mul_add(rp[3], ap[3], bl, bh, c);
354    ap += 4;
355    rp += 4;
356    num -= 4;
357  }
358  while (num) {
359    mul_add(rp[0], ap[0], bl, bh, c);
360    ap++;
361    rp++;
362    num--;
363  }
364  return c;
365}
366
367BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
368  BN_ULONG carry = 0;
369  BN_ULONG bl, bh;
370
371  assert(num >= 0);
372  if (num <= 0) {
373    return (BN_ULONG)0;
374  }
375
376  bl = LBITS(w);
377  bh = HBITS(w);
378
379  while (num & ~3) {
380    mul(rp[0], ap[0], bl, bh, carry);
381    mul(rp[1], ap[1], bl, bh, carry);
382    mul(rp[2], ap[2], bl, bh, carry);
383    mul(rp[3], ap[3], bl, bh, carry);
384    ap += 4;
385    rp += 4;
386    num -= 4;
387  }
388  while (num) {
389    mul(rp[0], ap[0], bl, bh, carry);
390    ap++;
391    rp++;
392    num--;
393  }
394  return carry;
395}
396
397void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
398  assert(n >= 0);
399  if (n <= 0) {
400    return;
401  }
402
403  while (n & ~3) {
404    sqr64(r[0], r[1], a[0]);
405    sqr64(r[2], r[3], a[1]);
406    sqr64(r[4], r[5], a[2]);
407    sqr64(r[6], r[7], a[3]);
408    a += 4;
409    r += 8;
410    n -= 4;
411  }
412  while (n) {
413    sqr64(r[0], r[1], a[0]);
414    a++;
415    r += 2;
416    n--;
417  }
418}
419
420#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
421
422#if defined(BN_LLONG) && defined(BN_DIV2W)
423
424BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
425  return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
426}
427
428#else
429
430/* Divide h,l by d and return the result. */
431BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
432  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
433  int i, count = 2;
434
435  if (d == 0) {
436    return BN_MASK2;
437  }
438
439  i = BN_num_bits_word(d);
440  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
441
442  i = BN_BITS2 - i;
443  if (h >= d) {
444    h -= d;
445  }
446
447  if (i) {
448    d <<= i;
449    h = (h << i) | (l >> (BN_BITS2 - i));
450    l <<= i;
451  }
452  dh = (d & BN_MASK2h) >> BN_BITS4;
453  dl = (d & BN_MASK2l);
454  for (;;) {
455    if ((h >> BN_BITS4) == dh) {
456      q = BN_MASK2l;
457    } else {
458      q = h / dh;
459    }
460
461    th = q * dh;
462    tl = dl * q;
463    for (;;) {
464      t = h - th;
465      if ((t & BN_MASK2h) ||
466          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
467        break;
468      }
469      q--;
470      th -= dh;
471      tl -= dl;
472    }
473    t = (tl >> BN_BITS4);
474    tl = (tl << BN_BITS4) & BN_MASK2h;
475    th += t;
476
477    if (l < tl) {
478      th++;
479    }
480    l -= tl;
481    if (h < th) {
482      h += d;
483      q--;
484    }
485    h -= th;
486
487    if (--count == 0) {
488      break;
489    }
490
491    ret = q << BN_BITS4;
492    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
493    l = (l & BN_MASK2l) << BN_BITS4;
494  }
495
496  ret |= q;
497  return ret;
498}
499
500#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
501
502#ifdef BN_LLONG
503BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
504                      int n) {
505  BN_ULLONG ll = 0;
506
507  assert(n >= 0);
508  if (n <= 0) {
509    return (BN_ULONG)0;
510  }
511
512  while (n & ~3) {
513    ll += (BN_ULLONG)a[0] + b[0];
514    r[0] = (BN_ULONG)ll & BN_MASK2;
515    ll >>= BN_BITS2;
516    ll += (BN_ULLONG)a[1] + b[1];
517    r[1] = (BN_ULONG)ll & BN_MASK2;
518    ll >>= BN_BITS2;
519    ll += (BN_ULLONG)a[2] + b[2];
520    r[2] = (BN_ULONG)ll & BN_MASK2;
521    ll >>= BN_BITS2;
522    ll += (BN_ULLONG)a[3] + b[3];
523    r[3] = (BN_ULONG)ll & BN_MASK2;
524    ll >>= BN_BITS2;
525    a += 4;
526    b += 4;
527    r += 4;
528    n -= 4;
529  }
530  while (n) {
531    ll += (BN_ULLONG)a[0] + b[0];
532    r[0] = (BN_ULONG)ll & BN_MASK2;
533    ll >>= BN_BITS2;
534    a++;
535    b++;
536    r++;
537    n--;
538  }
539  return (BN_ULONG)ll;
540}
541
542#else /* !BN_LLONG */
543
544BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
545                      int n) {
546  BN_ULONG c, l, t;
547
548  assert(n >= 0);
549  if (n <= 0) {
550    return (BN_ULONG)0;
551  }
552
553  c = 0;
554  while (n & ~3) {
555    t = a[0];
556    t = (t + c) & BN_MASK2;
557    c = (t < c);
558    l = (t + b[0]) & BN_MASK2;
559    c += (l < t);
560    r[0] = l;
561    t = a[1];
562    t = (t + c) & BN_MASK2;
563    c = (t < c);
564    l = (t + b[1]) & BN_MASK2;
565    c += (l < t);
566    r[1] = l;
567    t = a[2];
568    t = (t + c) & BN_MASK2;
569    c = (t < c);
570    l = (t + b[2]) & BN_MASK2;
571    c += (l < t);
572    r[2] = l;
573    t = a[3];
574    t = (t + c) & BN_MASK2;
575    c = (t < c);
576    l = (t + b[3]) & BN_MASK2;
577    c += (l < t);
578    r[3] = l;
579    a += 4;
580    b += 4;
581    r += 4;
582    n -= 4;
583  }
584  while (n) {
585    t = a[0];
586    t = (t + c) & BN_MASK2;
587    c = (t < c);
588    l = (t + b[0]) & BN_MASK2;
589    c += (l < t);
590    r[0] = l;
591    a++;
592    b++;
593    r++;
594    n--;
595  }
596  return (BN_ULONG)c;
597}
598
599#endif /* !BN_LLONG */
600
601BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
602                      int n) {
603  BN_ULONG t1, t2;
604  int c = 0;
605
606  assert(n >= 0);
607  if (n <= 0) {
608    return (BN_ULONG)0;
609  }
610
611  while (n & ~3) {
612    t1 = a[0];
613    t2 = b[0];
614    r[0] = (t1 - t2 - c) & BN_MASK2;
615    if (t1 != t2)
616      c = (t1 < t2);
617    t1 = a[1];
618    t2 = b[1];
619    r[1] = (t1 - t2 - c) & BN_MASK2;
620    if (t1 != t2)
621      c = (t1 < t2);
622    t1 = a[2];
623    t2 = b[2];
624    r[2] = (t1 - t2 - c) & BN_MASK2;
625    if (t1 != t2)
626      c = (t1 < t2);
627    t1 = a[3];
628    t2 = b[3];
629    r[3] = (t1 - t2 - c) & BN_MASK2;
630    if (t1 != t2)
631      c = (t1 < t2);
632    a += 4;
633    b += 4;
634    r += 4;
635    n -= 4;
636  }
637  while (n) {
638    t1 = a[0];
639    t2 = b[0];
640    r[0] = (t1 - t2 - c) & BN_MASK2;
641    if (t1 != t2)
642      c = (t1 < t2);
643    a++;
644    b++;
645    r++;
646    n--;
647  }
648  return c;
649}
650
651/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
652/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
653/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
654/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
655
656#ifdef BN_LLONG
657#define mul_add_c(a, b, c0, c1, c2) \
658  t = (BN_ULLONG)a * b;             \
659  t1 = (BN_ULONG)Lw(t);             \
660  t2 = (BN_ULONG)Hw(t);             \
661  c0 = (c0 + t1) & BN_MASK2;        \
662  if ((c0) < t1)                    \
663    t2++;                           \
664  c1 = (c1 + t2) & BN_MASK2;        \
665  if ((c1) < t2)                    \
666    c2++;
667
668#define mul_add_c2(a, b, c0, c1, c2)           \
669  t = (BN_ULLONG)a * b;                        \
670  tt = (t + t) & BN_MASK;                      \
671  if (tt < t)                                  \
672    c2++;                                      \
673  t1 = (BN_ULONG)Lw(tt);                       \
674  t2 = (BN_ULONG)Hw(tt);                       \
675  c0 = (c0 + t1) & BN_MASK2;                   \
676  if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
677    c2++;                                      \
678  c1 = (c1 + t2) & BN_MASK2;                   \
679  if ((c1) < t2)                               \
680    c2++;
681
682#define sqr_add_c(a, i, c0, c1, c2) \
683  t = (BN_ULLONG)a[i] * a[i];       \
684  t1 = (BN_ULONG)Lw(t);             \
685  t2 = (BN_ULONG)Hw(t);             \
686  c0 = (c0 + t1) & BN_MASK2;        \
687  if ((c0) < t1)                    \
688    t2++;                           \
689  c1 = (c1 + t2) & BN_MASK2;        \
690  if ((c1) < t2)                    \
691    c2++;
692
693#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
694
695#elif defined(BN_UMULT_LOHI)
696
697#define mul_add_c(a, b, c0, c1, c2) \
698  {                                 \
699    BN_ULONG ta = (a), tb = (b);    \
700    BN_UMULT_LOHI(t1, t2, ta, tb);  \
701    c0 += t1;                       \
702    t2 += (c0 < t1) ? 1 : 0;        \
703    c1 += t2;                       \
704    c2 += (c1 < t2) ? 1 : 0;        \
705  }
706
707#define mul_add_c2(a, b, c0, c1, c2) \
708  {                                  \
709    BN_ULONG ta = (a), tb = (b), t0; \
710    BN_UMULT_LOHI(t0, t1, ta, tb);   \
711    t2 = t1 + t1;                    \
712    c2 += (t2 < t1) ? 1 : 0;         \
713    t1 = t0 + t0;                    \
714    t2 += (t1 < t0) ? 1 : 0;         \
715    c0 += t1;                        \
716    t2 += (c0 < t1) ? 1 : 0;         \
717    c1 += t2;                        \
718    c2 += (c1 < t2) ? 1 : 0;         \
719  }
720
721#define sqr_add_c(a, i, c0, c1, c2) \
722  {                                 \
723    BN_ULONG ta = (a)[i];           \
724    BN_UMULT_LOHI(t1, t2, ta, ta);  \
725    c0 += t1;                       \
726    t2 += (c0 < t1) ? 1 : 0;        \
727    c1 += t2;                       \
728    c2 += (c1 < t2) ? 1 : 0;        \
729  }
730
731#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
732
733#elif defined(BN_UMULT_HIGH)
734
735#define mul_add_c(a, b, c0, c1, c2) \
736  {                                 \
737    BN_ULONG ta = (a), tb = (b);    \
738    t1 = ta * tb;                   \
739    t2 = BN_UMULT_HIGH(ta, tb);     \
740    c0 += t1;                       \
741    t2 += (c0 < t1) ? 1 : 0;        \
742    c1 += t2;                       \
743    c2 += (c1 < t2) ? 1 : 0;        \
744  }
745
746#define mul_add_c2(a, b, c0, c1, c2) \
747  {                                  \
748    BN_ULONG ta = (a), tb = (b), t0; \
749    t1 = BN_UMULT_HIGH(ta, tb);      \
750    t0 = ta * tb;                    \
751    t2 = t1 + t1;                    \
752    c2 += (t2 < t1) ? 1 : 0;         \
753    t1 = t0 + t0;                    \
754    t2 += (t1 < t0) ? 1 : 0;         \
755    c0 += t1;                        \
756    t2 += (c0 < t1) ? 1 : 0;         \
757    c1 += t2;                        \
758    c2 += (c1 < t2) ? 1 : 0;         \
759  }
760
761#define sqr_add_c(a, i, c0, c1, c2) \
762  {                                 \
763    BN_ULONG ta = (a)[i];           \
764    t1 = ta * ta;                   \
765    t2 = BN_UMULT_HIGH(ta, ta);     \
766    c0 += t1;                       \
767    t2 += (c0 < t1) ? 1 : 0;        \
768    c1 += t2;                       \
769    c2 += (c1 < t2) ? 1 : 0;        \
770  }
771
772#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
773
774#else /* !BN_LLONG */
775#define mul_add_c(a, b, c0, c1, c2) \
776  t1 = LBITS(a);                    \
777  t2 = HBITS(a);                    \
778  bl = LBITS(b);                    \
779  bh = HBITS(b);                    \
780  mul64(t1, t2, bl, bh);            \
781  c0 = (c0 + t1) & BN_MASK2;        \
782  if ((c0) < t1)                    \
783    t2++;                           \
784  c1 = (c1 + t2) & BN_MASK2;        \
785  if ((c1) < t2)                    \
786    c2++;
787
788#define mul_add_c2(a, b, c0, c1, c2)           \
789  t1 = LBITS(a);                               \
790  t2 = HBITS(a);                               \
791  bl = LBITS(b);                               \
792  bh = HBITS(b);                               \
793  mul64(t1, t2, bl, bh);                       \
794  if (t2 & BN_TBIT)                            \
795    c2++;                                      \
796  t2 = (t2 + t2) & BN_MASK2;                   \
797  if (t1 & BN_TBIT)                            \
798    t2++;                                      \
799  t1 = (t1 + t1) & BN_MASK2;                   \
800  c0 = (c0 + t1) & BN_MASK2;                   \
801  if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
802    c2++;                                      \
803  c1 = (c1 + t2) & BN_MASK2;                   \
804  if ((c1) < t2)                               \
805    c2++;
806
807#define sqr_add_c(a, i, c0, c1, c2) \
808  sqr64(t1, t2, (a)[i]);            \
809  c0 = (c0 + t1) & BN_MASK2;        \
810  if ((c0) < t1)                    \
811    t2++;                           \
812  c1 = (c1 + t2) & BN_MASK2;        \
813  if ((c1) < t2)                    \
814    c2++;
815
816#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
817#endif /* !BN_LLONG */
818
819void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
820#ifdef BN_LLONG
821  BN_ULLONG t;
822#else
823  BN_ULONG bl, bh;
824#endif
825  BN_ULONG t1, t2;
826  BN_ULONG c1, c2, c3;
827
828  c1 = 0;
829  c2 = 0;
830  c3 = 0;
831  mul_add_c(a[0], b[0], c1, c2, c3);
832  r[0] = c1;
833  c1 = 0;
834  mul_add_c(a[0], b[1], c2, c3, c1);
835  mul_add_c(a[1], b[0], c2, c3, c1);
836  r[1] = c2;
837  c2 = 0;
838  mul_add_c(a[2], b[0], c3, c1, c2);
839  mul_add_c(a[1], b[1], c3, c1, c2);
840  mul_add_c(a[0], b[2], c3, c1, c2);
841  r[2] = c3;
842  c3 = 0;
843  mul_add_c(a[0], b[3], c1, c2, c3);
844  mul_add_c(a[1], b[2], c1, c2, c3);
845  mul_add_c(a[2], b[1], c1, c2, c3);
846  mul_add_c(a[3], b[0], c1, c2, c3);
847  r[3] = c1;
848  c1 = 0;
849  mul_add_c(a[4], b[0], c2, c3, c1);
850  mul_add_c(a[3], b[1], c2, c3, c1);
851  mul_add_c(a[2], b[2], c2, c3, c1);
852  mul_add_c(a[1], b[3], c2, c3, c1);
853  mul_add_c(a[0], b[4], c2, c3, c1);
854  r[4] = c2;
855  c2 = 0;
856  mul_add_c(a[0], b[5], c3, c1, c2);
857  mul_add_c(a[1], b[4], c3, c1, c2);
858  mul_add_c(a[2], b[3], c3, c1, c2);
859  mul_add_c(a[3], b[2], c3, c1, c2);
860  mul_add_c(a[4], b[1], c3, c1, c2);
861  mul_add_c(a[5], b[0], c3, c1, c2);
862  r[5] = c3;
863  c3 = 0;
864  mul_add_c(a[6], b[0], c1, c2, c3);
865  mul_add_c(a[5], b[1], c1, c2, c3);
866  mul_add_c(a[4], b[2], c1, c2, c3);
867  mul_add_c(a[3], b[3], c1, c2, c3);
868  mul_add_c(a[2], b[4], c1, c2, c3);
869  mul_add_c(a[1], b[5], c1, c2, c3);
870  mul_add_c(a[0], b[6], c1, c2, c3);
871  r[6] = c1;
872  c1 = 0;
873  mul_add_c(a[0], b[7], c2, c3, c1);
874  mul_add_c(a[1], b[6], c2, c3, c1);
875  mul_add_c(a[2], b[5], c2, c3, c1);
876  mul_add_c(a[3], b[4], c2, c3, c1);
877  mul_add_c(a[4], b[3], c2, c3, c1);
878  mul_add_c(a[5], b[2], c2, c3, c1);
879  mul_add_c(a[6], b[1], c2, c3, c1);
880  mul_add_c(a[7], b[0], c2, c3, c1);
881  r[7] = c2;
882  c2 = 0;
883  mul_add_c(a[7], b[1], c3, c1, c2);
884  mul_add_c(a[6], b[2], c3, c1, c2);
885  mul_add_c(a[5], b[3], c3, c1, c2);
886  mul_add_c(a[4], b[4], c3, c1, c2);
887  mul_add_c(a[3], b[5], c3, c1, c2);
888  mul_add_c(a[2], b[6], c3, c1, c2);
889  mul_add_c(a[1], b[7], c3, c1, c2);
890  r[8] = c3;
891  c3 = 0;
892  mul_add_c(a[2], b[7], c1, c2, c3);
893  mul_add_c(a[3], b[6], c1, c2, c3);
894  mul_add_c(a[4], b[5], c1, c2, c3);
895  mul_add_c(a[5], b[4], c1, c2, c3);
896  mul_add_c(a[6], b[3], c1, c2, c3);
897  mul_add_c(a[7], b[2], c1, c2, c3);
898  r[9] = c1;
899  c1 = 0;
900  mul_add_c(a[7], b[3], c2, c3, c1);
901  mul_add_c(a[6], b[4], c2, c3, c1);
902  mul_add_c(a[5], b[5], c2, c3, c1);
903  mul_add_c(a[4], b[6], c2, c3, c1);
904  mul_add_c(a[3], b[7], c2, c3, c1);
905  r[10] = c2;
906  c2 = 0;
907  mul_add_c(a[4], b[7], c3, c1, c2);
908  mul_add_c(a[5], b[6], c3, c1, c2);
909  mul_add_c(a[6], b[5], c3, c1, c2);
910  mul_add_c(a[7], b[4], c3, c1, c2);
911  r[11] = c3;
912  c3 = 0;
913  mul_add_c(a[7], b[5], c1, c2, c3);
914  mul_add_c(a[6], b[6], c1, c2, c3);
915  mul_add_c(a[5], b[7], c1, c2, c3);
916  r[12] = c1;
917  c1 = 0;
918  mul_add_c(a[6], b[7], c2, c3, c1);
919  mul_add_c(a[7], b[6], c2, c3, c1);
920  r[13] = c2;
921  c2 = 0;
922  mul_add_c(a[7], b[7], c3, c1, c2);
923  r[14] = c3;
924  r[15] = c1;
925}
926
927void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
928#ifdef BN_LLONG
929  BN_ULLONG t;
930#else
931  BN_ULONG bl, bh;
932#endif
933  BN_ULONG t1, t2;
934  BN_ULONG c1, c2, c3;
935
936  c1 = 0;
937  c2 = 0;
938  c3 = 0;
939  mul_add_c(a[0], b[0], c1, c2, c3);
940  r[0] = c1;
941  c1 = 0;
942  mul_add_c(a[0], b[1], c2, c3, c1);
943  mul_add_c(a[1], b[0], c2, c3, c1);
944  r[1] = c2;
945  c2 = 0;
946  mul_add_c(a[2], b[0], c3, c1, c2);
947  mul_add_c(a[1], b[1], c3, c1, c2);
948  mul_add_c(a[0], b[2], c3, c1, c2);
949  r[2] = c3;
950  c3 = 0;
951  mul_add_c(a[0], b[3], c1, c2, c3);
952  mul_add_c(a[1], b[2], c1, c2, c3);
953  mul_add_c(a[2], b[1], c1, c2, c3);
954  mul_add_c(a[3], b[0], c1, c2, c3);
955  r[3] = c1;
956  c1 = 0;
957  mul_add_c(a[3], b[1], c2, c3, c1);
958  mul_add_c(a[2], b[2], c2, c3, c1);
959  mul_add_c(a[1], b[3], c2, c3, c1);
960  r[4] = c2;
961  c2 = 0;
962  mul_add_c(a[2], b[3], c3, c1, c2);
963  mul_add_c(a[3], b[2], c3, c1, c2);
964  r[5] = c3;
965  c3 = 0;
966  mul_add_c(a[3], b[3], c1, c2, c3);
967  r[6] = c1;
968  r[7] = c2;
969}
970
971void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
972#ifdef BN_LLONG
973  BN_ULLONG t, tt;
974#else
975  BN_ULONG bl, bh;
976#endif
977  BN_ULONG t1, t2;
978  BN_ULONG c1, c2, c3;
979
980  c1 = 0;
981  c2 = 0;
982  c3 = 0;
983  sqr_add_c(a, 0, c1, c2, c3);
984  r[0] = c1;
985  c1 = 0;
986  sqr_add_c2(a, 1, 0, c2, c3, c1);
987  r[1] = c2;
988  c2 = 0;
989  sqr_add_c(a, 1, c3, c1, c2);
990  sqr_add_c2(a, 2, 0, c3, c1, c2);
991  r[2] = c3;
992  c3 = 0;
993  sqr_add_c2(a, 3, 0, c1, c2, c3);
994  sqr_add_c2(a, 2, 1, c1, c2, c3);
995  r[3] = c1;
996  c1 = 0;
997  sqr_add_c(a, 2, c2, c3, c1);
998  sqr_add_c2(a, 3, 1, c2, c3, c1);
999  sqr_add_c2(a, 4, 0, c2, c3, c1);
1000  r[4] = c2;
1001  c2 = 0;
1002  sqr_add_c2(a, 5, 0, c3, c1, c2);
1003  sqr_add_c2(a, 4, 1, c3, c1, c2);
1004  sqr_add_c2(a, 3, 2, c3, c1, c2);
1005  r[5] = c3;
1006  c3 = 0;
1007  sqr_add_c(a, 3, c1, c2, c3);
1008  sqr_add_c2(a, 4, 2, c1, c2, c3);
1009  sqr_add_c2(a, 5, 1, c1, c2, c3);
1010  sqr_add_c2(a, 6, 0, c1, c2, c3);
1011  r[6] = c1;
1012  c1 = 0;
1013  sqr_add_c2(a, 7, 0, c2, c3, c1);
1014  sqr_add_c2(a, 6, 1, c2, c3, c1);
1015  sqr_add_c2(a, 5, 2, c2, c3, c1);
1016  sqr_add_c2(a, 4, 3, c2, c3, c1);
1017  r[7] = c2;
1018  c2 = 0;
1019  sqr_add_c(a, 4, c3, c1, c2);
1020  sqr_add_c2(a, 5, 3, c3, c1, c2);
1021  sqr_add_c2(a, 6, 2, c3, c1, c2);
1022  sqr_add_c2(a, 7, 1, c3, c1, c2);
1023  r[8] = c3;
1024  c3 = 0;
1025  sqr_add_c2(a, 7, 2, c1, c2, c3);
1026  sqr_add_c2(a, 6, 3, c1, c2, c3);
1027  sqr_add_c2(a, 5, 4, c1, c2, c3);
1028  r[9] = c1;
1029  c1 = 0;
1030  sqr_add_c(a, 5, c2, c3, c1);
1031  sqr_add_c2(a, 6, 4, c2, c3, c1);
1032  sqr_add_c2(a, 7, 3, c2, c3, c1);
1033  r[10] = c2;
1034  c2 = 0;
1035  sqr_add_c2(a, 7, 4, c3, c1, c2);
1036  sqr_add_c2(a, 6, 5, c3, c1, c2);
1037  r[11] = c3;
1038  c3 = 0;
1039  sqr_add_c(a, 6, c1, c2, c3);
1040  sqr_add_c2(a, 7, 5, c1, c2, c3);
1041  r[12] = c1;
1042  c1 = 0;
1043  sqr_add_c2(a, 7, 6, c2, c3, c1);
1044  r[13] = c2;
1045  c2 = 0;
1046  sqr_add_c(a, 7, c3, c1, c2);
1047  r[14] = c3;
1048  r[15] = c1;
1049}
1050
1051void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
1052#ifdef BN_LLONG
1053  BN_ULLONG t, tt;
1054#else
1055  BN_ULONG bl, bh;
1056#endif
1057  BN_ULONG t1, t2;
1058  BN_ULONG c1, c2, c3;
1059
1060  c1 = 0;
1061  c2 = 0;
1062  c3 = 0;
1063  sqr_add_c(a, 0, c1, c2, c3);
1064  r[0] = c1;
1065  c1 = 0;
1066  sqr_add_c2(a, 1, 0, c2, c3, c1);
1067  r[1] = c2;
1068  c2 = 0;
1069  sqr_add_c(a, 1, c3, c1, c2);
1070  sqr_add_c2(a, 2, 0, c3, c1, c2);
1071  r[2] = c3;
1072  c3 = 0;
1073  sqr_add_c2(a, 3, 0, c1, c2, c3);
1074  sqr_add_c2(a, 2, 1, c1, c2, c3);
1075  r[3] = c1;
1076  c1 = 0;
1077  sqr_add_c(a, 2, c2, c3, c1);
1078  sqr_add_c2(a, 3, 1, c2, c3, c1);
1079  r[4] = c2;
1080  c2 = 0;
1081  sqr_add_c2(a, 3, 2, c3, c1, c2);
1082  r[5] = c3;
1083  c3 = 0;
1084  sqr_add_c(a, 3, c1, c2, c3);
1085  r[6] = c1;
1086  r[7] = c2;
1087}
1088
1089#if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
1090/* This is essentially reference implementation, which may or may not
1091 * result in performance improvement. E.g. on IA-32 this routine was
1092 * observed to give 40% faster rsa1024 private key operations and 10%
1093 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
1094 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
1095 * reference implementation, one to be used as starting point for
1096 * platform-specific assembler. Mentioned numbers apply to compiler
1097 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
1098 * can vary not only from platform to platform, but even for compiler
1099 * versions. Assembler vs. assembler improvement coefficients can
1100 * [and are known to] differ and are to be documented elsewhere. */
1101int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1102                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
1103  BN_ULONG c0, c1, ml, *tp, n0;
1104#ifdef mul64
1105  BN_ULONG mh;
1106#endif
1107  volatile BN_ULONG *vp;
1108  int i = 0, j;
1109
1110#if 0 /* template for platform-specific implementation */
1111	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
1112#endif
1113  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1114
1115  n0 = *n0p;
1116
1117  c0 = 0;
1118  ml = bp[0];
1119#ifdef mul64
1120  mh = HBITS(ml);
1121  ml = LBITS(ml);
1122  for (j = 0; j < num; ++j)
1123    mul(tp[j], ap[j], ml, mh, c0);
1124#else
1125  for (j = 0; j < num; ++j)
1126    mul(tp[j], ap[j], ml, c0);
1127#endif
1128
1129  tp[num] = c0;
1130  tp[num + 1] = 0;
1131  goto enter;
1132
1133  for (i = 0; i < num; i++) {
1134    c0 = 0;
1135    ml = bp[i];
1136#ifdef mul64
1137    mh = HBITS(ml);
1138    ml = LBITS(ml);
1139    for (j = 0; j < num; ++j)
1140      mul_add(tp[j], ap[j], ml, mh, c0);
1141#else
1142    for (j = 0; j < num; ++j)
1143      mul_add(tp[j], ap[j], ml, c0);
1144#endif
1145    c1 = (tp[num] + c0) & BN_MASK2;
1146    tp[num] = c1;
1147    tp[num + 1] = (c1 < c0 ? 1 : 0);
1148  enter:
1149    c1 = tp[0];
1150    ml = (c1 * n0) & BN_MASK2;
1151    c0 = 0;
1152#ifdef mul64
1153    mh = HBITS(ml);
1154    ml = LBITS(ml);
1155    mul_add(c1, np[0], ml, mh, c0);
1156#else
1157    mul_add(c1, ml, np[0], c0);
1158#endif
1159    for (j = 1; j < num; j++) {
1160      c1 = tp[j];
1161#ifdef mul64
1162      mul_add(c1, np[j], ml, mh, c0);
1163#else
1164      mul_add(c1, ml, np[j], c0);
1165#endif
1166      tp[j - 1] = c1 & BN_MASK2;
1167    }
1168    c1 = (tp[num] + c0) & BN_MASK2;
1169    tp[num - 1] = c1;
1170    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
1171  }
1172
1173  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1174    c0 = bn_sub_words(rp, tp, np, num);
1175    if (tp[num] != 0 || c0 == 0) {
1176      for (i = 0; i < num + 2; i++)
1177        vp[i] = 0;
1178      return 1;
1179    }
1180  }
1181  for (i = 0; i < num; i++)
1182    rp[i] = tp[i], vp[i] = 0;
1183  vp[num] = 0;
1184  vp[num + 1] = 0;
1185  return 1;
1186}
1187#endif
1188
1189#endif
1190