libtommath.c revision 700a137ab366edc72e371da68ba187b4717ee660
1/*
2 * Minimal code for RSA support from LibTomMath 0.41
3 * http://libtom.org/
4 * http://libtom.org/files/ltm-0.41.tar.bz2
5 * This library was released in public domain by Tom St Denis.
6 *
7 * The combination in this file may not use all of the optimized algorithms
8 * from LibTomMath and may be considerable slower than the LibTomMath with its
9 * default settings. The main purpose of having this version here is to make it
10 * easier to build bignum.c wrapper without having to install and build an
11 * external library.
12 *
13 * If CONFIG_INTERNAL_LIBTOMMATH is defined, bignum.c includes this
14 * libtommath.c file instead of using the external LibTomMath library.
15 */
16
17#ifndef CHAR_BIT
18#define CHAR_BIT 8
19#endif
20
21#define BN_MP_INVMOD_C
22#define BN_S_MP_EXPTMOD_C /* Note: #undef in tommath_superclass.h; this would
23			   * require BN_MP_EXPTMOD_FAST_C instead */
24#define BN_S_MP_MUL_DIGS_C
25#define BN_MP_INVMOD_SLOW_C
26#define BN_S_MP_SQR_C
27#define BN_S_MP_MUL_HIGH_DIGS_C /* Note: #undef in tommath_superclass.h; this
28				 * would require other than mp_reduce */
29
30#ifdef LTM_FAST
31
32/* Use faster div at the cost of about 1 kB */
33#define BN_MP_MUL_D_C
34
35/* Include faster exptmod (Montgomery) at the cost of about 2.5 kB in code */
36#define BN_MP_EXPTMOD_FAST_C
37#define BN_MP_MONTGOMERY_SETUP_C
38#define BN_FAST_MP_MONTGOMERY_REDUCE_C
39#define BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
40#define BN_MP_MUL_2_C
41
42/* Include faster sqr at the cost of about 0.5 kB in code */
43#define BN_FAST_S_MP_SQR_C
44
45/* About 0.25 kB of code, but ~1.7kB of stack space! */
46#define BN_FAST_S_MP_MUL_DIGS_C
47
48#else /* LTM_FAST */
49
50#define BN_MP_DIV_SMALL
51#define BN_MP_INIT_MULTI_C
52#define BN_MP_CLEAR_MULTI_C
53#define BN_MP_ABS_C
54#endif /* LTM_FAST */
55
56/* Current uses do not require support for negative exponent in exptmod, so we
57 * can save about 1.5 kB in leaving out invmod. */
58#define LTM_NO_NEG_EXP
59
60/* from tommath.h */
61
62#ifndef MIN
63   #define MIN(x,y) ((x)<(y)?(x):(y))
64#endif
65
66#ifndef MAX
67   #define MAX(x,y) ((x)>(y)?(x):(y))
68#endif
69
70#define  OPT_CAST(x)
71
72#ifdef __x86_64__
73typedef unsigned long mp_digit;
74typedef unsigned long mp_word __attribute__((mode(TI)));
75
76#define DIGIT_BIT 60
77#define MP_64BIT
78#else
79typedef unsigned long mp_digit;
80typedef u64 mp_word;
81
82#define DIGIT_BIT          28
83#define MP_28BIT
84#endif
85
86
87#define XMALLOC  os_malloc
88#define XFREE    os_free
89#define XREALLOC os_realloc
90
91
92#define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
93
94#define MP_LT        -1   /* less than */
95#define MP_EQ         0   /* equal to */
96#define MP_GT         1   /* greater than */
97
98#define MP_ZPOS       0   /* positive integer */
99#define MP_NEG        1   /* negative */
100
101#define MP_OKAY       0   /* ok result */
102#define MP_MEM        -2  /* out of mem */
103#define MP_VAL        -3  /* invalid input */
104
105#define MP_YES        1   /* yes response */
106#define MP_NO         0   /* no response */
107
108typedef int           mp_err;
109
110/* define this to use lower memory usage routines (exptmods mostly) */
111#define MP_LOW_MEM
112
113/* default precision */
114#ifndef MP_PREC
115   #ifndef MP_LOW_MEM
116      #define MP_PREC                 32     /* default digits of precision */
117   #else
118      #define MP_PREC                 8      /* default digits of precision */
119   #endif
120#endif
121
122/* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
123#define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1))
124
125/* the infamous mp_int structure */
126typedef struct  {
127    int used, alloc, sign;
128    mp_digit *dp;
129} mp_int;
130
131
132/* ---> Basic Manipulations <--- */
133#define mp_iszero(a) (((a)->used == 0) ? MP_YES : MP_NO)
134#define mp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? MP_YES : MP_NO)
135#define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? MP_YES : MP_NO)
136
137
138/* prototypes for copied functions */
139#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
140static int s_mp_exptmod(mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
141static int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);
142static int s_mp_sqr(mp_int * a, mp_int * b);
143static int s_mp_mul_high_digs(mp_int * a, mp_int * b, mp_int * c, int digs);
144
145#ifdef BN_FAST_S_MP_MUL_DIGS_C
146static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);
147#endif
148
149#ifdef BN_MP_INIT_MULTI_C
150static int mp_init_multi(mp_int *mp, ...);
151#endif
152#ifdef BN_MP_CLEAR_MULTI_C
153static void mp_clear_multi(mp_int *mp, ...);
154#endif
155static int mp_lshd(mp_int * a, int b);
156static void mp_set(mp_int * a, mp_digit b);
157static void mp_clamp(mp_int * a);
158static void mp_exch(mp_int * a, mp_int * b);
159static void mp_rshd(mp_int * a, int b);
160static void mp_zero(mp_int * a);
161static int mp_mod_2d(mp_int * a, int b, mp_int * c);
162static int mp_div_2d(mp_int * a, int b, mp_int * c, mp_int * d);
163static int mp_init_copy(mp_int * a, mp_int * b);
164static int mp_mul_2d(mp_int * a, int b, mp_int * c);
165#ifndef LTM_NO_NEG_EXP
166static int mp_div_2(mp_int * a, mp_int * b);
167static int mp_invmod(mp_int * a, mp_int * b, mp_int * c);
168static int mp_invmod_slow(mp_int * a, mp_int * b, mp_int * c);
169#endif /* LTM_NO_NEG_EXP */
170static int mp_copy(mp_int * a, mp_int * b);
171static int mp_count_bits(mp_int * a);
172static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d);
173static int mp_mod(mp_int * a, mp_int * b, mp_int * c);
174static int mp_grow(mp_int * a, int size);
175static int mp_cmp_mag(mp_int * a, mp_int * b);
176#ifdef BN_MP_ABS_C
177static int mp_abs(mp_int * a, mp_int * b);
178#endif
179static int mp_sqr(mp_int * a, mp_int * b);
180static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
181static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
182static int mp_2expt(mp_int * a, int b);
183static int mp_reduce_setup(mp_int * a, mp_int * b);
184static int mp_reduce(mp_int * x, mp_int * m, mp_int * mu);
185static int mp_init_size(mp_int * a, int size);
186#ifdef BN_MP_EXPTMOD_FAST_C
187static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
188#endif /* BN_MP_EXPTMOD_FAST_C */
189#ifdef BN_FAST_S_MP_SQR_C
190static int fast_s_mp_sqr (mp_int * a, mp_int * b);
191#endif /* BN_FAST_S_MP_SQR_C */
192#ifdef BN_MP_MUL_D_C
193static int mp_mul_d (mp_int * a, mp_digit b, mp_int * c);
194#endif /* BN_MP_MUL_D_C */
195
196
197
198/* functions from bn_<func name>.c */
199
200
201/* reverse an array, used for radix code */
202static void bn_reverse (unsigned char *s, int len)
203{
204  int     ix, iy;
205  unsigned char t;
206
207  ix = 0;
208  iy = len - 1;
209  while (ix < iy) {
210    t     = s[ix];
211    s[ix] = s[iy];
212    s[iy] = t;
213    ++ix;
214    --iy;
215  }
216}
217
218
219/* low level addition, based on HAC pp.594, Algorithm 14.7 */
220static int s_mp_add (mp_int * a, mp_int * b, mp_int * c)
221{
222  mp_int *x;
223  int     olduse, res, min, max;
224
225  /* find sizes, we let |a| <= |b| which means we have to sort
226   * them.  "x" will point to the input with the most digits
227   */
228  if (a->used > b->used) {
229    min = b->used;
230    max = a->used;
231    x = a;
232  } else {
233    min = a->used;
234    max = b->used;
235    x = b;
236  }
237
238  /* init result */
239  if (c->alloc < max + 1) {
240    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
241      return res;
242    }
243  }
244
245  /* get old used digit count and set new one */
246  olduse = c->used;
247  c->used = max + 1;
248
249  {
250    register mp_digit u, *tmpa, *tmpb, *tmpc;
251    register int i;
252
253    /* alias for digit pointers */
254
255    /* first input */
256    tmpa = a->dp;
257
258    /* second input */
259    tmpb = b->dp;
260
261    /* destination */
262    tmpc = c->dp;
263
264    /* zero the carry */
265    u = 0;
266    for (i = 0; i < min; i++) {
267      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
268      *tmpc = *tmpa++ + *tmpb++ + u;
269
270      /* U = carry bit of T[i] */
271      u = *tmpc >> ((mp_digit)DIGIT_BIT);
272
273      /* take away carry bit from T[i] */
274      *tmpc++ &= MP_MASK;
275    }
276
277    /* now copy higher words if any, that is in A+B
278     * if A or B has more digits add those in
279     */
280    if (min != max) {
281      for (; i < max; i++) {
282        /* T[i] = X[i] + U */
283        *tmpc = x->dp[i] + u;
284
285        /* U = carry bit of T[i] */
286        u = *tmpc >> ((mp_digit)DIGIT_BIT);
287
288        /* take away carry bit from T[i] */
289        *tmpc++ &= MP_MASK;
290      }
291    }
292
293    /* add carry */
294    *tmpc++ = u;
295
296    /* clear digits above oldused */
297    for (i = c->used; i < olduse; i++) {
298      *tmpc++ = 0;
299    }
300  }
301
302  mp_clamp (c);
303  return MP_OKAY;
304}
305
306
307/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
308static int s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
309{
310  int     olduse, res, min, max;
311
312  /* find sizes */
313  min = b->used;
314  max = a->used;
315
316  /* init result */
317  if (c->alloc < max) {
318    if ((res = mp_grow (c, max)) != MP_OKAY) {
319      return res;
320    }
321  }
322  olduse = c->used;
323  c->used = max;
324
325  {
326    register mp_digit u, *tmpa, *tmpb, *tmpc;
327    register int i;
328
329    /* alias for digit pointers */
330    tmpa = a->dp;
331    tmpb = b->dp;
332    tmpc = c->dp;
333
334    /* set carry to zero */
335    u = 0;
336    for (i = 0; i < min; i++) {
337      /* T[i] = A[i] - B[i] - U */
338      *tmpc = *tmpa++ - *tmpb++ - u;
339
340      /* U = carry bit of T[i]
341       * Note this saves performing an AND operation since
342       * if a carry does occur it will propagate all the way to the
343       * MSB.  As a result a single shift is enough to get the carry
344       */
345      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
346
347      /* Clear carry from T[i] */
348      *tmpc++ &= MP_MASK;
349    }
350
351    /* now copy higher words if any, e.g. if A has more digits than B  */
352    for (; i < max; i++) {
353      /* T[i] = A[i] - U */
354      *tmpc = *tmpa++ - u;
355
356      /* U = carry bit of T[i] */
357      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
358
359      /* Clear carry from T[i] */
360      *tmpc++ &= MP_MASK;
361    }
362
363    /* clear digits above used (since we may not have grown result above) */
364    for (i = c->used; i < olduse; i++) {
365      *tmpc++ = 0;
366    }
367  }
368
369  mp_clamp (c);
370  return MP_OKAY;
371}
372
373
374/* init a new mp_int */
375static int mp_init (mp_int * a)
376{
377  int i;
378
379  /* allocate memory required and clear it */
380  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
381  if (a->dp == NULL) {
382    return MP_MEM;
383  }
384
385  /* set the digits to zero */
386  for (i = 0; i < MP_PREC; i++) {
387      a->dp[i] = 0;
388  }
389
390  /* set the used to zero, allocated digits to the default precision
391   * and sign to positive */
392  a->used  = 0;
393  a->alloc = MP_PREC;
394  a->sign  = MP_ZPOS;
395
396  return MP_OKAY;
397}
398
399
400/* clear one (frees)  */
401static void mp_clear (mp_int * a)
402{
403  int i;
404
405  /* only do anything if a hasn't been freed previously */
406  if (a->dp != NULL) {
407    /* first zero the digits */
408    for (i = 0; i < a->used; i++) {
409        a->dp[i] = 0;
410    }
411
412    /* free ram */
413    XFREE(a->dp);
414
415    /* reset members to make debugging easier */
416    a->dp    = NULL;
417    a->alloc = a->used = 0;
418    a->sign  = MP_ZPOS;
419  }
420}
421
422
423/* high level addition (handles signs) */
424static int mp_add (mp_int * a, mp_int * b, mp_int * c)
425{
426  int     sa, sb, res;
427
428  /* get sign of both inputs */
429  sa = a->sign;
430  sb = b->sign;
431
432  /* handle two cases, not four */
433  if (sa == sb) {
434    /* both positive or both negative */
435    /* add their magnitudes, copy the sign */
436    c->sign = sa;
437    res = s_mp_add (a, b, c);
438  } else {
439    /* one positive, the other negative */
440    /* subtract the one with the greater magnitude from */
441    /* the one of the lesser magnitude.  The result gets */
442    /* the sign of the one with the greater magnitude. */
443    if (mp_cmp_mag (a, b) == MP_LT) {
444      c->sign = sb;
445      res = s_mp_sub (b, a, c);
446    } else {
447      c->sign = sa;
448      res = s_mp_sub (a, b, c);
449    }
450  }
451  return res;
452}
453
454
455/* high level subtraction (handles signs) */
456static int mp_sub (mp_int * a, mp_int * b, mp_int * c)
457{
458  int     sa, sb, res;
459
460  sa = a->sign;
461  sb = b->sign;
462
463  if (sa != sb) {
464    /* subtract a negative from a positive, OR */
465    /* subtract a positive from a negative. */
466    /* In either case, ADD their magnitudes, */
467    /* and use the sign of the first number. */
468    c->sign = sa;
469    res = s_mp_add (a, b, c);
470  } else {
471    /* subtract a positive from a positive, OR */
472    /* subtract a negative from a negative. */
473    /* First, take the difference between their */
474    /* magnitudes, then... */
475    if (mp_cmp_mag (a, b) != MP_LT) {
476      /* Copy the sign from the first */
477      c->sign = sa;
478      /* The first has a larger or equal magnitude */
479      res = s_mp_sub (a, b, c);
480    } else {
481      /* The result has the *opposite* sign from */
482      /* the first number. */
483      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
484      /* The second has a larger magnitude */
485      res = s_mp_sub (b, a, c);
486    }
487  }
488  return res;
489}
490
491
492/* high level multiplication (handles sign) */
493static int mp_mul (mp_int * a, mp_int * b, mp_int * c)
494{
495  int     res, neg;
496  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
497
498  /* use Toom-Cook? */
499#ifdef BN_MP_TOOM_MUL_C
500  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
501    res = mp_toom_mul(a, b, c);
502  } else
503#endif
504#ifdef BN_MP_KARATSUBA_MUL_C
505  /* use Karatsuba? */
506  if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
507    res = mp_karatsuba_mul (a, b, c);
508  } else
509#endif
510  {
511    /* can we use the fast multiplier?
512     *
513     * The fast multiplier can be used if the output will
514     * have less than MP_WARRAY digits and the number of
515     * digits won't affect carry propagation
516     */
517#ifdef BN_FAST_S_MP_MUL_DIGS_C
518    int     digs = a->used + b->used + 1;
519
520    if ((digs < MP_WARRAY) &&
521        MIN(a->used, b->used) <=
522        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
523      res = fast_s_mp_mul_digs (a, b, c, digs);
524    } else
525#endif
526#ifdef BN_S_MP_MUL_DIGS_C
527      res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
528#else
529#error mp_mul could fail
530      res = MP_VAL;
531#endif
532
533  }
534  c->sign = (c->used > 0) ? neg : MP_ZPOS;
535  return res;
536}
537
538
539/* d = a * b (mod c) */
540static int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
541{
542  int     res;
543  mp_int  t;
544
545  if ((res = mp_init (&t)) != MP_OKAY) {
546    return res;
547  }
548
549  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
550    mp_clear (&t);
551    return res;
552  }
553  res = mp_mod (&t, c, d);
554  mp_clear (&t);
555  return res;
556}
557
558
559/* c = a mod b, 0 <= c < b */
560static int mp_mod (mp_int * a, mp_int * b, mp_int * c)
561{
562  mp_int  t;
563  int     res;
564
565  if ((res = mp_init (&t)) != MP_OKAY) {
566    return res;
567  }
568
569  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
570    mp_clear (&t);
571    return res;
572  }
573
574  if (t.sign != b->sign) {
575    res = mp_add (b, &t, c);
576  } else {
577    res = MP_OKAY;
578    mp_exch (&t, c);
579  }
580
581  mp_clear (&t);
582  return res;
583}
584
585
586/* this is a shell function that calls either the normal or Montgomery
587 * exptmod functions.  Originally the call to the montgomery code was
588 * embedded in the normal function but that wasted a lot of stack space
589 * for nothing (since 99% of the time the Montgomery code would be called)
590 */
591static int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
592{
593  int dr;
594
595  /* modulus P must be positive */
596  if (P->sign == MP_NEG) {
597     return MP_VAL;
598  }
599
600  /* if exponent X is negative we have to recurse */
601  if (X->sign == MP_NEG) {
602#ifdef LTM_NO_NEG_EXP
603        return MP_VAL;
604#else /* LTM_NO_NEG_EXP */
605#ifdef BN_MP_INVMOD_C
606     mp_int tmpG, tmpX;
607     int err;
608
609     /* first compute 1/G mod P */
610     if ((err = mp_init(&tmpG)) != MP_OKAY) {
611        return err;
612     }
613     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
614        mp_clear(&tmpG);
615        return err;
616     }
617
618     /* now get |X| */
619     if ((err = mp_init(&tmpX)) != MP_OKAY) {
620        mp_clear(&tmpG);
621        return err;
622     }
623     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
624        mp_clear_multi(&tmpG, &tmpX, NULL);
625        return err;
626     }
627
628     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
629     err = mp_exptmod(&tmpG, &tmpX, P, Y);
630     mp_clear_multi(&tmpG, &tmpX, NULL);
631     return err;
632#else
633#error mp_exptmod would always fail
634     /* no invmod */
635     return MP_VAL;
636#endif
637#endif /* LTM_NO_NEG_EXP */
638  }
639
640/* modified diminished radix reduction */
641#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
642  if (mp_reduce_is_2k_l(P) == MP_YES) {
643     return s_mp_exptmod(G, X, P, Y, 1);
644  }
645#endif
646
647#ifdef BN_MP_DR_IS_MODULUS_C
648  /* is it a DR modulus? */
649  dr = mp_dr_is_modulus(P);
650#else
651  /* default to no */
652  dr = 0;
653#endif
654
655#ifdef BN_MP_REDUCE_IS_2K_C
656  /* if not, is it a unrestricted DR modulus? */
657  if (dr == 0) {
658     dr = mp_reduce_is_2k(P) << 1;
659  }
660#endif
661
662  /* if the modulus is odd or dr != 0 use the montgomery method */
663#ifdef BN_MP_EXPTMOD_FAST_C
664  if (mp_isodd (P) == 1 || dr !=  0) {
665    return mp_exptmod_fast (G, X, P, Y, dr);
666  } else {
667#endif
668#ifdef BN_S_MP_EXPTMOD_C
669    /* otherwise use the generic Barrett reduction technique */
670    return s_mp_exptmod (G, X, P, Y, 0);
671#else
672#error mp_exptmod could fail
673    /* no exptmod for evens */
674    return MP_VAL;
675#endif
676#ifdef BN_MP_EXPTMOD_FAST_C
677  }
678#endif
679  if (dr == 0) {
680    /* avoid compiler warnings about possibly unused variable */
681  }
682}
683
684
685/* compare two ints (signed)*/
686static int mp_cmp (mp_int * a, mp_int * b)
687{
688  /* compare based on sign */
689  if (a->sign != b->sign) {
690     if (a->sign == MP_NEG) {
691        return MP_LT;
692     } else {
693        return MP_GT;
694     }
695  }
696
697  /* compare digits */
698  if (a->sign == MP_NEG) {
699     /* if negative compare opposite direction */
700     return mp_cmp_mag(b, a);
701  } else {
702     return mp_cmp_mag(a, b);
703  }
704}
705
706
707/* compare a digit */
708static int mp_cmp_d(mp_int * a, mp_digit b)
709{
710  /* compare based on sign */
711  if (a->sign == MP_NEG) {
712    return MP_LT;
713  }
714
715  /* compare based on magnitude */
716  if (a->used > 1) {
717    return MP_GT;
718  }
719
720  /* compare the only digit of a to b */
721  if (a->dp[0] > b) {
722    return MP_GT;
723  } else if (a->dp[0] < b) {
724    return MP_LT;
725  } else {
726    return MP_EQ;
727  }
728}
729
730
731#ifndef LTM_NO_NEG_EXP
732/* hac 14.61, pp608 */
733static int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
734{
735  /* b cannot be negative */
736  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
737    return MP_VAL;
738  }
739
740#ifdef BN_FAST_MP_INVMOD_C
741  /* if the modulus is odd we can use a faster routine instead */
742  if (mp_isodd (b) == 1) {
743    return fast_mp_invmod (a, b, c);
744  }
745#endif
746
747#ifdef BN_MP_INVMOD_SLOW_C
748  return mp_invmod_slow(a, b, c);
749#endif
750
751#ifndef BN_FAST_MP_INVMOD_C
752#ifndef BN_MP_INVMOD_SLOW_C
753#error mp_invmod would always fail
754#endif
755#endif
756  return MP_VAL;
757}
758#endif /* LTM_NO_NEG_EXP */
759
760
761/* get the size for an unsigned equivalent */
762static int mp_unsigned_bin_size (mp_int * a)
763{
764  int     size = mp_count_bits (a);
765  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
766}
767
768
769#ifndef LTM_NO_NEG_EXP
770/* hac 14.61, pp608 */
771static int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
772{
773  mp_int  x, y, u, v, A, B, C, D;
774  int     res;
775
776  /* b cannot be negative */
777  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
778    return MP_VAL;
779  }
780
781  /* init temps */
782  if ((res = mp_init_multi(&x, &y, &u, &v,
783                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
784     return res;
785  }
786
787  /* x = a, y = b */
788  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
789      goto LBL_ERR;
790  }
791  if ((res = mp_copy (b, &y)) != MP_OKAY) {
792    goto LBL_ERR;
793  }
794
795  /* 2. [modified] if x,y are both even then return an error! */
796  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
797    res = MP_VAL;
798    goto LBL_ERR;
799  }
800
801  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
802  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
803    goto LBL_ERR;
804  }
805  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
806    goto LBL_ERR;
807  }
808  mp_set (&A, 1);
809  mp_set (&D, 1);
810
811top:
812  /* 4.  while u is even do */
813  while (mp_iseven (&u) == 1) {
814    /* 4.1 u = u/2 */
815    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
816      goto LBL_ERR;
817    }
818    /* 4.2 if A or B is odd then */
819    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
820      /* A = (A+y)/2, B = (B-x)/2 */
821      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
822         goto LBL_ERR;
823      }
824      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
825         goto LBL_ERR;
826      }
827    }
828    /* A = A/2, B = B/2 */
829    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
830      goto LBL_ERR;
831    }
832    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
833      goto LBL_ERR;
834    }
835  }
836
837  /* 5.  while v is even do */
838  while (mp_iseven (&v) == 1) {
839    /* 5.1 v = v/2 */
840    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
841      goto LBL_ERR;
842    }
843    /* 5.2 if C or D is odd then */
844    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
845      /* C = (C+y)/2, D = (D-x)/2 */
846      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
847         goto LBL_ERR;
848      }
849      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
850         goto LBL_ERR;
851      }
852    }
853    /* C = C/2, D = D/2 */
854    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
855      goto LBL_ERR;
856    }
857    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
858      goto LBL_ERR;
859    }
860  }
861
862  /* 6.  if u >= v then */
863  if (mp_cmp (&u, &v) != MP_LT) {
864    /* u = u - v, A = A - C, B = B - D */
865    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
866      goto LBL_ERR;
867    }
868
869    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
870      goto LBL_ERR;
871    }
872
873    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
874      goto LBL_ERR;
875    }
876  } else {
877    /* v - v - u, C = C - A, D = D - B */
878    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
879      goto LBL_ERR;
880    }
881
882    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
883      goto LBL_ERR;
884    }
885
886    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
887      goto LBL_ERR;
888    }
889  }
890
891  /* if not zero goto step 4 */
892  if (mp_iszero (&u) == 0)
893    goto top;
894
895  /* now a = C, b = D, gcd == g*v */
896
897  /* if v != 1 then there is no inverse */
898  if (mp_cmp_d (&v, 1) != MP_EQ) {
899    res = MP_VAL;
900    goto LBL_ERR;
901  }
902
903  /* if its too low */
904  while (mp_cmp_d(&C, 0) == MP_LT) {
905      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
906         goto LBL_ERR;
907      }
908  }
909
910  /* too big */
911  while (mp_cmp_mag(&C, b) != MP_LT) {
912      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
913         goto LBL_ERR;
914      }
915  }
916
917  /* C is now the inverse */
918  mp_exch (&C, c);
919  res = MP_OKAY;
920LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
921  return res;
922}
923#endif /* LTM_NO_NEG_EXP */
924
925
926/* compare maginitude of two ints (unsigned) */
927static int mp_cmp_mag (mp_int * a, mp_int * b)
928{
929  int     n;
930  mp_digit *tmpa, *tmpb;
931
932  /* compare based on # of non-zero digits */
933  if (a->used > b->used) {
934    return MP_GT;
935  }
936
937  if (a->used < b->used) {
938    return MP_LT;
939  }
940
941  /* alias for a */
942  tmpa = a->dp + (a->used - 1);
943
944  /* alias for b */
945  tmpb = b->dp + (a->used - 1);
946
947  /* compare based on digits  */
948  for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
949    if (*tmpa > *tmpb) {
950      return MP_GT;
951    }
952
953    if (*tmpa < *tmpb) {
954      return MP_LT;
955    }
956  }
957  return MP_EQ;
958}
959
960
961/* reads a unsigned char array, assumes the msb is stored first [big endian] */
962static int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
963{
964  int     res;
965
966  /* make sure there are at least two digits */
967  if (a->alloc < 2) {
968     if ((res = mp_grow(a, 2)) != MP_OKAY) {
969        return res;
970     }
971  }
972
973  /* zero the int */
974  mp_zero (a);
975
976  /* read the bytes in */
977  while (c-- > 0) {
978    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
979      return res;
980    }
981
982#ifndef MP_8BIT
983      a->dp[0] |= *b++;
984      a->used += 1;
985#else
986      a->dp[0] = (*b & MP_MASK);
987      a->dp[1] |= ((*b++ >> 7U) & 1);
988      a->used += 2;
989#endif
990  }
991  mp_clamp (a);
992  return MP_OKAY;
993}
994
995
996/* store in unsigned [big endian] format */
997static int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
998{
999  int     x, res;
1000  mp_int  t;
1001
1002  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
1003    return res;
1004  }
1005
1006  x = 0;
1007  while (mp_iszero (&t) == 0) {
1008#ifndef MP_8BIT
1009      b[x++] = (unsigned char) (t.dp[0] & 255);
1010#else
1011      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
1012#endif
1013    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
1014      mp_clear (&t);
1015      return res;
1016    }
1017  }
1018  bn_reverse (b, x);
1019  mp_clear (&t);
1020  return MP_OKAY;
1021}
1022
1023
1024/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
1025static int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
1026{
1027  mp_digit D, r, rr;
1028  int     x, res;
1029  mp_int  t;
1030
1031
1032  /* if the shift count is <= 0 then we do no work */
1033  if (b <= 0) {
1034    res = mp_copy (a, c);
1035    if (d != NULL) {
1036      mp_zero (d);
1037    }
1038    return res;
1039  }
1040
1041  if ((res = mp_init (&t)) != MP_OKAY) {
1042    return res;
1043  }
1044
1045  /* get the remainder */
1046  if (d != NULL) {
1047    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
1048      mp_clear (&t);
1049      return res;
1050    }
1051  }
1052
1053  /* copy */
1054  if ((res = mp_copy (a, c)) != MP_OKAY) {
1055    mp_clear (&t);
1056    return res;
1057  }
1058
1059  /* shift by as many digits in the bit count */
1060  if (b >= (int)DIGIT_BIT) {
1061    mp_rshd (c, b / DIGIT_BIT);
1062  }
1063
1064  /* shift any bit count < DIGIT_BIT */
1065  D = (mp_digit) (b % DIGIT_BIT);
1066  if (D != 0) {
1067    register mp_digit *tmpc, mask, shift;
1068
1069    /* mask */
1070    mask = (((mp_digit)1) << D) - 1;
1071
1072    /* shift for lsb */
1073    shift = DIGIT_BIT - D;
1074
1075    /* alias */
1076    tmpc = c->dp + (c->used - 1);
1077
1078    /* carry */
1079    r = 0;
1080    for (x = c->used - 1; x >= 0; x--) {
1081      /* get the lower  bits of this word in a temp */
1082      rr = *tmpc & mask;
1083
1084      /* shift the current word and mix in the carry bits from the previous word */
1085      *tmpc = (*tmpc >> D) | (r << shift);
1086      --tmpc;
1087
1088      /* set the carry to the carry bits of the current word found above */
1089      r = rr;
1090    }
1091  }
1092  mp_clamp (c);
1093  if (d != NULL) {
1094    mp_exch (&t, d);
1095  }
1096  mp_clear (&t);
1097  return MP_OKAY;
1098}
1099
1100
1101static int mp_init_copy (mp_int * a, mp_int * b)
1102{
1103  int     res;
1104
1105  if ((res = mp_init (a)) != MP_OKAY) {
1106    return res;
1107  }
1108  return mp_copy (b, a);
1109}
1110
1111
1112/* set to zero */
1113static void mp_zero (mp_int * a)
1114{
1115  int       n;
1116  mp_digit *tmp;
1117
1118  a->sign = MP_ZPOS;
1119  a->used = 0;
1120
1121  tmp = a->dp;
1122  for (n = 0; n < a->alloc; n++) {
1123     *tmp++ = 0;
1124  }
1125}
1126
1127
1128/* copy, b = a */
1129static int mp_copy (mp_int * a, mp_int * b)
1130{
1131  int     res, n;
1132
1133  /* if dst == src do nothing */
1134  if (a == b) {
1135    return MP_OKAY;
1136  }
1137
1138  /* grow dest */
1139  if (b->alloc < a->used) {
1140     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1141        return res;
1142     }
1143  }
1144
1145  /* zero b and copy the parameters over */
1146  {
1147    register mp_digit *tmpa, *tmpb;
1148
1149    /* pointer aliases */
1150
1151    /* source */
1152    tmpa = a->dp;
1153
1154    /* destination */
1155    tmpb = b->dp;
1156
1157    /* copy all the digits */
1158    for (n = 0; n < a->used; n++) {
1159      *tmpb++ = *tmpa++;
1160    }
1161
1162    /* clear high digits */
1163    for (; n < b->used; n++) {
1164      *tmpb++ = 0;
1165    }
1166  }
1167
1168  /* copy used count and sign */
1169  b->used = a->used;
1170  b->sign = a->sign;
1171  return MP_OKAY;
1172}
1173
1174
1175/* shift right a certain amount of digits */
1176static void mp_rshd (mp_int * a, int b)
1177{
1178  int     x;
1179
1180  /* if b <= 0 then ignore it */
1181  if (b <= 0) {
1182    return;
1183  }
1184
1185  /* if b > used then simply zero it and return */
1186  if (a->used <= b) {
1187    mp_zero (a);
1188    return;
1189  }
1190
1191  {
1192    register mp_digit *bottom, *top;
1193
1194    /* shift the digits down */
1195
1196    /* bottom */
1197    bottom = a->dp;
1198
1199    /* top [offset into digits] */
1200    top = a->dp + b;
1201
1202    /* this is implemented as a sliding window where
1203     * the window is b-digits long and digits from
1204     * the top of the window are copied to the bottom
1205     *
1206     * e.g.
1207
1208     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
1209                 /\                   |      ---->
1210                  \-------------------/      ---->
1211     */
1212    for (x = 0; x < (a->used - b); x++) {
1213      *bottom++ = *top++;
1214    }
1215
1216    /* zero the top digits */
1217    for (; x < a->used; x++) {
1218      *bottom++ = 0;
1219    }
1220  }
1221
1222  /* remove excess digits */
1223  a->used -= b;
1224}
1225
1226
1227/* swap the elements of two integers, for cases where you can't simply swap the
1228 * mp_int pointers around
1229 */
1230static void mp_exch (mp_int * a, mp_int * b)
1231{
1232  mp_int  t;
1233
1234  t  = *a;
1235  *a = *b;
1236  *b = t;
1237}
1238
1239
1240/* trim unused digits
1241 *
1242 * This is used to ensure that leading zero digits are
1243 * trimed and the leading "used" digit will be non-zero
1244 * Typically very fast.  Also fixes the sign if there
1245 * are no more leading digits
1246 */
1247static void mp_clamp (mp_int * a)
1248{
1249  /* decrease used while the most significant digit is
1250   * zero.
1251   */
1252  while (a->used > 0 && a->dp[a->used - 1] == 0) {
1253    --(a->used);
1254  }
1255
1256  /* reset the sign flag if used == 0 */
1257  if (a->used == 0) {
1258    a->sign = MP_ZPOS;
1259  }
1260}
1261
1262
1263/* grow as required */
1264static int mp_grow (mp_int * a, int size)
1265{
1266  int     i;
1267  mp_digit *tmp;
1268
1269  /* if the alloc size is smaller alloc more ram */
1270  if (a->alloc < size) {
1271    /* ensure there are always at least MP_PREC digits extra on top */
1272    size += (MP_PREC * 2) - (size % MP_PREC);
1273
1274    /* reallocate the array a->dp
1275     *
1276     * We store the return in a temporary variable
1277     * in case the operation failed we don't want
1278     * to overwrite the dp member of a.
1279     */
1280    tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
1281    if (tmp == NULL) {
1282      /* reallocation failed but "a" is still valid [can be freed] */
1283      return MP_MEM;
1284    }
1285
1286    /* reallocation succeeded so set a->dp */
1287    a->dp = tmp;
1288
1289    /* zero excess digits */
1290    i        = a->alloc;
1291    a->alloc = size;
1292    for (; i < a->alloc; i++) {
1293      a->dp[i] = 0;
1294    }
1295  }
1296  return MP_OKAY;
1297}
1298
1299
1300#ifdef BN_MP_ABS_C
1301/* b = |a|
1302 *
1303 * Simple function copies the input and fixes the sign to positive
1304 */
1305static int mp_abs (mp_int * a, mp_int * b)
1306{
1307  int     res;
1308
1309  /* copy a to b */
1310  if (a != b) {
1311     if ((res = mp_copy (a, b)) != MP_OKAY) {
1312       return res;
1313     }
1314  }
1315
1316  /* force the sign of b to positive */
1317  b->sign = MP_ZPOS;
1318
1319  return MP_OKAY;
1320}
1321#endif
1322
1323
1324/* set to a digit */
1325static void mp_set (mp_int * a, mp_digit b)
1326{
1327  mp_zero (a);
1328  a->dp[0] = b & MP_MASK;
1329  a->used  = (a->dp[0] != 0) ? 1 : 0;
1330}
1331
1332
1333#ifndef LTM_NO_NEG_EXP
1334/* b = a/2 */
1335static int mp_div_2(mp_int * a, mp_int * b)
1336{
1337  int     x, res, oldused;
1338
1339  /* copy */
1340  if (b->alloc < a->used) {
1341    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1342      return res;
1343    }
1344  }
1345
1346  oldused = b->used;
1347  b->used = a->used;
1348  {
1349    register mp_digit r, rr, *tmpa, *tmpb;
1350
1351    /* source alias */
1352    tmpa = a->dp + b->used - 1;
1353
1354    /* dest alias */
1355    tmpb = b->dp + b->used - 1;
1356
1357    /* carry */
1358    r = 0;
1359    for (x = b->used - 1; x >= 0; x--) {
1360      /* get the carry for the next iteration */
1361      rr = *tmpa & 1;
1362
1363      /* shift the current digit, add in carry and store */
1364      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1365
1366      /* forward carry to next iteration */
1367      r = rr;
1368    }
1369
1370    /* zero excess digits */
1371    tmpb = b->dp + b->used;
1372    for (x = b->used; x < oldused; x++) {
1373      *tmpb++ = 0;
1374    }
1375  }
1376  b->sign = a->sign;
1377  mp_clamp (b);
1378  return MP_OKAY;
1379}
1380#endif /* LTM_NO_NEG_EXP */
1381
1382
1383/* shift left by a certain bit count */
1384static int mp_mul_2d (mp_int * a, int b, mp_int * c)
1385{
1386  mp_digit d;
1387  int      res;
1388
1389  /* copy */
1390  if (a != c) {
1391     if ((res = mp_copy (a, c)) != MP_OKAY) {
1392       return res;
1393     }
1394  }
1395
1396  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
1397     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
1398       return res;
1399     }
1400  }
1401
1402  /* shift by as many digits in the bit count */
1403  if (b >= (int)DIGIT_BIT) {
1404    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
1405      return res;
1406    }
1407  }
1408
1409  /* shift any bit count < DIGIT_BIT */
1410  d = (mp_digit) (b % DIGIT_BIT);
1411  if (d != 0) {
1412    register mp_digit *tmpc, shift, mask, r, rr;
1413    register int x;
1414
1415    /* bitmask for carries */
1416    mask = (((mp_digit)1) << d) - 1;
1417
1418    /* shift for msbs */
1419    shift = DIGIT_BIT - d;
1420
1421    /* alias */
1422    tmpc = c->dp;
1423
1424    /* carry */
1425    r    = 0;
1426    for (x = 0; x < c->used; x++) {
1427      /* get the higher bits of the current word */
1428      rr = (*tmpc >> shift) & mask;
1429
1430      /* shift the current word and OR in the carry */
1431      *tmpc = ((*tmpc << d) | r) & MP_MASK;
1432      ++tmpc;
1433
1434      /* set the carry to the carry bits of the current word */
1435      r = rr;
1436    }
1437
1438    /* set final carry */
1439    if (r != 0) {
1440       c->dp[(c->used)++] = r;
1441    }
1442  }
1443  mp_clamp (c);
1444  return MP_OKAY;
1445}
1446
1447
1448#ifdef BN_MP_INIT_MULTI_C
1449static int mp_init_multi(mp_int *mp, ...)
1450{
1451    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
1452    int n = 0;                 /* Number of ok inits */
1453    mp_int* cur_arg = mp;
1454    va_list args;
1455
1456    va_start(args, mp);        /* init args to next argument from caller */
1457    while (cur_arg != NULL) {
1458        if (mp_init(cur_arg) != MP_OKAY) {
1459            /* Oops - error! Back-track and mp_clear what we already
1460               succeeded in init-ing, then return error.
1461            */
1462            va_list clean_args;
1463
1464            /* end the current list */
1465            va_end(args);
1466
1467            /* now start cleaning up */
1468            cur_arg = mp;
1469            va_start(clean_args, mp);
1470            while (n--) {
1471                mp_clear(cur_arg);
1472                cur_arg = va_arg(clean_args, mp_int*);
1473            }
1474            va_end(clean_args);
1475            res = MP_MEM;
1476            break;
1477        }
1478        n++;
1479        cur_arg = va_arg(args, mp_int*);
1480    }
1481    va_end(args);
1482    return res;                /* Assumed ok, if error flagged above. */
1483}
1484#endif
1485
1486
1487#ifdef BN_MP_CLEAR_MULTI_C
1488static void mp_clear_multi(mp_int *mp, ...)
1489{
1490    mp_int* next_mp = mp;
1491    va_list args;
1492    va_start(args, mp);
1493    while (next_mp != NULL) {
1494        mp_clear(next_mp);
1495        next_mp = va_arg(args, mp_int*);
1496    }
1497    va_end(args);
1498}
1499#endif
1500
1501
1502/* shift left a certain amount of digits */
1503static int mp_lshd (mp_int * a, int b)
1504{
1505  int     x, res;
1506
1507  /* if its less than zero return */
1508  if (b <= 0) {
1509    return MP_OKAY;
1510  }
1511
1512  /* grow to fit the new digits */
1513  if (a->alloc < a->used + b) {
1514     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
1515       return res;
1516     }
1517  }
1518
1519  {
1520    register mp_digit *top, *bottom;
1521
1522    /* increment the used by the shift amount then copy upwards */
1523    a->used += b;
1524
1525    /* top */
1526    top = a->dp + a->used - 1;
1527
1528    /* base */
1529    bottom = a->dp + a->used - 1 - b;
1530
1531    /* much like mp_rshd this is implemented using a sliding window
1532     * except the window goes the otherway around.  Copying from
1533     * the bottom to the top.  see bn_mp_rshd.c for more info.
1534     */
1535    for (x = a->used - 1; x >= b; x--) {
1536      *top-- = *bottom--;
1537    }
1538
1539    /* zero the lower digits */
1540    top = a->dp;
1541    for (x = 0; x < b; x++) {
1542      *top++ = 0;
1543    }
1544  }
1545  return MP_OKAY;
1546}
1547
1548
1549/* returns the number of bits in an int */
1550static int mp_count_bits (mp_int * a)
1551{
1552  int     r;
1553  mp_digit q;
1554
1555  /* shortcut */
1556  if (a->used == 0) {
1557    return 0;
1558  }
1559
1560  /* get number of digits and add that */
1561  r = (a->used - 1) * DIGIT_BIT;
1562
1563  /* take the last digit and count the bits in it */
1564  q = a->dp[a->used - 1];
1565  while (q > ((mp_digit) 0)) {
1566    ++r;
1567    q >>= ((mp_digit) 1);
1568  }
1569  return r;
1570}
1571
1572
1573/* calc a value mod 2**b */
1574static int mp_mod_2d (mp_int * a, int b, mp_int * c)
1575{
1576  int     x, res;
1577
1578  /* if b is <= 0 then zero the int */
1579  if (b <= 0) {
1580    mp_zero (c);
1581    return MP_OKAY;
1582  }
1583
1584  /* if the modulus is larger than the value than return */
1585  if (b >= (int) (a->used * DIGIT_BIT)) {
1586    res = mp_copy (a, c);
1587    return res;
1588  }
1589
1590  /* copy */
1591  if ((res = mp_copy (a, c)) != MP_OKAY) {
1592    return res;
1593  }
1594
1595  /* zero digits above the last digit of the modulus */
1596  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
1597    c->dp[x] = 0;
1598  }
1599  /* clear the digit that is not completely outside/inside the modulus */
1600  c->dp[b / DIGIT_BIT] &=
1601    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
1602  mp_clamp (c);
1603  return MP_OKAY;
1604}
1605
1606
1607#ifdef BN_MP_DIV_SMALL
1608
1609/* slower bit-bang division... also smaller */
1610static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1611{
1612   mp_int ta, tb, tq, q;
1613   int    res, n, n2;
1614
1615  /* is divisor zero ? */
1616  if (mp_iszero (b) == 1) {
1617    return MP_VAL;
1618  }
1619
1620  /* if a < b then q=0, r = a */
1621  if (mp_cmp_mag (a, b) == MP_LT) {
1622    if (d != NULL) {
1623      res = mp_copy (a, d);
1624    } else {
1625      res = MP_OKAY;
1626    }
1627    if (c != NULL) {
1628      mp_zero (c);
1629    }
1630    return res;
1631  }
1632
1633  /* init our temps */
1634  if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL) != MP_OKAY)) {
1635     return res;
1636  }
1637
1638
1639  mp_set(&tq, 1);
1640  n = mp_count_bits(a) - mp_count_bits(b);
1641  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
1642      ((res = mp_abs(b, &tb)) != MP_OKAY) ||
1643      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
1644      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
1645      goto LBL_ERR;
1646  }
1647
1648  while (n-- >= 0) {
1649     if (mp_cmp(&tb, &ta) != MP_GT) {
1650        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
1651            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
1652           goto LBL_ERR;
1653        }
1654     }
1655     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
1656         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
1657           goto LBL_ERR;
1658     }
1659  }
1660
1661  /* now q == quotient and ta == remainder */
1662  n  = a->sign;
1663  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
1664  if (c != NULL) {
1665     mp_exch(c, &q);
1666     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
1667  }
1668  if (d != NULL) {
1669     mp_exch(d, &ta);
1670     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
1671  }
1672LBL_ERR:
1673   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
1674   return res;
1675}
1676
1677#else
1678
1679/* integer signed division.
1680 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1681 * HAC pp.598 Algorithm 14.20
1682 *
1683 * Note that the description in HAC is horribly
1684 * incomplete.  For example, it doesn't consider
1685 * the case where digits are removed from 'x' in
1686 * the inner loop.  It also doesn't consider the
1687 * case that y has fewer than three digits, etc..
1688 *
1689 * The overall algorithm is as described as
1690 * 14.20 from HAC but fixed to treat these cases.
1691*/
1692static int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1693{
1694  mp_int  q, x, y, t1, t2;
1695  int     res, n, t, i, norm, neg;
1696
1697  /* is divisor zero ? */
1698  if (mp_iszero (b) == 1) {
1699    return MP_VAL;
1700  }
1701
1702  /* if a < b then q=0, r = a */
1703  if (mp_cmp_mag (a, b) == MP_LT) {
1704    if (d != NULL) {
1705      res = mp_copy (a, d);
1706    } else {
1707      res = MP_OKAY;
1708    }
1709    if (c != NULL) {
1710      mp_zero (c);
1711    }
1712    return res;
1713  }
1714
1715  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1716    return res;
1717  }
1718  q.used = a->used + 2;
1719
1720  if ((res = mp_init (&t1)) != MP_OKAY) {
1721    goto LBL_Q;
1722  }
1723
1724  if ((res = mp_init (&t2)) != MP_OKAY) {
1725    goto LBL_T1;
1726  }
1727
1728  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1729    goto LBL_T2;
1730  }
1731
1732  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1733    goto LBL_X;
1734  }
1735
1736  /* fix the sign */
1737  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1738  x.sign = y.sign = MP_ZPOS;
1739
1740  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1741  norm = mp_count_bits(&y) % DIGIT_BIT;
1742  if (norm < (int)(DIGIT_BIT-1)) {
1743     norm = (DIGIT_BIT-1) - norm;
1744     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1745       goto LBL_Y;
1746     }
1747     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1748       goto LBL_Y;
1749     }
1750  } else {
1751     norm = 0;
1752  }
1753
1754  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1755  n = x.used - 1;
1756  t = y.used - 1;
1757
1758  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1759  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1760    goto LBL_Y;
1761  }
1762
1763  while (mp_cmp (&x, &y) != MP_LT) {
1764    ++(q.dp[n - t]);
1765    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1766      goto LBL_Y;
1767    }
1768  }
1769
1770  /* reset y by shifting it back down */
1771  mp_rshd (&y, n - t);
1772
1773  /* step 3. for i from n down to (t + 1) */
1774  for (i = n; i >= (t + 1); i--) {
1775    if (i > x.used) {
1776      continue;
1777    }
1778
1779    /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1780     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1781    if (x.dp[i] == y.dp[t]) {
1782      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1783    } else {
1784      mp_word tmp;
1785      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1786      tmp |= ((mp_word) x.dp[i - 1]);
1787      tmp /= ((mp_word) y.dp[t]);
1788      if (tmp > (mp_word) MP_MASK)
1789        tmp = MP_MASK;
1790      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1791    }
1792
1793    /* while (q{i-t-1} * (yt * b + y{t-1})) >
1794             xi * b**2 + xi-1 * b + xi-2
1795
1796       do q{i-t-1} -= 1;
1797    */
1798    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1799    do {
1800      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1801
1802      /* find left hand */
1803      mp_zero (&t1);
1804      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1805      t1.dp[1] = y.dp[t];
1806      t1.used = 2;
1807      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1808        goto LBL_Y;
1809      }
1810
1811      /* find right hand */
1812      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1813      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1814      t2.dp[2] = x.dp[i];
1815      t2.used = 3;
1816    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1817
1818    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1819    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1820      goto LBL_Y;
1821    }
1822
1823    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1824      goto LBL_Y;
1825    }
1826
1827    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1828      goto LBL_Y;
1829    }
1830
1831    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1832    if (x.sign == MP_NEG) {
1833      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1834        goto LBL_Y;
1835      }
1836      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1837        goto LBL_Y;
1838      }
1839      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1840        goto LBL_Y;
1841      }
1842
1843      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1844    }
1845  }
1846
1847  /* now q is the quotient and x is the remainder
1848   * [which we have to normalize]
1849   */
1850
1851  /* get sign before writing to c */
1852  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1853
1854  if (c != NULL) {
1855    mp_clamp (&q);
1856    mp_exch (&q, c);
1857    c->sign = neg;
1858  }
1859
1860  if (d != NULL) {
1861    mp_div_2d (&x, norm, &x, NULL);
1862    mp_exch (&x, d);
1863  }
1864
1865  res = MP_OKAY;
1866
1867LBL_Y:mp_clear (&y);
1868LBL_X:mp_clear (&x);
1869LBL_T2:mp_clear (&t2);
1870LBL_T1:mp_clear (&t1);
1871LBL_Q:mp_clear (&q);
1872  return res;
1873}
1874
1875#endif
1876
1877
1878#ifdef MP_LOW_MEM
1879   #define TAB_SIZE 32
1880#else
1881   #define TAB_SIZE 256
1882#endif
1883
1884static int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
1885{
1886  mp_int  M[TAB_SIZE], res, mu;
1887  mp_digit buf;
1888  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1889  int (*redux)(mp_int*,mp_int*,mp_int*);
1890
1891  /* find window size */
1892  x = mp_count_bits (X);
1893  if (x <= 7) {
1894    winsize = 2;
1895  } else if (x <= 36) {
1896    winsize = 3;
1897  } else if (x <= 140) {
1898    winsize = 4;
1899  } else if (x <= 450) {
1900    winsize = 5;
1901  } else if (x <= 1303) {
1902    winsize = 6;
1903  } else if (x <= 3529) {
1904    winsize = 7;
1905  } else {
1906    winsize = 8;
1907  }
1908
1909#ifdef MP_LOW_MEM
1910    if (winsize > 5) {
1911       winsize = 5;
1912    }
1913#endif
1914
1915  /* init M array */
1916  /* init first cell */
1917  if ((err = mp_init(&M[1])) != MP_OKAY) {
1918     return err;
1919  }
1920
1921  /* now init the second half of the array */
1922  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1923    if ((err = mp_init(&M[x])) != MP_OKAY) {
1924      for (y = 1<<(winsize-1); y < x; y++) {
1925        mp_clear (&M[y]);
1926      }
1927      mp_clear(&M[1]);
1928      return err;
1929    }
1930  }
1931
1932  /* create mu, used for Barrett reduction */
1933  if ((err = mp_init (&mu)) != MP_OKAY) {
1934    goto LBL_M;
1935  }
1936
1937  if (redmode == 0) {
1938     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
1939        goto LBL_MU;
1940     }
1941     redux = mp_reduce;
1942  } else {
1943     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
1944        goto LBL_MU;
1945     }
1946     redux = mp_reduce_2k_l;
1947  }
1948
1949  /* create M table
1950   *
1951   * The M table contains powers of the base,
1952   * e.g. M[x] = G**x mod P
1953   *
1954   * The first half of the table is not
1955   * computed though accept for M[0] and M[1]
1956   */
1957  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
1958    goto LBL_MU;
1959  }
1960
1961  /* compute the value at M[1<<(winsize-1)] by squaring
1962   * M[1] (winsize-1) times
1963   */
1964  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
1965    goto LBL_MU;
1966  }
1967
1968  for (x = 0; x < (winsize - 1); x++) {
1969    /* square it */
1970    if ((err = mp_sqr (&M[1 << (winsize - 1)],
1971                       &M[1 << (winsize - 1)])) != MP_OKAY) {
1972      goto LBL_MU;
1973    }
1974
1975    /* reduce modulo P */
1976    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
1977      goto LBL_MU;
1978    }
1979  }
1980
1981  /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
1982   * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
1983   */
1984  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
1985    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
1986      goto LBL_MU;
1987    }
1988    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
1989      goto LBL_MU;
1990    }
1991  }
1992
1993  /* setup result */
1994  if ((err = mp_init (&res)) != MP_OKAY) {
1995    goto LBL_MU;
1996  }
1997  mp_set (&res, 1);
1998
1999  /* set initial mode and bit cnt */
2000  mode   = 0;
2001  bitcnt = 1;
2002  buf    = 0;
2003  digidx = X->used - 1;
2004  bitcpy = 0;
2005  bitbuf = 0;
2006
2007  for (;;) {
2008    /* grab next digit as required */
2009    if (--bitcnt == 0) {
2010      /* if digidx == -1 we are out of digits */
2011      if (digidx == -1) {
2012        break;
2013      }
2014      /* read next digit and reset the bitcnt */
2015      buf    = X->dp[digidx--];
2016      bitcnt = (int) DIGIT_BIT;
2017    }
2018
2019    /* grab the next msb from the exponent */
2020    y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
2021    buf <<= (mp_digit)1;
2022
2023    /* if the bit is zero and mode == 0 then we ignore it
2024     * These represent the leading zero bits before the first 1 bit
2025     * in the exponent.  Technically this opt is not required but it
2026     * does lower the # of trivial squaring/reductions used
2027     */
2028    if (mode == 0 && y == 0) {
2029      continue;
2030    }
2031
2032    /* if the bit is zero and mode == 1 then we square */
2033    if (mode == 1 && y == 0) {
2034      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2035        goto LBL_RES;
2036      }
2037      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2038        goto LBL_RES;
2039      }
2040      continue;
2041    }
2042
2043    /* else we add it to the window */
2044    bitbuf |= (y << (winsize - ++bitcpy));
2045    mode    = 2;
2046
2047    if (bitcpy == winsize) {
2048      /* ok window is filled so square as required and multiply  */
2049      /* square first */
2050      for (x = 0; x < winsize; x++) {
2051        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2052          goto LBL_RES;
2053        }
2054        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2055          goto LBL_RES;
2056        }
2057      }
2058
2059      /* then multiply */
2060      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
2061        goto LBL_RES;
2062      }
2063      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2064        goto LBL_RES;
2065      }
2066
2067      /* empty window and reset */
2068      bitcpy = 0;
2069      bitbuf = 0;
2070      mode   = 1;
2071    }
2072  }
2073
2074  /* if bits remain then square/multiply */
2075  if (mode == 2 && bitcpy > 0) {
2076    /* square then multiply if the bit is set */
2077    for (x = 0; x < bitcpy; x++) {
2078      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2079        goto LBL_RES;
2080      }
2081      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2082        goto LBL_RES;
2083      }
2084
2085      bitbuf <<= 1;
2086      if ((bitbuf & (1 << winsize)) != 0) {
2087        /* then multiply */
2088        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
2089          goto LBL_RES;
2090        }
2091        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2092          goto LBL_RES;
2093        }
2094      }
2095    }
2096  }
2097
2098  mp_exch (&res, Y);
2099  err = MP_OKAY;
2100LBL_RES:mp_clear (&res);
2101LBL_MU:mp_clear (&mu);
2102LBL_M:
2103  mp_clear(&M[1]);
2104  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
2105    mp_clear (&M[x]);
2106  }
2107  return err;
2108}
2109
2110
2111/* computes b = a*a */
2112static int mp_sqr (mp_int * a, mp_int * b)
2113{
2114  int     res;
2115
2116#ifdef BN_MP_TOOM_SQR_C
2117  /* use Toom-Cook? */
2118  if (a->used >= TOOM_SQR_CUTOFF) {
2119    res = mp_toom_sqr(a, b);
2120  /* Karatsuba? */
2121  } else
2122#endif
2123#ifdef BN_MP_KARATSUBA_SQR_C
2124if (a->used >= KARATSUBA_SQR_CUTOFF) {
2125    res = mp_karatsuba_sqr (a, b);
2126  } else
2127#endif
2128  {
2129#ifdef BN_FAST_S_MP_SQR_C
2130    /* can we use the fast comba multiplier? */
2131    if ((a->used * 2 + 1) < MP_WARRAY &&
2132         a->used <
2133         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
2134      res = fast_s_mp_sqr (a, b);
2135    } else
2136#endif
2137#ifdef BN_S_MP_SQR_C
2138      res = s_mp_sqr (a, b);
2139#else
2140#error mp_sqr could fail
2141      res = MP_VAL;
2142#endif
2143  }
2144  b->sign = MP_ZPOS;
2145  return res;
2146}
2147
2148
2149/* reduces a modulo n where n is of the form 2**p - d
2150   This differs from reduce_2k since "d" can be larger
2151   than a single digit.
2152*/
2153static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
2154{
2155   mp_int q;
2156   int    p, res;
2157
2158   if ((res = mp_init(&q)) != MP_OKAY) {
2159      return res;
2160   }
2161
2162   p = mp_count_bits(n);
2163top:
2164   /* q = a/2**p, a = a mod 2**p */
2165   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
2166      goto ERR;
2167   }
2168
2169   /* q = q * d */
2170   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
2171      goto ERR;
2172   }
2173
2174   /* a = a + q */
2175   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
2176      goto ERR;
2177   }
2178
2179   if (mp_cmp_mag(a, n) != MP_LT) {
2180      s_mp_sub(a, n, a);
2181      goto top;
2182   }
2183
2184ERR:
2185   mp_clear(&q);
2186   return res;
2187}
2188
2189
2190/* determines the setup value */
2191static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
2192{
2193   int    res;
2194   mp_int tmp;
2195
2196   if ((res = mp_init(&tmp)) != MP_OKAY) {
2197      return res;
2198   }
2199
2200   if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
2201      goto ERR;
2202   }
2203
2204   if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
2205      goto ERR;
2206   }
2207
2208ERR:
2209   mp_clear(&tmp);
2210   return res;
2211}
2212
2213
2214/* computes a = 2**b
2215 *
2216 * Simple algorithm which zeroes the int, grows it then just sets one bit
2217 * as required.
2218 */
2219static int mp_2expt (mp_int * a, int b)
2220{
2221  int     res;
2222
2223  /* zero a as per default */
2224  mp_zero (a);
2225
2226  /* grow a to accommodate the single bit */
2227  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
2228    return res;
2229  }
2230
2231  /* set the used count of where the bit will go */
2232  a->used = b / DIGIT_BIT + 1;
2233
2234  /* put the single bit in its place */
2235  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
2236
2237  return MP_OKAY;
2238}
2239
2240
2241/* pre-calculate the value required for Barrett reduction
2242 * For a given modulus "b" it calulates the value required in "a"
2243 */
2244static int mp_reduce_setup (mp_int * a, mp_int * b)
2245{
2246  int     res;
2247
2248  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
2249    return res;
2250  }
2251  return mp_div (a, b, a, NULL);
2252}
2253
2254
2255/* reduces x mod m, assumes 0 < x < m**2, mu is
2256 * precomputed via mp_reduce_setup.
2257 * From HAC pp.604 Algorithm 14.42
2258 */
2259static int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
2260{
2261  mp_int  q;
2262  int     res, um = m->used;
2263
2264  /* q = x */
2265  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
2266    return res;
2267  }
2268
2269  /* q1 = x / b**(k-1)  */
2270  mp_rshd (&q, um - 1);
2271
2272  /* according to HAC this optimization is ok */
2273  if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
2274    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
2275      goto CLEANUP;
2276    }
2277  } else {
2278#ifdef BN_S_MP_MUL_HIGH_DIGS_C
2279    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
2280      goto CLEANUP;
2281    }
2282#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
2283    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
2284      goto CLEANUP;
2285    }
2286#else
2287    {
2288#error mp_reduce would always fail
2289      res = MP_VAL;
2290      goto CLEANUP;
2291    }
2292#endif
2293  }
2294
2295  /* q3 = q2 / b**(k+1) */
2296  mp_rshd (&q, um + 1);
2297
2298  /* x = x mod b**(k+1), quick (no division) */
2299  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
2300    goto CLEANUP;
2301  }
2302
2303  /* q = q * m mod b**(k+1), quick (no division) */
2304  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
2305    goto CLEANUP;
2306  }
2307
2308  /* x = x - q */
2309  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
2310    goto CLEANUP;
2311  }
2312
2313  /* If x < 0, add b**(k+1) to it */
2314  if (mp_cmp_d (x, 0) == MP_LT) {
2315    mp_set (&q, 1);
2316    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY) {
2317      goto CLEANUP;
2318    }
2319    if ((res = mp_add (x, &q, x)) != MP_OKAY) {
2320      goto CLEANUP;
2321    }
2322  }
2323
2324  /* Back off if it's too big */
2325  while (mp_cmp (x, m) != MP_LT) {
2326    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
2327      goto CLEANUP;
2328    }
2329  }
2330
2331CLEANUP:
2332  mp_clear (&q);
2333
2334  return res;
2335}
2336
2337
2338/* multiplies |a| * |b| and only computes up to digs digits of result
2339 * HAC pp. 595, Algorithm 14.12  Modified so you can control how
2340 * many digits of output are created.
2341 */
2342static int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2343{
2344  mp_int  t;
2345  int     res, pa, pb, ix, iy;
2346  mp_digit u;
2347  mp_word r;
2348  mp_digit tmpx, *tmpt, *tmpy;
2349
2350#ifdef BN_FAST_S_MP_MUL_DIGS_C
2351  /* can we use the fast multiplier? */
2352  if (((digs) < MP_WARRAY) &&
2353      MIN (a->used, b->used) <
2354          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2355    return fast_s_mp_mul_digs (a, b, c, digs);
2356  }
2357#endif
2358
2359  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
2360    return res;
2361  }
2362  t.used = digs;
2363
2364  /* compute the digits of the product directly */
2365  pa = a->used;
2366  for (ix = 0; ix < pa; ix++) {
2367    /* set the carry to zero */
2368    u = 0;
2369
2370    /* limit ourselves to making digs digits of output */
2371    pb = MIN (b->used, digs - ix);
2372
2373    /* setup some aliases */
2374    /* copy of the digit from a used within the nested loop */
2375    tmpx = a->dp[ix];
2376
2377    /* an alias for the destination shifted ix places */
2378    tmpt = t.dp + ix;
2379
2380    /* an alias for the digits of b */
2381    tmpy = b->dp;
2382
2383    /* compute the columns of the output and propagate the carry */
2384    for (iy = 0; iy < pb; iy++) {
2385      /* compute the column as a mp_word */
2386      r       = ((mp_word)*tmpt) +
2387                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2388                ((mp_word) u);
2389
2390      /* the new column is the lower part of the result */
2391      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2392
2393      /* get the carry word from the result */
2394      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2395    }
2396    /* set carry if it is placed below digs */
2397    if (ix + iy < digs) {
2398      *tmpt = u;
2399    }
2400  }
2401
2402  mp_clamp (&t);
2403  mp_exch (&t, c);
2404
2405  mp_clear (&t);
2406  return MP_OKAY;
2407}
2408
2409
2410#ifdef BN_FAST_S_MP_MUL_DIGS_C
2411/* Fast (comba) multiplier
2412 *
2413 * This is the fast column-array [comba] multiplier.  It is
2414 * designed to compute the columns of the product first
2415 * then handle the carries afterwards.  This has the effect
2416 * of making the nested loops that compute the columns very
2417 * simple and schedulable on super-scalar processors.
2418 *
2419 * This has been modified to produce a variable number of
2420 * digits of output so if say only a half-product is required
2421 * you don't have to compute the upper half (a feature
2422 * required for fast Barrett reduction).
2423 *
2424 * Based on Algorithm 14.12 on pp.595 of HAC.
2425 *
2426 */
2427static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2428{
2429  int     olduse, res, pa, ix, iz;
2430  mp_digit W[MP_WARRAY];
2431  register mp_word  _W;
2432
2433  /* grow the destination as required */
2434  if (c->alloc < digs) {
2435    if ((res = mp_grow (c, digs)) != MP_OKAY) {
2436      return res;
2437    }
2438  }
2439
2440  /* number of output digits to produce */
2441  pa = MIN(digs, a->used + b->used);
2442
2443  /* clear the carry */
2444  _W = 0;
2445  for (ix = 0; ix < pa; ix++) {
2446      int      tx, ty;
2447      int      iy;
2448      mp_digit *tmpx, *tmpy;
2449
2450      /* get offsets into the two bignums */
2451      ty = MIN(b->used-1, ix);
2452      tx = ix - ty;
2453
2454      /* setup temp aliases */
2455      tmpx = a->dp + tx;
2456      tmpy = b->dp + ty;
2457
2458      /* this is the number of times the loop will iterrate, essentially
2459         while (tx++ < a->used && ty-- >= 0) { ... }
2460       */
2461      iy = MIN(a->used-tx, ty+1);
2462
2463      /* execute loop */
2464      for (iz = 0; iz < iy; ++iz) {
2465         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2466
2467      }
2468
2469      /* store term */
2470      W[ix] = ((mp_digit)_W) & MP_MASK;
2471
2472      /* make next carry */
2473      _W = _W >> ((mp_word)DIGIT_BIT);
2474 }
2475
2476  /* setup dest */
2477  olduse  = c->used;
2478  c->used = pa;
2479
2480  {
2481    register mp_digit *tmpc;
2482    tmpc = c->dp;
2483    for (ix = 0; ix < pa+1; ix++) {
2484      /* now extract the previous digit [below the carry] */
2485      *tmpc++ = W[ix];
2486    }
2487
2488    /* clear unused digits [that existed in the old copy of c] */
2489    for (; ix < olduse; ix++) {
2490      *tmpc++ = 0;
2491    }
2492  }
2493  mp_clamp (c);
2494  return MP_OKAY;
2495}
2496#endif /* BN_FAST_S_MP_MUL_DIGS_C */
2497
2498
2499/* init an mp_init for a given size */
2500static int mp_init_size (mp_int * a, int size)
2501{
2502  int x;
2503
2504  /* pad size so there are always extra digits */
2505  size += (MP_PREC * 2) - (size % MP_PREC);
2506
2507  /* alloc mem */
2508  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
2509  if (a->dp == NULL) {
2510    return MP_MEM;
2511  }
2512
2513  /* set the members */
2514  a->used  = 0;
2515  a->alloc = size;
2516  a->sign  = MP_ZPOS;
2517
2518  /* zero the digits */
2519  for (x = 0; x < size; x++) {
2520      a->dp[x] = 0;
2521  }
2522
2523  return MP_OKAY;
2524}
2525
2526
2527/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
2528static int s_mp_sqr (mp_int * a, mp_int * b)
2529{
2530  mp_int  t;
2531  int     res, ix, iy, pa;
2532  mp_word r;
2533  mp_digit u, tmpx, *tmpt;
2534
2535  pa = a->used;
2536  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
2537    return res;
2538  }
2539
2540  /* default used is maximum possible size */
2541  t.used = 2*pa + 1;
2542
2543  for (ix = 0; ix < pa; ix++) {
2544    /* first calculate the digit at 2*ix */
2545    /* calculate double precision result */
2546    r = ((mp_word) t.dp[2*ix]) +
2547        ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
2548
2549    /* store lower part in result */
2550    t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
2551
2552    /* get the carry */
2553    u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2554
2555    /* left hand side of A[ix] * A[iy] */
2556    tmpx        = a->dp[ix];
2557
2558    /* alias for where to store the results */
2559    tmpt        = t.dp + (2*ix + 1);
2560
2561    for (iy = ix + 1; iy < pa; iy++) {
2562      /* first calculate the product */
2563      r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
2564
2565      /* now calculate the double precision result, note we use
2566       * addition instead of *2 since it's easier to optimize
2567       */
2568      r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
2569
2570      /* store lower part */
2571      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2572
2573      /* get carry */
2574      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2575    }
2576    /* propagate upwards */
2577    while (u != ((mp_digit) 0)) {
2578      r       = ((mp_word) *tmpt) + ((mp_word) u);
2579      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2580      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2581    }
2582  }
2583
2584  mp_clamp (&t);
2585  mp_exch (&t, b);
2586  mp_clear (&t);
2587  return MP_OKAY;
2588}
2589
2590
2591/* multiplies |a| * |b| and does not compute the lower digs digits
2592 * [meant to get the higher part of the product]
2593 */
2594static int s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2595{
2596  mp_int  t;
2597  int     res, pa, pb, ix, iy;
2598  mp_digit u;
2599  mp_word r;
2600  mp_digit tmpx, *tmpt, *tmpy;
2601
2602  /* can we use the fast multiplier? */
2603#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
2604  if (((a->used + b->used + 1) < MP_WARRAY)
2605      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2606    return fast_s_mp_mul_high_digs (a, b, c, digs);
2607  }
2608#endif
2609
2610  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
2611    return res;
2612  }
2613  t.used = a->used + b->used + 1;
2614
2615  pa = a->used;
2616  pb = b->used;
2617  for (ix = 0; ix < pa; ix++) {
2618    /* clear the carry */
2619    u = 0;
2620
2621    /* left hand side of A[ix] * B[iy] */
2622    tmpx = a->dp[ix];
2623
2624    /* alias to the address of where the digits will be stored */
2625    tmpt = &(t.dp[digs]);
2626
2627    /* alias for where to read the right hand side from */
2628    tmpy = b->dp + (digs - ix);
2629
2630    for (iy = digs - ix; iy < pb; iy++) {
2631      /* calculate the double precision result */
2632      r       = ((mp_word)*tmpt) +
2633                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2634                ((mp_word) u);
2635
2636      /* get the lower part */
2637      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2638
2639      /* carry the carry */
2640      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2641    }
2642    *tmpt = u;
2643  }
2644  mp_clamp (&t);
2645  mp_exch (&t, c);
2646  mp_clear (&t);
2647  return MP_OKAY;
2648}
2649
2650
2651#ifdef BN_MP_MONTGOMERY_SETUP_C
2652/* setups the montgomery reduction stuff */
2653static int
2654mp_montgomery_setup (mp_int * n, mp_digit * rho)
2655{
2656  mp_digit x, b;
2657
2658/* fast inversion mod 2**k
2659 *
2660 * Based on the fact that
2661 *
2662 * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
2663 *                    =>  2*X*A - X*X*A*A = 1
2664 *                    =>  2*(1) - (1)     = 1
2665 */
2666  b = n->dp[0];
2667
2668  if ((b & 1) == 0) {
2669    return MP_VAL;
2670  }
2671
2672  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
2673  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
2674#if !defined(MP_8BIT)
2675  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
2676#endif
2677#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
2678  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
2679#endif
2680#ifdef MP_64BIT
2681  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
2682#endif
2683
2684  /* rho = -1/m mod b */
2685  *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
2686
2687  return MP_OKAY;
2688}
2689#endif
2690
2691
2692#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
2693/* computes xR**-1 == x (mod N) via Montgomery Reduction
2694 *
2695 * This is an optimized implementation of montgomery_reduce
2696 * which uses the comba method to quickly calculate the columns of the
2697 * reduction.
2698 *
2699 * Based on Algorithm 14.32 on pp.601 of HAC.
2700*/
2701static int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
2702{
2703  int     ix, res, olduse;
2704  mp_word W[MP_WARRAY];
2705
2706  /* get old used count */
2707  olduse = x->used;
2708
2709  /* grow a as required */
2710  if (x->alloc < n->used + 1) {
2711    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
2712      return res;
2713    }
2714  }
2715
2716  /* first we have to get the digits of the input into
2717   * an array of double precision words W[...]
2718   */
2719  {
2720    register mp_word *_W;
2721    register mp_digit *tmpx;
2722
2723    /* alias for the W[] array */
2724    _W   = W;
2725
2726    /* alias for the digits of  x*/
2727    tmpx = x->dp;
2728
2729    /* copy the digits of a into W[0..a->used-1] */
2730    for (ix = 0; ix < x->used; ix++) {
2731      *_W++ = *tmpx++;
2732    }
2733
2734    /* zero the high words of W[a->used..m->used*2] */
2735    for (; ix < n->used * 2 + 1; ix++) {
2736      *_W++ = 0;
2737    }
2738  }
2739
2740  /* now we proceed to zero successive digits
2741   * from the least significant upwards
2742   */
2743  for (ix = 0; ix < n->used; ix++) {
2744    /* mu = ai * m' mod b
2745     *
2746     * We avoid a double precision multiplication (which isn't required)
2747     * by casting the value down to a mp_digit.  Note this requires
2748     * that W[ix-1] have  the carry cleared (see after the inner loop)
2749     */
2750    register mp_digit mu;
2751    mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
2752
2753    /* a = a + mu * m * b**i
2754     *
2755     * This is computed in place and on the fly.  The multiplication
2756     * by b**i is handled by offseting which columns the results
2757     * are added to.
2758     *
2759     * Note the comba method normally doesn't handle carries in the
2760     * inner loop In this case we fix the carry from the previous
2761     * column since the Montgomery reduction requires digits of the
2762     * result (so far) [see above] to work.  This is
2763     * handled by fixing up one carry after the inner loop.  The
2764     * carry fixups are done in order so after these loops the
2765     * first m->used words of W[] have the carries fixed
2766     */
2767    {
2768      register int iy;
2769      register mp_digit *tmpn;
2770      register mp_word *_W;
2771
2772      /* alias for the digits of the modulus */
2773      tmpn = n->dp;
2774
2775      /* Alias for the columns set by an offset of ix */
2776      _W = W + ix;
2777
2778      /* inner loop */
2779      for (iy = 0; iy < n->used; iy++) {
2780          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
2781      }
2782    }
2783
2784    /* now fix carry for next digit, W[ix+1] */
2785    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
2786  }
2787
2788  /* now we have to propagate the carries and
2789   * shift the words downward [all those least
2790   * significant digits we zeroed].
2791   */
2792  {
2793    register mp_digit *tmpx;
2794    register mp_word *_W, *_W1;
2795
2796    /* nox fix rest of carries */
2797
2798    /* alias for current word */
2799    _W1 = W + ix;
2800
2801    /* alias for next word, where the carry goes */
2802    _W = W + ++ix;
2803
2804    for (; ix <= n->used * 2 + 1; ix++) {
2805      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
2806    }
2807
2808    /* copy out, A = A/b**n
2809     *
2810     * The result is A/b**n but instead of converting from an
2811     * array of mp_word to mp_digit than calling mp_rshd
2812     * we just copy them in the right order
2813     */
2814
2815    /* alias for destination word */
2816    tmpx = x->dp;
2817
2818    /* alias for shifted double precision result */
2819    _W = W + n->used;
2820
2821    for (ix = 0; ix < n->used + 1; ix++) {
2822      *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
2823    }
2824
2825    /* zero oldused digits, if the input a was larger than
2826     * m->used+1 we'll have to clear the digits
2827     */
2828    for (; ix < olduse; ix++) {
2829      *tmpx++ = 0;
2830    }
2831  }
2832
2833  /* set the max used and clamp */
2834  x->used = n->used + 1;
2835  mp_clamp (x);
2836
2837  /* if A >= m then A = A - m */
2838  if (mp_cmp_mag (x, n) != MP_LT) {
2839    return s_mp_sub (x, n, x);
2840  }
2841  return MP_OKAY;
2842}
2843#endif
2844
2845
2846#ifdef BN_MP_MUL_2_C
2847/* b = a*2 */
2848static int mp_mul_2(mp_int * a, mp_int * b)
2849{
2850  int     x, res, oldused;
2851
2852  /* grow to accommodate result */
2853  if (b->alloc < a->used + 1) {
2854    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2855      return res;
2856    }
2857  }
2858
2859  oldused = b->used;
2860  b->used = a->used;
2861
2862  {
2863    register mp_digit r, rr, *tmpa, *tmpb;
2864
2865    /* alias for source */
2866    tmpa = a->dp;
2867
2868    /* alias for dest */
2869    tmpb = b->dp;
2870
2871    /* carry */
2872    r = 0;
2873    for (x = 0; x < a->used; x++) {
2874
2875      /* get what will be the *next* carry bit from the
2876       * MSB of the current digit
2877       */
2878      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2879
2880      /* now shift up this digit, add in the carry [from the previous] */
2881      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2882
2883      /* copy the carry that would be from the source
2884       * digit into the next iteration
2885       */
2886      r = rr;
2887    }
2888
2889    /* new leading digit? */
2890    if (r != 0) {
2891      /* add a MSB which is always 1 at this point */
2892      *tmpb = 1;
2893      ++(b->used);
2894    }
2895
2896    /* now zero any excess digits on the destination
2897     * that we didn't write to
2898     */
2899    tmpb = b->dp + b->used;
2900    for (x = b->used; x < oldused; x++) {
2901      *tmpb++ = 0;
2902    }
2903  }
2904  b->sign = a->sign;
2905  return MP_OKAY;
2906}
2907#endif
2908
2909
2910#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
2911/*
2912 * shifts with subtractions when the result is greater than b.
2913 *
2914 * The method is slightly modified to shift B unconditionally up to just under
2915 * the leading bit of b.  This saves a lot of multiple precision shifting.
2916 */
2917static int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
2918{
2919  int     x, bits, res;
2920
2921  /* how many bits of last digit does b use */
2922  bits = mp_count_bits (b) % DIGIT_BIT;
2923
2924  if (b->used > 1) {
2925     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
2926        return res;
2927     }
2928  } else {
2929     mp_set(a, 1);
2930     bits = 1;
2931  }
2932
2933
2934  /* now compute C = A * B mod b */
2935  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
2936    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
2937      return res;
2938    }
2939    if (mp_cmp_mag (a, b) != MP_LT) {
2940      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
2941        return res;
2942      }
2943    }
2944  }
2945
2946  return MP_OKAY;
2947}
2948#endif
2949
2950
2951#ifdef BN_MP_EXPTMOD_FAST_C
2952/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
2953 *
2954 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
2955 * The value of k changes based on the size of the exponent.
2956 *
2957 * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
2958 */
2959
2960static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
2961{
2962  mp_int  M[TAB_SIZE], res;
2963  mp_digit buf, mp;
2964  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
2965
2966  /* use a pointer to the reduction algorithm.  This allows us to use
2967   * one of many reduction algorithms without modding the guts of
2968   * the code with if statements everywhere.
2969   */
2970  int     (*redux)(mp_int*,mp_int*,mp_digit);
2971
2972  /* find window size */
2973  x = mp_count_bits (X);
2974  if (x <= 7) {
2975    winsize = 2;
2976  } else if (x <= 36) {
2977    winsize = 3;
2978  } else if (x <= 140) {
2979    winsize = 4;
2980  } else if (x <= 450) {
2981    winsize = 5;
2982  } else if (x <= 1303) {
2983    winsize = 6;
2984  } else if (x <= 3529) {
2985    winsize = 7;
2986  } else {
2987    winsize = 8;
2988  }
2989
2990#ifdef MP_LOW_MEM
2991  if (winsize > 5) {
2992     winsize = 5;
2993  }
2994#endif
2995
2996  /* init M array */
2997  /* init first cell */
2998  if ((err = mp_init(&M[1])) != MP_OKAY) {
2999     return err;
3000  }
3001
3002  /* now init the second half of the array */
3003  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3004    if ((err = mp_init(&M[x])) != MP_OKAY) {
3005      for (y = 1<<(winsize-1); y < x; y++) {
3006        mp_clear (&M[y]);
3007      }
3008      mp_clear(&M[1]);
3009      return err;
3010    }
3011  }
3012
3013  /* determine and setup reduction code */
3014  if (redmode == 0) {
3015#ifdef BN_MP_MONTGOMERY_SETUP_C
3016     /* now setup montgomery  */
3017     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
3018        goto LBL_M;
3019     }
3020#else
3021     err = MP_VAL;
3022     goto LBL_M;
3023#endif
3024
3025     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
3026#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
3027     if (((P->used * 2 + 1) < MP_WARRAY) &&
3028          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
3029        redux = fast_mp_montgomery_reduce;
3030     } else
3031#endif
3032     {
3033#ifdef BN_MP_MONTGOMERY_REDUCE_C
3034        /* use slower baseline Montgomery method */
3035        redux = mp_montgomery_reduce;
3036#else
3037        err = MP_VAL;
3038        goto LBL_M;
3039#endif
3040     }
3041  } else if (redmode == 1) {
3042#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
3043     /* setup DR reduction for moduli of the form B**k - b */
3044     mp_dr_setup(P, &mp);
3045     redux = mp_dr_reduce;
3046#else
3047     err = MP_VAL;
3048     goto LBL_M;
3049#endif
3050  } else {
3051#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
3052     /* setup DR reduction for moduli of the form 2**k - b */
3053     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
3054        goto LBL_M;
3055     }
3056     redux = mp_reduce_2k;
3057#else
3058     err = MP_VAL;
3059     goto LBL_M;
3060#endif
3061  }
3062
3063  /* setup result */
3064  if ((err = mp_init (&res)) != MP_OKAY) {
3065    goto LBL_M;
3066  }
3067
3068  /* create M table
3069   *
3070
3071   *
3072   * The first half of the table is not computed though accept for M[0] and M[1]
3073   */
3074
3075  if (redmode == 0) {
3076#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
3077     /* now we need R mod m */
3078     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
3079       goto LBL_RES;
3080     }
3081#else
3082     err = MP_VAL;
3083     goto LBL_RES;
3084#endif
3085
3086     /* now set M[1] to G * R mod m */
3087     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
3088       goto LBL_RES;
3089     }
3090  } else {
3091     mp_set(&res, 1);
3092     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
3093        goto LBL_RES;
3094     }
3095  }
3096
3097  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
3098  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
3099    goto LBL_RES;
3100  }
3101
3102  for (x = 0; x < (winsize - 1); x++) {
3103    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
3104      goto LBL_RES;
3105    }
3106    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
3107      goto LBL_RES;
3108    }
3109  }
3110
3111  /* create upper table */
3112  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
3113    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
3114      goto LBL_RES;
3115    }
3116    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
3117      goto LBL_RES;
3118    }
3119  }
3120
3121  /* set initial mode and bit cnt */
3122  mode   = 0;
3123  bitcnt = 1;
3124  buf    = 0;
3125  digidx = X->used - 1;
3126  bitcpy = 0;
3127  bitbuf = 0;
3128
3129  for (;;) {
3130    /* grab next digit as required */
3131    if (--bitcnt == 0) {
3132      /* if digidx == -1 we are out of digits so break */
3133      if (digidx == -1) {
3134        break;
3135      }
3136      /* read next digit and reset bitcnt */
3137      buf    = X->dp[digidx--];
3138      bitcnt = (int)DIGIT_BIT;
3139    }
3140
3141    /* grab the next msb from the exponent */
3142    y     = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
3143    buf <<= (mp_digit)1;
3144
3145    /* if the bit is zero and mode == 0 then we ignore it
3146     * These represent the leading zero bits before the first 1 bit
3147     * in the exponent.  Technically this opt is not required but it
3148     * does lower the # of trivial squaring/reductions used
3149     */
3150    if (mode == 0 && y == 0) {
3151      continue;
3152    }
3153
3154    /* if the bit is zero and mode == 1 then we square */
3155    if (mode == 1 && y == 0) {
3156      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3157        goto LBL_RES;
3158      }
3159      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3160        goto LBL_RES;
3161      }
3162      continue;
3163    }
3164
3165    /* else we add it to the window */
3166    bitbuf |= (y << (winsize - ++bitcpy));
3167    mode    = 2;
3168
3169    if (bitcpy == winsize) {
3170      /* ok window is filled so square as required and multiply  */
3171      /* square first */
3172      for (x = 0; x < winsize; x++) {
3173        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3174          goto LBL_RES;
3175        }
3176        if ((err = redux (&res, P, mp)) != MP_OKAY) {
3177          goto LBL_RES;
3178        }
3179      }
3180
3181      /* then multiply */
3182      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
3183        goto LBL_RES;
3184      }
3185      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3186        goto LBL_RES;
3187      }
3188
3189      /* empty window and reset */
3190      bitcpy = 0;
3191      bitbuf = 0;
3192      mode   = 1;
3193    }
3194  }
3195
3196  /* if bits remain then square/multiply */
3197  if (mode == 2 && bitcpy > 0) {
3198    /* square then multiply if the bit is set */
3199    for (x = 0; x < bitcpy; x++) {
3200      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3201        goto LBL_RES;
3202      }
3203      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3204        goto LBL_RES;
3205      }
3206
3207      /* get next bit of the window */
3208      bitbuf <<= 1;
3209      if ((bitbuf & (1 << winsize)) != 0) {
3210        /* then multiply */
3211        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
3212          goto LBL_RES;
3213        }
3214        if ((err = redux (&res, P, mp)) != MP_OKAY) {
3215          goto LBL_RES;
3216        }
3217      }
3218    }
3219  }
3220
3221  if (redmode == 0) {
3222     /* fixup result if Montgomery reduction is used
3223      * recall that any value in a Montgomery system is
3224      * actually multiplied by R mod n.  So we have
3225      * to reduce one more time to cancel out the factor
3226      * of R.
3227      */
3228     if ((err = redux(&res, P, mp)) != MP_OKAY) {
3229       goto LBL_RES;
3230     }
3231  }
3232
3233  /* swap res with Y */
3234  mp_exch (&res, Y);
3235  err = MP_OKAY;
3236LBL_RES:mp_clear (&res);
3237LBL_M:
3238  mp_clear(&M[1]);
3239  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3240    mp_clear (&M[x]);
3241  }
3242  return err;
3243}
3244#endif
3245
3246
3247#ifdef BN_FAST_S_MP_SQR_C
3248/* the jist of squaring...
3249 * you do like mult except the offset of the tmpx [one that
3250 * starts closer to zero] can't equal the offset of tmpy.
3251 * So basically you set up iy like before then you min it with
3252 * (ty-tx) so that it never happens.  You double all those
3253 * you add in the inner loop
3254
3255After that loop you do the squares and add them in.
3256*/
3257
3258static int fast_s_mp_sqr (mp_int * a, mp_int * b)
3259{
3260  int       olduse, res, pa, ix, iz;
3261  mp_digit   W[MP_WARRAY], *tmpx;
3262  mp_word   W1;
3263
3264  /* grow the destination as required */
3265  pa = a->used + a->used;
3266  if (b->alloc < pa) {
3267    if ((res = mp_grow (b, pa)) != MP_OKAY) {
3268      return res;
3269    }
3270  }
3271
3272  /* number of output digits to produce */
3273  W1 = 0;
3274  for (ix = 0; ix < pa; ix++) {
3275      int      tx, ty, iy;
3276      mp_word  _W;
3277      mp_digit *tmpy;
3278
3279      /* clear counter */
3280      _W = 0;
3281
3282      /* get offsets into the two bignums */
3283      ty = MIN(a->used-1, ix);
3284      tx = ix - ty;
3285
3286      /* setup temp aliases */
3287      tmpx = a->dp + tx;
3288      tmpy = a->dp + ty;
3289
3290      /* this is the number of times the loop will iterrate, essentially
3291         while (tx++ < a->used && ty-- >= 0) { ... }
3292       */
3293      iy = MIN(a->used-tx, ty+1);
3294
3295      /* now for squaring tx can never equal ty
3296       * we halve the distance since they approach at a rate of 2x
3297       * and we have to round because odd cases need to be executed
3298       */
3299      iy = MIN(iy, (ty-tx+1)>>1);
3300
3301      /* execute loop */
3302      for (iz = 0; iz < iy; iz++) {
3303         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
3304      }
3305
3306      /* double the inner product and add carry */
3307      _W = _W + _W + W1;
3308
3309      /* even columns have the square term in them */
3310      if ((ix&1) == 0) {
3311         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
3312      }
3313
3314      /* store it */
3315      W[ix] = (mp_digit)(_W & MP_MASK);
3316
3317      /* make next carry */
3318      W1 = _W >> ((mp_word)DIGIT_BIT);
3319  }
3320
3321  /* setup dest */
3322  olduse  = b->used;
3323  b->used = a->used+a->used;
3324
3325  {
3326    mp_digit *tmpb;
3327    tmpb = b->dp;
3328    for (ix = 0; ix < pa; ix++) {
3329      *tmpb++ = W[ix] & MP_MASK;
3330    }
3331
3332    /* clear unused digits [that existed in the old copy of c] */
3333    for (; ix < olduse; ix++) {
3334      *tmpb++ = 0;
3335    }
3336  }
3337  mp_clamp (b);
3338  return MP_OKAY;
3339}
3340#endif
3341
3342
3343#ifdef BN_MP_MUL_D_C
3344/* multiply by a digit */
3345static int
3346mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
3347{
3348  mp_digit u, *tmpa, *tmpc;
3349  mp_word  r;
3350  int      ix, res, olduse;
3351
3352  /* make sure c is big enough to hold a*b */
3353  if (c->alloc < a->used + 1) {
3354    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
3355      return res;
3356    }
3357  }
3358
3359  /* get the original destinations used count */
3360  olduse = c->used;
3361
3362  /* set the sign */
3363  c->sign = a->sign;
3364
3365  /* alias for a->dp [source] */
3366  tmpa = a->dp;
3367
3368  /* alias for c->dp [dest] */
3369  tmpc = c->dp;
3370
3371  /* zero carry */
3372  u = 0;
3373
3374  /* compute columns */
3375  for (ix = 0; ix < a->used; ix++) {
3376    /* compute product and carry sum for this term */
3377    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
3378
3379    /* mask off higher bits to get a single digit */
3380    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
3381
3382    /* send carry into next iteration */
3383    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
3384  }
3385
3386  /* store final carry [if any] and increment ix offset  */
3387  *tmpc++ = u;
3388  ++ix;
3389
3390  /* now zero digits above the top */
3391  while (ix++ < olduse) {
3392     *tmpc++ = 0;
3393  }
3394
3395  /* set used count */
3396  c->used = a->used + 1;
3397  mp_clamp(c);
3398
3399  return MP_OKAY;
3400}
3401#endif
3402