libtommath.c revision 8d520ff1dc2da35cdca849e982051b86468016d8
1/*
2 * Minimal code for RSA support from LibTomMath 0.41
3 * http://libtom.org/
4 * http://libtom.org/files/ltm-0.41.tar.bz2
5 * This library was released in public domain by Tom St Denis.
6 *
7 * The combination in this file may not use all of the optimized algorithms
8 * from LibTomMath and may be considerable slower than the LibTomMath with its
9 * default settings. The main purpose of having this version here is to make it
10 * easier to build bignum.c wrapper without having to install and build an
11 * external library.
12 *
13 * If CONFIG_INTERNAL_LIBTOMMATH is defined, bignum.c includes this
14 * libtommath.c file instead of using the external LibTomMath library.
15 */
16
17#ifndef CHAR_BIT
18#define CHAR_BIT 8
19#endif
20
21#define BN_MP_INVMOD_C
22#define BN_S_MP_EXPTMOD_C /* Note: #undef in tommath_superclass.h; this would
23			   * require BN_MP_EXPTMOD_FAST_C instead */
24#define BN_S_MP_MUL_DIGS_C
25#define BN_MP_INVMOD_SLOW_C
26#define BN_S_MP_SQR_C
27#define BN_S_MP_MUL_HIGH_DIGS_C /* Note: #undef in tommath_superclass.h; this
28				 * would require other than mp_reduce */
29
30#ifdef LTM_FAST
31
32/* Use faster div at the cost of about 1 kB */
33#define BN_MP_MUL_D_C
34
35/* Include faster exptmod (Montgomery) at the cost of about 2.5 kB in code */
36#define BN_MP_EXPTMOD_FAST_C
37#define BN_MP_MONTGOMERY_SETUP_C
38#define BN_FAST_MP_MONTGOMERY_REDUCE_C
39#define BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
40#define BN_MP_MUL_2_C
41
42/* Include faster sqr at the cost of about 0.5 kB in code */
43#define BN_FAST_S_MP_SQR_C
44
45#else /* LTM_FAST */
46
47#define BN_MP_DIV_SMALL
48#define BN_MP_INIT_MULTI_C
49#define BN_MP_CLEAR_MULTI_C
50#define BN_MP_ABS_C
51#endif /* LTM_FAST */
52
53/* Current uses do not require support for negative exponent in exptmod, so we
54 * can save about 1.5 kB in leaving out invmod. */
55#define LTM_NO_NEG_EXP
56
57/* from tommath.h */
58
59#ifndef MIN
60   #define MIN(x,y) ((x)<(y)?(x):(y))
61#endif
62
63#ifndef MAX
64   #define MAX(x,y) ((x)>(y)?(x):(y))
65#endif
66
67#define  OPT_CAST(x)
68
69typedef unsigned long mp_digit;
70typedef u64 mp_word;
71
72#define DIGIT_BIT          28
73#define MP_28BIT
74
75
76#define XMALLOC  os_malloc
77#define XFREE    os_free
78#define XREALLOC os_realloc
79
80
81#define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
82
83#define MP_LT        -1   /* less than */
84#define MP_EQ         0   /* equal to */
85#define MP_GT         1   /* greater than */
86
87#define MP_ZPOS       0   /* positive integer */
88#define MP_NEG        1   /* negative */
89
90#define MP_OKAY       0   /* ok result */
91#define MP_MEM        -2  /* out of mem */
92#define MP_VAL        -3  /* invalid input */
93
94#define MP_YES        1   /* yes response */
95#define MP_NO         0   /* no response */
96
97typedef int           mp_err;
98
99/* define this to use lower memory usage routines (exptmods mostly) */
100#define MP_LOW_MEM
101
102/* default precision */
103#ifndef MP_PREC
104   #ifndef MP_LOW_MEM
105      #define MP_PREC                 32     /* default digits of precision */
106   #else
107      #define MP_PREC                 8      /* default digits of precision */
108   #endif
109#endif
110
111/* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
112#define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1))
113
114/* the infamous mp_int structure */
115typedef struct  {
116    int used, alloc, sign;
117    mp_digit *dp;
118} mp_int;
119
120
121/* ---> Basic Manipulations <--- */
122#define mp_iszero(a) (((a)->used == 0) ? MP_YES : MP_NO)
123#define mp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? MP_YES : MP_NO)
124#define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? MP_YES : MP_NO)
125
126
127/* prototypes for copied functions */
128#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
129static int s_mp_exptmod(mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
130static int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);
131static int s_mp_sqr(mp_int * a, mp_int * b);
132static int s_mp_mul_high_digs(mp_int * a, mp_int * b, mp_int * c, int digs);
133
134static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);
135
136#ifdef BN_MP_INIT_MULTI_C
137static int mp_init_multi(mp_int *mp, ...);
138#endif
139#ifdef BN_MP_CLEAR_MULTI_C
140static void mp_clear_multi(mp_int *mp, ...);
141#endif
142static int mp_lshd(mp_int * a, int b);
143static void mp_set(mp_int * a, mp_digit b);
144static void mp_clamp(mp_int * a);
145static void mp_exch(mp_int * a, mp_int * b);
146static void mp_rshd(mp_int * a, int b);
147static void mp_zero(mp_int * a);
148static int mp_mod_2d(mp_int * a, int b, mp_int * c);
149static int mp_div_2d(mp_int * a, int b, mp_int * c, mp_int * d);
150static int mp_init_copy(mp_int * a, mp_int * b);
151static int mp_mul_2d(mp_int * a, int b, mp_int * c);
152#ifndef LTM_NO_NEG_EXP
153static int mp_div_2(mp_int * a, mp_int * b);
154static int mp_invmod(mp_int * a, mp_int * b, mp_int * c);
155static int mp_invmod_slow(mp_int * a, mp_int * b, mp_int * c);
156#endif /* LTM_NO_NEG_EXP */
157static int mp_copy(mp_int * a, mp_int * b);
158static int mp_count_bits(mp_int * a);
159static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d);
160static int mp_mod(mp_int * a, mp_int * b, mp_int * c);
161static int mp_grow(mp_int * a, int size);
162static int mp_cmp_mag(mp_int * a, mp_int * b);
163#ifdef BN_MP_ABS_C
164static int mp_abs(mp_int * a, mp_int * b);
165#endif
166static int mp_sqr(mp_int * a, mp_int * b);
167static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
168static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
169static int mp_2expt(mp_int * a, int b);
170static int mp_reduce_setup(mp_int * a, mp_int * b);
171static int mp_reduce(mp_int * x, mp_int * m, mp_int * mu);
172static int mp_init_size(mp_int * a, int size);
173#ifdef BN_MP_EXPTMOD_FAST_C
174static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
175#endif /* BN_MP_EXPTMOD_FAST_C */
176#ifdef BN_FAST_S_MP_SQR_C
177static int fast_s_mp_sqr (mp_int * a, mp_int * b);
178#endif /* BN_FAST_S_MP_SQR_C */
179#ifdef BN_MP_MUL_D_C
180static int mp_mul_d (mp_int * a, mp_digit b, mp_int * c);
181#endif /* BN_MP_MUL_D_C */
182
183
184
185/* functions from bn_<func name>.c */
186
187
188/* reverse an array, used for radix code */
189static void bn_reverse (unsigned char *s, int len)
190{
191  int     ix, iy;
192  unsigned char t;
193
194  ix = 0;
195  iy = len - 1;
196  while (ix < iy) {
197    t     = s[ix];
198    s[ix] = s[iy];
199    s[iy] = t;
200    ++ix;
201    --iy;
202  }
203}
204
205
206/* low level addition, based on HAC pp.594, Algorithm 14.7 */
207static int s_mp_add (mp_int * a, mp_int * b, mp_int * c)
208{
209  mp_int *x;
210  int     olduse, res, min, max;
211
212  /* find sizes, we let |a| <= |b| which means we have to sort
213   * them.  "x" will point to the input with the most digits
214   */
215  if (a->used > b->used) {
216    min = b->used;
217    max = a->used;
218    x = a;
219  } else {
220    min = a->used;
221    max = b->used;
222    x = b;
223  }
224
225  /* init result */
226  if (c->alloc < max + 1) {
227    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
228      return res;
229    }
230  }
231
232  /* get old used digit count and set new one */
233  olduse = c->used;
234  c->used = max + 1;
235
236  {
237    register mp_digit u, *tmpa, *tmpb, *tmpc;
238    register int i;
239
240    /* alias for digit pointers */
241
242    /* first input */
243    tmpa = a->dp;
244
245    /* second input */
246    tmpb = b->dp;
247
248    /* destination */
249    tmpc = c->dp;
250
251    /* zero the carry */
252    u = 0;
253    for (i = 0; i < min; i++) {
254      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
255      *tmpc = *tmpa++ + *tmpb++ + u;
256
257      /* U = carry bit of T[i] */
258      u = *tmpc >> ((mp_digit)DIGIT_BIT);
259
260      /* take away carry bit from T[i] */
261      *tmpc++ &= MP_MASK;
262    }
263
264    /* now copy higher words if any, that is in A+B
265     * if A or B has more digits add those in
266     */
267    if (min != max) {
268      for (; i < max; i++) {
269        /* T[i] = X[i] + U */
270        *tmpc = x->dp[i] + u;
271
272        /* U = carry bit of T[i] */
273        u = *tmpc >> ((mp_digit)DIGIT_BIT);
274
275        /* take away carry bit from T[i] */
276        *tmpc++ &= MP_MASK;
277      }
278    }
279
280    /* add carry */
281    *tmpc++ = u;
282
283    /* clear digits above oldused */
284    for (i = c->used; i < olduse; i++) {
285      *tmpc++ = 0;
286    }
287  }
288
289  mp_clamp (c);
290  return MP_OKAY;
291}
292
293
294/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
295static int s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
296{
297  int     olduse, res, min, max;
298
299  /* find sizes */
300  min = b->used;
301  max = a->used;
302
303  /* init result */
304  if (c->alloc < max) {
305    if ((res = mp_grow (c, max)) != MP_OKAY) {
306      return res;
307    }
308  }
309  olduse = c->used;
310  c->used = max;
311
312  {
313    register mp_digit u, *tmpa, *tmpb, *tmpc;
314    register int i;
315
316    /* alias for digit pointers */
317    tmpa = a->dp;
318    tmpb = b->dp;
319    tmpc = c->dp;
320
321    /* set carry to zero */
322    u = 0;
323    for (i = 0; i < min; i++) {
324      /* T[i] = A[i] - B[i] - U */
325      *tmpc = *tmpa++ - *tmpb++ - u;
326
327      /* U = carry bit of T[i]
328       * Note this saves performing an AND operation since
329       * if a carry does occur it will propagate all the way to the
330       * MSB.  As a result a single shift is enough to get the carry
331       */
332      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
333
334      /* Clear carry from T[i] */
335      *tmpc++ &= MP_MASK;
336    }
337
338    /* now copy higher words if any, e.g. if A has more digits than B  */
339    for (; i < max; i++) {
340      /* T[i] = A[i] - U */
341      *tmpc = *tmpa++ - u;
342
343      /* U = carry bit of T[i] */
344      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
345
346      /* Clear carry from T[i] */
347      *tmpc++ &= MP_MASK;
348    }
349
350    /* clear digits above used (since we may not have grown result above) */
351    for (i = c->used; i < olduse; i++) {
352      *tmpc++ = 0;
353    }
354  }
355
356  mp_clamp (c);
357  return MP_OKAY;
358}
359
360
361/* init a new mp_int */
362static int mp_init (mp_int * a)
363{
364  int i;
365
366  /* allocate memory required and clear it */
367  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
368  if (a->dp == NULL) {
369    return MP_MEM;
370  }
371
372  /* set the digits to zero */
373  for (i = 0; i < MP_PREC; i++) {
374      a->dp[i] = 0;
375  }
376
377  /* set the used to zero, allocated digits to the default precision
378   * and sign to positive */
379  a->used  = 0;
380  a->alloc = MP_PREC;
381  a->sign  = MP_ZPOS;
382
383  return MP_OKAY;
384}
385
386
387/* clear one (frees)  */
388static void mp_clear (mp_int * a)
389{
390  int i;
391
392  /* only do anything if a hasn't been freed previously */
393  if (a->dp != NULL) {
394    /* first zero the digits */
395    for (i = 0; i < a->used; i++) {
396        a->dp[i] = 0;
397    }
398
399    /* free ram */
400    XFREE(a->dp);
401
402    /* reset members to make debugging easier */
403    a->dp    = NULL;
404    a->alloc = a->used = 0;
405    a->sign  = MP_ZPOS;
406  }
407}
408
409
410/* high level addition (handles signs) */
411static int mp_add (mp_int * a, mp_int * b, mp_int * c)
412{
413  int     sa, sb, res;
414
415  /* get sign of both inputs */
416  sa = a->sign;
417  sb = b->sign;
418
419  /* handle two cases, not four */
420  if (sa == sb) {
421    /* both positive or both negative */
422    /* add their magnitudes, copy the sign */
423    c->sign = sa;
424    res = s_mp_add (a, b, c);
425  } else {
426    /* one positive, the other negative */
427    /* subtract the one with the greater magnitude from */
428    /* the one of the lesser magnitude.  The result gets */
429    /* the sign of the one with the greater magnitude. */
430    if (mp_cmp_mag (a, b) == MP_LT) {
431      c->sign = sb;
432      res = s_mp_sub (b, a, c);
433    } else {
434      c->sign = sa;
435      res = s_mp_sub (a, b, c);
436    }
437  }
438  return res;
439}
440
441
442/* high level subtraction (handles signs) */
443static int mp_sub (mp_int * a, mp_int * b, mp_int * c)
444{
445  int     sa, sb, res;
446
447  sa = a->sign;
448  sb = b->sign;
449
450  if (sa != sb) {
451    /* subtract a negative from a positive, OR */
452    /* subtract a positive from a negative. */
453    /* In either case, ADD their magnitudes, */
454    /* and use the sign of the first number. */
455    c->sign = sa;
456    res = s_mp_add (a, b, c);
457  } else {
458    /* subtract a positive from a positive, OR */
459    /* subtract a negative from a negative. */
460    /* First, take the difference between their */
461    /* magnitudes, then... */
462    if (mp_cmp_mag (a, b) != MP_LT) {
463      /* Copy the sign from the first */
464      c->sign = sa;
465      /* The first has a larger or equal magnitude */
466      res = s_mp_sub (a, b, c);
467    } else {
468      /* The result has the *opposite* sign from */
469      /* the first number. */
470      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
471      /* The second has a larger magnitude */
472      res = s_mp_sub (b, a, c);
473    }
474  }
475  return res;
476}
477
478
479/* high level multiplication (handles sign) */
480static int mp_mul (mp_int * a, mp_int * b, mp_int * c)
481{
482  int     res, neg;
483  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
484
485  /* use Toom-Cook? */
486#ifdef BN_MP_TOOM_MUL_C
487  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
488    res = mp_toom_mul(a, b, c);
489  } else
490#endif
491#ifdef BN_MP_KARATSUBA_MUL_C
492  /* use Karatsuba? */
493  if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
494    res = mp_karatsuba_mul (a, b, c);
495  } else
496#endif
497  {
498    /* can we use the fast multiplier?
499     *
500     * The fast multiplier can be used if the output will
501     * have less than MP_WARRAY digits and the number of
502     * digits won't affect carry propagation
503     */
504#ifdef BN_FAST_S_MP_MUL_DIGS_C
505    int     digs = a->used + b->used + 1;
506
507    if ((digs < MP_WARRAY) &&
508        MIN(a->used, b->used) <=
509        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
510      res = fast_s_mp_mul_digs (a, b, c, digs);
511    } else
512#endif
513#ifdef BN_S_MP_MUL_DIGS_C
514      res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
515#else
516#error mp_mul could fail
517      res = MP_VAL;
518#endif
519
520  }
521  c->sign = (c->used > 0) ? neg : MP_ZPOS;
522  return res;
523}
524
525
526/* d = a * b (mod c) */
527static int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
528{
529  int     res;
530  mp_int  t;
531
532  if ((res = mp_init (&t)) != MP_OKAY) {
533    return res;
534  }
535
536  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
537    mp_clear (&t);
538    return res;
539  }
540  res = mp_mod (&t, c, d);
541  mp_clear (&t);
542  return res;
543}
544
545
546/* c = a mod b, 0 <= c < b */
547static int mp_mod (mp_int * a, mp_int * b, mp_int * c)
548{
549  mp_int  t;
550  int     res;
551
552  if ((res = mp_init (&t)) != MP_OKAY) {
553    return res;
554  }
555
556  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
557    mp_clear (&t);
558    return res;
559  }
560
561  if (t.sign != b->sign) {
562    res = mp_add (b, &t, c);
563  } else {
564    res = MP_OKAY;
565    mp_exch (&t, c);
566  }
567
568  mp_clear (&t);
569  return res;
570}
571
572
573/* this is a shell function that calls either the normal or Montgomery
574 * exptmod functions.  Originally the call to the montgomery code was
575 * embedded in the normal function but that wasted alot of stack space
576 * for nothing (since 99% of the time the Montgomery code would be called)
577 */
578static int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
579{
580  int dr;
581
582  /* modulus P must be positive */
583  if (P->sign == MP_NEG) {
584     return MP_VAL;
585  }
586
587  /* if exponent X is negative we have to recurse */
588  if (X->sign == MP_NEG) {
589#ifdef LTM_NO_NEG_EXP
590        return MP_VAL;
591#else /* LTM_NO_NEG_EXP */
592#ifdef BN_MP_INVMOD_C
593     mp_int tmpG, tmpX;
594     int err;
595
596     /* first compute 1/G mod P */
597     if ((err = mp_init(&tmpG)) != MP_OKAY) {
598        return err;
599     }
600     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
601        mp_clear(&tmpG);
602        return err;
603     }
604
605     /* now get |X| */
606     if ((err = mp_init(&tmpX)) != MP_OKAY) {
607        mp_clear(&tmpG);
608        return err;
609     }
610     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
611        mp_clear_multi(&tmpG, &tmpX, NULL);
612        return err;
613     }
614
615     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
616     err = mp_exptmod(&tmpG, &tmpX, P, Y);
617     mp_clear_multi(&tmpG, &tmpX, NULL);
618     return err;
619#else
620#error mp_exptmod would always fail
621     /* no invmod */
622     return MP_VAL;
623#endif
624#endif /* LTM_NO_NEG_EXP */
625  }
626
627/* modified diminished radix reduction */
628#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
629  if (mp_reduce_is_2k_l(P) == MP_YES) {
630     return s_mp_exptmod(G, X, P, Y, 1);
631  }
632#endif
633
634#ifdef BN_MP_DR_IS_MODULUS_C
635  /* is it a DR modulus? */
636  dr = mp_dr_is_modulus(P);
637#else
638  /* default to no */
639  dr = 0;
640#endif
641
642#ifdef BN_MP_REDUCE_IS_2K_C
643  /* if not, is it a unrestricted DR modulus? */
644  if (dr == 0) {
645     dr = mp_reduce_is_2k(P) << 1;
646  }
647#endif
648
649  /* if the modulus is odd or dr != 0 use the montgomery method */
650#ifdef BN_MP_EXPTMOD_FAST_C
651  if (mp_isodd (P) == 1 || dr !=  0) {
652    return mp_exptmod_fast (G, X, P, Y, dr);
653  } else {
654#endif
655#ifdef BN_S_MP_EXPTMOD_C
656    /* otherwise use the generic Barrett reduction technique */
657    return s_mp_exptmod (G, X, P, Y, 0);
658#else
659#error mp_exptmod could fail
660    /* no exptmod for evens */
661    return MP_VAL;
662#endif
663#ifdef BN_MP_EXPTMOD_FAST_C
664  }
665#endif
666}
667
668
669/* compare two ints (signed)*/
670static int mp_cmp (mp_int * a, mp_int * b)
671{
672  /* compare based on sign */
673  if (a->sign != b->sign) {
674     if (a->sign == MP_NEG) {
675        return MP_LT;
676     } else {
677        return MP_GT;
678     }
679  }
680
681  /* compare digits */
682  if (a->sign == MP_NEG) {
683     /* if negative compare opposite direction */
684     return mp_cmp_mag(b, a);
685  } else {
686     return mp_cmp_mag(a, b);
687  }
688}
689
690
691/* compare a digit */
692static int mp_cmp_d(mp_int * a, mp_digit b)
693{
694  /* compare based on sign */
695  if (a->sign == MP_NEG) {
696    return MP_LT;
697  }
698
699  /* compare based on magnitude */
700  if (a->used > 1) {
701    return MP_GT;
702  }
703
704  /* compare the only digit of a to b */
705  if (a->dp[0] > b) {
706    return MP_GT;
707  } else if (a->dp[0] < b) {
708    return MP_LT;
709  } else {
710    return MP_EQ;
711  }
712}
713
714
715#ifndef LTM_NO_NEG_EXP
716/* hac 14.61, pp608 */
717static int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
718{
719  /* b cannot be negative */
720  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
721    return MP_VAL;
722  }
723
724#ifdef BN_FAST_MP_INVMOD_C
725  /* if the modulus is odd we can use a faster routine instead */
726  if (mp_isodd (b) == 1) {
727    return fast_mp_invmod (a, b, c);
728  }
729#endif
730
731#ifdef BN_MP_INVMOD_SLOW_C
732  return mp_invmod_slow(a, b, c);
733#endif
734
735#ifndef BN_FAST_MP_INVMOD_C
736#ifndef BN_MP_INVMOD_SLOW_C
737#error mp_invmod would always fail
738#endif
739#endif
740  return MP_VAL;
741}
742#endif /* LTM_NO_NEG_EXP */
743
744
745/* get the size for an unsigned equivalent */
746static int mp_unsigned_bin_size (mp_int * a)
747{
748  int     size = mp_count_bits (a);
749  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
750}
751
752
753#ifndef LTM_NO_NEG_EXP
754/* hac 14.61, pp608 */
755static int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
756{
757  mp_int  x, y, u, v, A, B, C, D;
758  int     res;
759
760  /* b cannot be negative */
761  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
762    return MP_VAL;
763  }
764
765  /* init temps */
766  if ((res = mp_init_multi(&x, &y, &u, &v,
767                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
768     return res;
769  }
770
771  /* x = a, y = b */
772  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
773      goto LBL_ERR;
774  }
775  if ((res = mp_copy (b, &y)) != MP_OKAY) {
776    goto LBL_ERR;
777  }
778
779  /* 2. [modified] if x,y are both even then return an error! */
780  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
781    res = MP_VAL;
782    goto LBL_ERR;
783  }
784
785  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
786  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
787    goto LBL_ERR;
788  }
789  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
790    goto LBL_ERR;
791  }
792  mp_set (&A, 1);
793  mp_set (&D, 1);
794
795top:
796  /* 4.  while u is even do */
797  while (mp_iseven (&u) == 1) {
798    /* 4.1 u = u/2 */
799    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
800      goto LBL_ERR;
801    }
802    /* 4.2 if A or B is odd then */
803    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
804      /* A = (A+y)/2, B = (B-x)/2 */
805      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
806         goto LBL_ERR;
807      }
808      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
809         goto LBL_ERR;
810      }
811    }
812    /* A = A/2, B = B/2 */
813    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
814      goto LBL_ERR;
815    }
816    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
817      goto LBL_ERR;
818    }
819  }
820
821  /* 5.  while v is even do */
822  while (mp_iseven (&v) == 1) {
823    /* 5.1 v = v/2 */
824    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
825      goto LBL_ERR;
826    }
827    /* 5.2 if C or D is odd then */
828    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
829      /* C = (C+y)/2, D = (D-x)/2 */
830      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
831         goto LBL_ERR;
832      }
833      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
834         goto LBL_ERR;
835      }
836    }
837    /* C = C/2, D = D/2 */
838    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
839      goto LBL_ERR;
840    }
841    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
842      goto LBL_ERR;
843    }
844  }
845
846  /* 6.  if u >= v then */
847  if (mp_cmp (&u, &v) != MP_LT) {
848    /* u = u - v, A = A - C, B = B - D */
849    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
850      goto LBL_ERR;
851    }
852
853    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
854      goto LBL_ERR;
855    }
856
857    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
858      goto LBL_ERR;
859    }
860  } else {
861    /* v - v - u, C = C - A, D = D - B */
862    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
863      goto LBL_ERR;
864    }
865
866    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
867      goto LBL_ERR;
868    }
869
870    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
871      goto LBL_ERR;
872    }
873  }
874
875  /* if not zero goto step 4 */
876  if (mp_iszero (&u) == 0)
877    goto top;
878
879  /* now a = C, b = D, gcd == g*v */
880
881  /* if v != 1 then there is no inverse */
882  if (mp_cmp_d (&v, 1) != MP_EQ) {
883    res = MP_VAL;
884    goto LBL_ERR;
885  }
886
887  /* if its too low */
888  while (mp_cmp_d(&C, 0) == MP_LT) {
889      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
890         goto LBL_ERR;
891      }
892  }
893
894  /* too big */
895  while (mp_cmp_mag(&C, b) != MP_LT) {
896      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
897         goto LBL_ERR;
898      }
899  }
900
901  /* C is now the inverse */
902  mp_exch (&C, c);
903  res = MP_OKAY;
904LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
905  return res;
906}
907#endif /* LTM_NO_NEG_EXP */
908
909
910/* compare maginitude of two ints (unsigned) */
911static int mp_cmp_mag (mp_int * a, mp_int * b)
912{
913  int     n;
914  mp_digit *tmpa, *tmpb;
915
916  /* compare based on # of non-zero digits */
917  if (a->used > b->used) {
918    return MP_GT;
919  }
920
921  if (a->used < b->used) {
922    return MP_LT;
923  }
924
925  /* alias for a */
926  tmpa = a->dp + (a->used - 1);
927
928  /* alias for b */
929  tmpb = b->dp + (a->used - 1);
930
931  /* compare based on digits  */
932  for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
933    if (*tmpa > *tmpb) {
934      return MP_GT;
935    }
936
937    if (*tmpa < *tmpb) {
938      return MP_LT;
939    }
940  }
941  return MP_EQ;
942}
943
944
945/* reads a unsigned char array, assumes the msb is stored first [big endian] */
946static int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
947{
948  int     res;
949
950  /* make sure there are at least two digits */
951  if (a->alloc < 2) {
952     if ((res = mp_grow(a, 2)) != MP_OKAY) {
953        return res;
954     }
955  }
956
957  /* zero the int */
958  mp_zero (a);
959
960  /* read the bytes in */
961  while (c-- > 0) {
962    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
963      return res;
964    }
965
966#ifndef MP_8BIT
967      a->dp[0] |= *b++;
968      a->used += 1;
969#else
970      a->dp[0] = (*b & MP_MASK);
971      a->dp[1] |= ((*b++ >> 7U) & 1);
972      a->used += 2;
973#endif
974  }
975  mp_clamp (a);
976  return MP_OKAY;
977}
978
979
980/* store in unsigned [big endian] format */
981static int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
982{
983  int     x, res;
984  mp_int  t;
985
986  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
987    return res;
988  }
989
990  x = 0;
991  while (mp_iszero (&t) == 0) {
992#ifndef MP_8BIT
993      b[x++] = (unsigned char) (t.dp[0] & 255);
994#else
995      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
996#endif
997    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
998      mp_clear (&t);
999      return res;
1000    }
1001  }
1002  bn_reverse (b, x);
1003  mp_clear (&t);
1004  return MP_OKAY;
1005}
1006
1007
1008/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
1009static int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
1010{
1011  mp_digit D, r, rr;
1012  int     x, res;
1013  mp_int  t;
1014
1015
1016  /* if the shift count is <= 0 then we do no work */
1017  if (b <= 0) {
1018    res = mp_copy (a, c);
1019    if (d != NULL) {
1020      mp_zero (d);
1021    }
1022    return res;
1023  }
1024
1025  if ((res = mp_init (&t)) != MP_OKAY) {
1026    return res;
1027  }
1028
1029  /* get the remainder */
1030  if (d != NULL) {
1031    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
1032      mp_clear (&t);
1033      return res;
1034    }
1035  }
1036
1037  /* copy */
1038  if ((res = mp_copy (a, c)) != MP_OKAY) {
1039    mp_clear (&t);
1040    return res;
1041  }
1042
1043  /* shift by as many digits in the bit count */
1044  if (b >= (int)DIGIT_BIT) {
1045    mp_rshd (c, b / DIGIT_BIT);
1046  }
1047
1048  /* shift any bit count < DIGIT_BIT */
1049  D = (mp_digit) (b % DIGIT_BIT);
1050  if (D != 0) {
1051    register mp_digit *tmpc, mask, shift;
1052
1053    /* mask */
1054    mask = (((mp_digit)1) << D) - 1;
1055
1056    /* shift for lsb */
1057    shift = DIGIT_BIT - D;
1058
1059    /* alias */
1060    tmpc = c->dp + (c->used - 1);
1061
1062    /* carry */
1063    r = 0;
1064    for (x = c->used - 1; x >= 0; x--) {
1065      /* get the lower  bits of this word in a temp */
1066      rr = *tmpc & mask;
1067
1068      /* shift the current word and mix in the carry bits from the previous word */
1069      *tmpc = (*tmpc >> D) | (r << shift);
1070      --tmpc;
1071
1072      /* set the carry to the carry bits of the current word found above */
1073      r = rr;
1074    }
1075  }
1076  mp_clamp (c);
1077  if (d != NULL) {
1078    mp_exch (&t, d);
1079  }
1080  mp_clear (&t);
1081  return MP_OKAY;
1082}
1083
1084
1085static int mp_init_copy (mp_int * a, mp_int * b)
1086{
1087  int     res;
1088
1089  if ((res = mp_init (a)) != MP_OKAY) {
1090    return res;
1091  }
1092  return mp_copy (b, a);
1093}
1094
1095
1096/* set to zero */
1097static void mp_zero (mp_int * a)
1098{
1099  int       n;
1100  mp_digit *tmp;
1101
1102  a->sign = MP_ZPOS;
1103  a->used = 0;
1104
1105  tmp = a->dp;
1106  for (n = 0; n < a->alloc; n++) {
1107     *tmp++ = 0;
1108  }
1109}
1110
1111
1112/* copy, b = a */
1113static int mp_copy (mp_int * a, mp_int * b)
1114{
1115  int     res, n;
1116
1117  /* if dst == src do nothing */
1118  if (a == b) {
1119    return MP_OKAY;
1120  }
1121
1122  /* grow dest */
1123  if (b->alloc < a->used) {
1124     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1125        return res;
1126     }
1127  }
1128
1129  /* zero b and copy the parameters over */
1130  {
1131    register mp_digit *tmpa, *tmpb;
1132
1133    /* pointer aliases */
1134
1135    /* source */
1136    tmpa = a->dp;
1137
1138    /* destination */
1139    tmpb = b->dp;
1140
1141    /* copy all the digits */
1142    for (n = 0; n < a->used; n++) {
1143      *tmpb++ = *tmpa++;
1144    }
1145
1146    /* clear high digits */
1147    for (; n < b->used; n++) {
1148      *tmpb++ = 0;
1149    }
1150  }
1151
1152  /* copy used count and sign */
1153  b->used = a->used;
1154  b->sign = a->sign;
1155  return MP_OKAY;
1156}
1157
1158
1159/* shift right a certain amount of digits */
1160static void mp_rshd (mp_int * a, int b)
1161{
1162  int     x;
1163
1164  /* if b <= 0 then ignore it */
1165  if (b <= 0) {
1166    return;
1167  }
1168
1169  /* if b > used then simply zero it and return */
1170  if (a->used <= b) {
1171    mp_zero (a);
1172    return;
1173  }
1174
1175  {
1176    register mp_digit *bottom, *top;
1177
1178    /* shift the digits down */
1179
1180    /* bottom */
1181    bottom = a->dp;
1182
1183    /* top [offset into digits] */
1184    top = a->dp + b;
1185
1186    /* this is implemented as a sliding window where
1187     * the window is b-digits long and digits from
1188     * the top of the window are copied to the bottom
1189     *
1190     * e.g.
1191
1192     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
1193                 /\                   |      ---->
1194                  \-------------------/      ---->
1195     */
1196    for (x = 0; x < (a->used - b); x++) {
1197      *bottom++ = *top++;
1198    }
1199
1200    /* zero the top digits */
1201    for (; x < a->used; x++) {
1202      *bottom++ = 0;
1203    }
1204  }
1205
1206  /* remove excess digits */
1207  a->used -= b;
1208}
1209
1210
1211/* swap the elements of two integers, for cases where you can't simply swap the
1212 * mp_int pointers around
1213 */
1214static void mp_exch (mp_int * a, mp_int * b)
1215{
1216  mp_int  t;
1217
1218  t  = *a;
1219  *a = *b;
1220  *b = t;
1221}
1222
1223
1224/* trim unused digits
1225 *
1226 * This is used to ensure that leading zero digits are
1227 * trimed and the leading "used" digit will be non-zero
1228 * Typically very fast.  Also fixes the sign if there
1229 * are no more leading digits
1230 */
1231static void mp_clamp (mp_int * a)
1232{
1233  /* decrease used while the most significant digit is
1234   * zero.
1235   */
1236  while (a->used > 0 && a->dp[a->used - 1] == 0) {
1237    --(a->used);
1238  }
1239
1240  /* reset the sign flag if used == 0 */
1241  if (a->used == 0) {
1242    a->sign = MP_ZPOS;
1243  }
1244}
1245
1246
1247/* grow as required */
1248static int mp_grow (mp_int * a, int size)
1249{
1250  int     i;
1251  mp_digit *tmp;
1252
1253  /* if the alloc size is smaller alloc more ram */
1254  if (a->alloc < size) {
1255    /* ensure there are always at least MP_PREC digits extra on top */
1256    size += (MP_PREC * 2) - (size % MP_PREC);
1257
1258    /* reallocate the array a->dp
1259     *
1260     * We store the return in a temporary variable
1261     * in case the operation failed we don't want
1262     * to overwrite the dp member of a.
1263     */
1264    tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
1265    if (tmp == NULL) {
1266      /* reallocation failed but "a" is still valid [can be freed] */
1267      return MP_MEM;
1268    }
1269
1270    /* reallocation succeeded so set a->dp */
1271    a->dp = tmp;
1272
1273    /* zero excess digits */
1274    i        = a->alloc;
1275    a->alloc = size;
1276    for (; i < a->alloc; i++) {
1277      a->dp[i] = 0;
1278    }
1279  }
1280  return MP_OKAY;
1281}
1282
1283
1284#ifdef BN_MP_ABS_C
1285/* b = |a|
1286 *
1287 * Simple function copies the input and fixes the sign to positive
1288 */
1289static int mp_abs (mp_int * a, mp_int * b)
1290{
1291  int     res;
1292
1293  /* copy a to b */
1294  if (a != b) {
1295     if ((res = mp_copy (a, b)) != MP_OKAY) {
1296       return res;
1297     }
1298  }
1299
1300  /* force the sign of b to positive */
1301  b->sign = MP_ZPOS;
1302
1303  return MP_OKAY;
1304}
1305#endif
1306
1307
1308/* set to a digit */
1309static void mp_set (mp_int * a, mp_digit b)
1310{
1311  mp_zero (a);
1312  a->dp[0] = b & MP_MASK;
1313  a->used  = (a->dp[0] != 0) ? 1 : 0;
1314}
1315
1316
1317#ifndef LTM_NO_NEG_EXP
1318/* b = a/2 */
1319static int mp_div_2(mp_int * a, mp_int * b)
1320{
1321  int     x, res, oldused;
1322
1323  /* copy */
1324  if (b->alloc < a->used) {
1325    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1326      return res;
1327    }
1328  }
1329
1330  oldused = b->used;
1331  b->used = a->used;
1332  {
1333    register mp_digit r, rr, *tmpa, *tmpb;
1334
1335    /* source alias */
1336    tmpa = a->dp + b->used - 1;
1337
1338    /* dest alias */
1339    tmpb = b->dp + b->used - 1;
1340
1341    /* carry */
1342    r = 0;
1343    for (x = b->used - 1; x >= 0; x--) {
1344      /* get the carry for the next iteration */
1345      rr = *tmpa & 1;
1346
1347      /* shift the current digit, add in carry and store */
1348      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1349
1350      /* forward carry to next iteration */
1351      r = rr;
1352    }
1353
1354    /* zero excess digits */
1355    tmpb = b->dp + b->used;
1356    for (x = b->used; x < oldused; x++) {
1357      *tmpb++ = 0;
1358    }
1359  }
1360  b->sign = a->sign;
1361  mp_clamp (b);
1362  return MP_OKAY;
1363}
1364#endif /* LTM_NO_NEG_EXP */
1365
1366
1367/* shift left by a certain bit count */
1368static int mp_mul_2d (mp_int * a, int b, mp_int * c)
1369{
1370  mp_digit d;
1371  int      res;
1372
1373  /* copy */
1374  if (a != c) {
1375     if ((res = mp_copy (a, c)) != MP_OKAY) {
1376       return res;
1377     }
1378  }
1379
1380  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
1381     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
1382       return res;
1383     }
1384  }
1385
1386  /* shift by as many digits in the bit count */
1387  if (b >= (int)DIGIT_BIT) {
1388    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
1389      return res;
1390    }
1391  }
1392
1393  /* shift any bit count < DIGIT_BIT */
1394  d = (mp_digit) (b % DIGIT_BIT);
1395  if (d != 0) {
1396    register mp_digit *tmpc, shift, mask, r, rr;
1397    register int x;
1398
1399    /* bitmask for carries */
1400    mask = (((mp_digit)1) << d) - 1;
1401
1402    /* shift for msbs */
1403    shift = DIGIT_BIT - d;
1404
1405    /* alias */
1406    tmpc = c->dp;
1407
1408    /* carry */
1409    r    = 0;
1410    for (x = 0; x < c->used; x++) {
1411      /* get the higher bits of the current word */
1412      rr = (*tmpc >> shift) & mask;
1413
1414      /* shift the current word and OR in the carry */
1415      *tmpc = ((*tmpc << d) | r) & MP_MASK;
1416      ++tmpc;
1417
1418      /* set the carry to the carry bits of the current word */
1419      r = rr;
1420    }
1421
1422    /* set final carry */
1423    if (r != 0) {
1424       c->dp[(c->used)++] = r;
1425    }
1426  }
1427  mp_clamp (c);
1428  return MP_OKAY;
1429}
1430
1431
1432#ifdef BN_MP_INIT_MULTI_C
1433static int mp_init_multi(mp_int *mp, ...)
1434{
1435    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
1436    int n = 0;                 /* Number of ok inits */
1437    mp_int* cur_arg = mp;
1438    va_list args;
1439
1440    va_start(args, mp);        /* init args to next argument from caller */
1441    while (cur_arg != NULL) {
1442        if (mp_init(cur_arg) != MP_OKAY) {
1443            /* Oops - error! Back-track and mp_clear what we already
1444               succeeded in init-ing, then return error.
1445            */
1446            va_list clean_args;
1447
1448            /* end the current list */
1449            va_end(args);
1450
1451            /* now start cleaning up */
1452            cur_arg = mp;
1453            va_start(clean_args, mp);
1454            while (n--) {
1455                mp_clear(cur_arg);
1456                cur_arg = va_arg(clean_args, mp_int*);
1457            }
1458            va_end(clean_args);
1459            res = MP_MEM;
1460            break;
1461        }
1462        n++;
1463        cur_arg = va_arg(args, mp_int*);
1464    }
1465    va_end(args);
1466    return res;                /* Assumed ok, if error flagged above. */
1467}
1468#endif
1469
1470
1471#ifdef BN_MP_CLEAR_MULTI_C
1472static void mp_clear_multi(mp_int *mp, ...)
1473{
1474    mp_int* next_mp = mp;
1475    va_list args;
1476    va_start(args, mp);
1477    while (next_mp != NULL) {
1478        mp_clear(next_mp);
1479        next_mp = va_arg(args, mp_int*);
1480    }
1481    va_end(args);
1482}
1483#endif
1484
1485
1486/* shift left a certain amount of digits */
1487static int mp_lshd (mp_int * a, int b)
1488{
1489  int     x, res;
1490
1491  /* if its less than zero return */
1492  if (b <= 0) {
1493    return MP_OKAY;
1494  }
1495
1496  /* grow to fit the new digits */
1497  if (a->alloc < a->used + b) {
1498     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
1499       return res;
1500     }
1501  }
1502
1503  {
1504    register mp_digit *top, *bottom;
1505
1506    /* increment the used by the shift amount then copy upwards */
1507    a->used += b;
1508
1509    /* top */
1510    top = a->dp + a->used - 1;
1511
1512    /* base */
1513    bottom = a->dp + a->used - 1 - b;
1514
1515    /* much like mp_rshd this is implemented using a sliding window
1516     * except the window goes the otherway around.  Copying from
1517     * the bottom to the top.  see bn_mp_rshd.c for more info.
1518     */
1519    for (x = a->used - 1; x >= b; x--) {
1520      *top-- = *bottom--;
1521    }
1522
1523    /* zero the lower digits */
1524    top = a->dp;
1525    for (x = 0; x < b; x++) {
1526      *top++ = 0;
1527    }
1528  }
1529  return MP_OKAY;
1530}
1531
1532
1533/* returns the number of bits in an int */
1534static int mp_count_bits (mp_int * a)
1535{
1536  int     r;
1537  mp_digit q;
1538
1539  /* shortcut */
1540  if (a->used == 0) {
1541    return 0;
1542  }
1543
1544  /* get number of digits and add that */
1545  r = (a->used - 1) * DIGIT_BIT;
1546
1547  /* take the last digit and count the bits in it */
1548  q = a->dp[a->used - 1];
1549  while (q > ((mp_digit) 0)) {
1550    ++r;
1551    q >>= ((mp_digit) 1);
1552  }
1553  return r;
1554}
1555
1556
1557/* calc a value mod 2**b */
1558static int mp_mod_2d (mp_int * a, int b, mp_int * c)
1559{
1560  int     x, res;
1561
1562  /* if b is <= 0 then zero the int */
1563  if (b <= 0) {
1564    mp_zero (c);
1565    return MP_OKAY;
1566  }
1567
1568  /* if the modulus is larger than the value than return */
1569  if (b >= (int) (a->used * DIGIT_BIT)) {
1570    res = mp_copy (a, c);
1571    return res;
1572  }
1573
1574  /* copy */
1575  if ((res = mp_copy (a, c)) != MP_OKAY) {
1576    return res;
1577  }
1578
1579  /* zero digits above the last digit of the modulus */
1580  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
1581    c->dp[x] = 0;
1582  }
1583  /* clear the digit that is not completely outside/inside the modulus */
1584  c->dp[b / DIGIT_BIT] &=
1585    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
1586  mp_clamp (c);
1587  return MP_OKAY;
1588}
1589
1590
1591#ifdef BN_MP_DIV_SMALL
1592
1593/* slower bit-bang division... also smaller */
1594static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1595{
1596   mp_int ta, tb, tq, q;
1597   int    res, n, n2;
1598
1599  /* is divisor zero ? */
1600  if (mp_iszero (b) == 1) {
1601    return MP_VAL;
1602  }
1603
1604  /* if a < b then q=0, r = a */
1605  if (mp_cmp_mag (a, b) == MP_LT) {
1606    if (d != NULL) {
1607      res = mp_copy (a, d);
1608    } else {
1609      res = MP_OKAY;
1610    }
1611    if (c != NULL) {
1612      mp_zero (c);
1613    }
1614    return res;
1615  }
1616
1617  /* init our temps */
1618  if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL) != MP_OKAY)) {
1619     return res;
1620  }
1621
1622
1623  mp_set(&tq, 1);
1624  n = mp_count_bits(a) - mp_count_bits(b);
1625  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
1626      ((res = mp_abs(b, &tb)) != MP_OKAY) ||
1627      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
1628      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
1629      goto LBL_ERR;
1630  }
1631
1632  while (n-- >= 0) {
1633     if (mp_cmp(&tb, &ta) != MP_GT) {
1634        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
1635            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
1636           goto LBL_ERR;
1637        }
1638     }
1639     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
1640         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
1641           goto LBL_ERR;
1642     }
1643  }
1644
1645  /* now q == quotient and ta == remainder */
1646  n  = a->sign;
1647  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
1648  if (c != NULL) {
1649     mp_exch(c, &q);
1650     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
1651  }
1652  if (d != NULL) {
1653     mp_exch(d, &ta);
1654     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
1655  }
1656LBL_ERR:
1657   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
1658   return res;
1659}
1660
1661#else
1662
1663/* integer signed division.
1664 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1665 * HAC pp.598 Algorithm 14.20
1666 *
1667 * Note that the description in HAC is horribly
1668 * incomplete.  For example, it doesn't consider
1669 * the case where digits are removed from 'x' in
1670 * the inner loop.  It also doesn't consider the
1671 * case that y has fewer than three digits, etc..
1672 *
1673 * The overall algorithm is as described as
1674 * 14.20 from HAC but fixed to treat these cases.
1675*/
1676static int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1677{
1678  mp_int  q, x, y, t1, t2;
1679  int     res, n, t, i, norm, neg;
1680
1681  /* is divisor zero ? */
1682  if (mp_iszero (b) == 1) {
1683    return MP_VAL;
1684  }
1685
1686  /* if a < b then q=0, r = a */
1687  if (mp_cmp_mag (a, b) == MP_LT) {
1688    if (d != NULL) {
1689      res = mp_copy (a, d);
1690    } else {
1691      res = MP_OKAY;
1692    }
1693    if (c != NULL) {
1694      mp_zero (c);
1695    }
1696    return res;
1697  }
1698
1699  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1700    return res;
1701  }
1702  q.used = a->used + 2;
1703
1704  if ((res = mp_init (&t1)) != MP_OKAY) {
1705    goto LBL_Q;
1706  }
1707
1708  if ((res = mp_init (&t2)) != MP_OKAY) {
1709    goto LBL_T1;
1710  }
1711
1712  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1713    goto LBL_T2;
1714  }
1715
1716  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1717    goto LBL_X;
1718  }
1719
1720  /* fix the sign */
1721  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1722  x.sign = y.sign = MP_ZPOS;
1723
1724  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1725  norm = mp_count_bits(&y) % DIGIT_BIT;
1726  if (norm < (int)(DIGIT_BIT-1)) {
1727     norm = (DIGIT_BIT-1) - norm;
1728     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1729       goto LBL_Y;
1730     }
1731     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1732       goto LBL_Y;
1733     }
1734  } else {
1735     norm = 0;
1736  }
1737
1738  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1739  n = x.used - 1;
1740  t = y.used - 1;
1741
1742  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1743  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1744    goto LBL_Y;
1745  }
1746
1747  while (mp_cmp (&x, &y) != MP_LT) {
1748    ++(q.dp[n - t]);
1749    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1750      goto LBL_Y;
1751    }
1752  }
1753
1754  /* reset y by shifting it back down */
1755  mp_rshd (&y, n - t);
1756
1757  /* step 3. for i from n down to (t + 1) */
1758  for (i = n; i >= (t + 1); i--) {
1759    if (i > x.used) {
1760      continue;
1761    }
1762
1763    /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1764     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1765    if (x.dp[i] == y.dp[t]) {
1766      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1767    } else {
1768      mp_word tmp;
1769      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1770      tmp |= ((mp_word) x.dp[i - 1]);
1771      tmp /= ((mp_word) y.dp[t]);
1772      if (tmp > (mp_word) MP_MASK)
1773        tmp = MP_MASK;
1774      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1775    }
1776
1777    /* while (q{i-t-1} * (yt * b + y{t-1})) >
1778             xi * b**2 + xi-1 * b + xi-2
1779
1780       do q{i-t-1} -= 1;
1781    */
1782    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1783    do {
1784      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1785
1786      /* find left hand */
1787      mp_zero (&t1);
1788      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1789      t1.dp[1] = y.dp[t];
1790      t1.used = 2;
1791      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1792        goto LBL_Y;
1793      }
1794
1795      /* find right hand */
1796      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1797      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1798      t2.dp[2] = x.dp[i];
1799      t2.used = 3;
1800    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1801
1802    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1803    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1804      goto LBL_Y;
1805    }
1806
1807    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1808      goto LBL_Y;
1809    }
1810
1811    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1812      goto LBL_Y;
1813    }
1814
1815    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1816    if (x.sign == MP_NEG) {
1817      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1818        goto LBL_Y;
1819      }
1820      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1821        goto LBL_Y;
1822      }
1823      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1824        goto LBL_Y;
1825      }
1826
1827      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1828    }
1829  }
1830
1831  /* now q is the quotient and x is the remainder
1832   * [which we have to normalize]
1833   */
1834
1835  /* get sign before writing to c */
1836  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1837
1838  if (c != NULL) {
1839    mp_clamp (&q);
1840    mp_exch (&q, c);
1841    c->sign = neg;
1842  }
1843
1844  if (d != NULL) {
1845    mp_div_2d (&x, norm, &x, NULL);
1846    mp_exch (&x, d);
1847  }
1848
1849  res = MP_OKAY;
1850
1851LBL_Y:mp_clear (&y);
1852LBL_X:mp_clear (&x);
1853LBL_T2:mp_clear (&t2);
1854LBL_T1:mp_clear (&t1);
1855LBL_Q:mp_clear (&q);
1856  return res;
1857}
1858
1859#endif
1860
1861
1862#ifdef MP_LOW_MEM
1863   #define TAB_SIZE 32
1864#else
1865   #define TAB_SIZE 256
1866#endif
1867
1868static int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
1869{
1870  mp_int  M[TAB_SIZE], res, mu;
1871  mp_digit buf;
1872  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1873  int (*redux)(mp_int*,mp_int*,mp_int*);
1874
1875  /* find window size */
1876  x = mp_count_bits (X);
1877  if (x <= 7) {
1878    winsize = 2;
1879  } else if (x <= 36) {
1880    winsize = 3;
1881  } else if (x <= 140) {
1882    winsize = 4;
1883  } else if (x <= 450) {
1884    winsize = 5;
1885  } else if (x <= 1303) {
1886    winsize = 6;
1887  } else if (x <= 3529) {
1888    winsize = 7;
1889  } else {
1890    winsize = 8;
1891  }
1892
1893#ifdef MP_LOW_MEM
1894    if (winsize > 5) {
1895       winsize = 5;
1896    }
1897#endif
1898
1899  /* init M array */
1900  /* init first cell */
1901  if ((err = mp_init(&M[1])) != MP_OKAY) {
1902     return err;
1903  }
1904
1905  /* now init the second half of the array */
1906  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1907    if ((err = mp_init(&M[x])) != MP_OKAY) {
1908      for (y = 1<<(winsize-1); y < x; y++) {
1909        mp_clear (&M[y]);
1910      }
1911      mp_clear(&M[1]);
1912      return err;
1913    }
1914  }
1915
1916  /* create mu, used for Barrett reduction */
1917  if ((err = mp_init (&mu)) != MP_OKAY) {
1918    goto LBL_M;
1919  }
1920
1921  if (redmode == 0) {
1922     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
1923        goto LBL_MU;
1924     }
1925     redux = mp_reduce;
1926  } else {
1927     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
1928        goto LBL_MU;
1929     }
1930     redux = mp_reduce_2k_l;
1931  }
1932
1933  /* create M table
1934   *
1935   * The M table contains powers of the base,
1936   * e.g. M[x] = G**x mod P
1937   *
1938   * The first half of the table is not
1939   * computed though accept for M[0] and M[1]
1940   */
1941  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
1942    goto LBL_MU;
1943  }
1944
1945  /* compute the value at M[1<<(winsize-1)] by squaring
1946   * M[1] (winsize-1) times
1947   */
1948  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
1949    goto LBL_MU;
1950  }
1951
1952  for (x = 0; x < (winsize - 1); x++) {
1953    /* square it */
1954    if ((err = mp_sqr (&M[1 << (winsize - 1)],
1955                       &M[1 << (winsize - 1)])) != MP_OKAY) {
1956      goto LBL_MU;
1957    }
1958
1959    /* reduce modulo P */
1960    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
1961      goto LBL_MU;
1962    }
1963  }
1964
1965  /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
1966   * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
1967   */
1968  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
1969    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
1970      goto LBL_MU;
1971    }
1972    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
1973      goto LBL_MU;
1974    }
1975  }
1976
1977  /* setup result */
1978  if ((err = mp_init (&res)) != MP_OKAY) {
1979    goto LBL_MU;
1980  }
1981  mp_set (&res, 1);
1982
1983  /* set initial mode and bit cnt */
1984  mode   = 0;
1985  bitcnt = 1;
1986  buf    = 0;
1987  digidx = X->used - 1;
1988  bitcpy = 0;
1989  bitbuf = 0;
1990
1991  for (;;) {
1992    /* grab next digit as required */
1993    if (--bitcnt == 0) {
1994      /* if digidx == -1 we are out of digits */
1995      if (digidx == -1) {
1996        break;
1997      }
1998      /* read next digit and reset the bitcnt */
1999      buf    = X->dp[digidx--];
2000      bitcnt = (int) DIGIT_BIT;
2001    }
2002
2003    /* grab the next msb from the exponent */
2004    y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
2005    buf <<= (mp_digit)1;
2006
2007    /* if the bit is zero and mode == 0 then we ignore it
2008     * These represent the leading zero bits before the first 1 bit
2009     * in the exponent.  Technically this opt is not required but it
2010     * does lower the # of trivial squaring/reductions used
2011     */
2012    if (mode == 0 && y == 0) {
2013      continue;
2014    }
2015
2016    /* if the bit is zero and mode == 1 then we square */
2017    if (mode == 1 && y == 0) {
2018      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2019        goto LBL_RES;
2020      }
2021      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2022        goto LBL_RES;
2023      }
2024      continue;
2025    }
2026
2027    /* else we add it to the window */
2028    bitbuf |= (y << (winsize - ++bitcpy));
2029    mode    = 2;
2030
2031    if (bitcpy == winsize) {
2032      /* ok window is filled so square as required and multiply  */
2033      /* square first */
2034      for (x = 0; x < winsize; x++) {
2035        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2036          goto LBL_RES;
2037        }
2038        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2039          goto LBL_RES;
2040        }
2041      }
2042
2043      /* then multiply */
2044      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
2045        goto LBL_RES;
2046      }
2047      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2048        goto LBL_RES;
2049      }
2050
2051      /* empty window and reset */
2052      bitcpy = 0;
2053      bitbuf = 0;
2054      mode   = 1;
2055    }
2056  }
2057
2058  /* if bits remain then square/multiply */
2059  if (mode == 2 && bitcpy > 0) {
2060    /* square then multiply if the bit is set */
2061    for (x = 0; x < bitcpy; x++) {
2062      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2063        goto LBL_RES;
2064      }
2065      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2066        goto LBL_RES;
2067      }
2068
2069      bitbuf <<= 1;
2070      if ((bitbuf & (1 << winsize)) != 0) {
2071        /* then multiply */
2072        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
2073          goto LBL_RES;
2074        }
2075        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2076          goto LBL_RES;
2077        }
2078      }
2079    }
2080  }
2081
2082  mp_exch (&res, Y);
2083  err = MP_OKAY;
2084LBL_RES:mp_clear (&res);
2085LBL_MU:mp_clear (&mu);
2086LBL_M:
2087  mp_clear(&M[1]);
2088  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
2089    mp_clear (&M[x]);
2090  }
2091  return err;
2092}
2093
2094
2095/* computes b = a*a */
2096static int mp_sqr (mp_int * a, mp_int * b)
2097{
2098  int     res;
2099
2100#ifdef BN_MP_TOOM_SQR_C
2101  /* use Toom-Cook? */
2102  if (a->used >= TOOM_SQR_CUTOFF) {
2103    res = mp_toom_sqr(a, b);
2104  /* Karatsuba? */
2105  } else
2106#endif
2107#ifdef BN_MP_KARATSUBA_SQR_C
2108if (a->used >= KARATSUBA_SQR_CUTOFF) {
2109    res = mp_karatsuba_sqr (a, b);
2110  } else
2111#endif
2112  {
2113#ifdef BN_FAST_S_MP_SQR_C
2114    /* can we use the fast comba multiplier? */
2115    if ((a->used * 2 + 1) < MP_WARRAY &&
2116         a->used <
2117         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
2118      res = fast_s_mp_sqr (a, b);
2119    } else
2120#endif
2121#ifdef BN_S_MP_SQR_C
2122      res = s_mp_sqr (a, b);
2123#else
2124#error mp_sqr could fail
2125      res = MP_VAL;
2126#endif
2127  }
2128  b->sign = MP_ZPOS;
2129  return res;
2130}
2131
2132
2133/* reduces a modulo n where n is of the form 2**p - d
2134   This differs from reduce_2k since "d" can be larger
2135   than a single digit.
2136*/
2137static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
2138{
2139   mp_int q;
2140   int    p, res;
2141
2142   if ((res = mp_init(&q)) != MP_OKAY) {
2143      return res;
2144   }
2145
2146   p = mp_count_bits(n);
2147top:
2148   /* q = a/2**p, a = a mod 2**p */
2149   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
2150      goto ERR;
2151   }
2152
2153   /* q = q * d */
2154   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
2155      goto ERR;
2156   }
2157
2158   /* a = a + q */
2159   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
2160      goto ERR;
2161   }
2162
2163   if (mp_cmp_mag(a, n) != MP_LT) {
2164      s_mp_sub(a, n, a);
2165      goto top;
2166   }
2167
2168ERR:
2169   mp_clear(&q);
2170   return res;
2171}
2172
2173
2174/* determines the setup value */
2175static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
2176{
2177   int    res;
2178   mp_int tmp;
2179
2180   if ((res = mp_init(&tmp)) != MP_OKAY) {
2181      return res;
2182   }
2183
2184   if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
2185      goto ERR;
2186   }
2187
2188   if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
2189      goto ERR;
2190   }
2191
2192ERR:
2193   mp_clear(&tmp);
2194   return res;
2195}
2196
2197
2198/* computes a = 2**b
2199 *
2200 * Simple algorithm which zeroes the int, grows it then just sets one bit
2201 * as required.
2202 */
2203static int mp_2expt (mp_int * a, int b)
2204{
2205  int     res;
2206
2207  /* zero a as per default */
2208  mp_zero (a);
2209
2210  /* grow a to accomodate the single bit */
2211  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
2212    return res;
2213  }
2214
2215  /* set the used count of where the bit will go */
2216  a->used = b / DIGIT_BIT + 1;
2217
2218  /* put the single bit in its place */
2219  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
2220
2221  return MP_OKAY;
2222}
2223
2224
2225/* pre-calculate the value required for Barrett reduction
2226 * For a given modulus "b" it calulates the value required in "a"
2227 */
2228static int mp_reduce_setup (mp_int * a, mp_int * b)
2229{
2230  int     res;
2231
2232  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
2233    return res;
2234  }
2235  return mp_div (a, b, a, NULL);
2236}
2237
2238
2239/* reduces x mod m, assumes 0 < x < m**2, mu is
2240 * precomputed via mp_reduce_setup.
2241 * From HAC pp.604 Algorithm 14.42
2242 */
2243static int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
2244{
2245  mp_int  q;
2246  int     res, um = m->used;
2247
2248  /* q = x */
2249  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
2250    return res;
2251  }
2252
2253  /* q1 = x / b**(k-1)  */
2254  mp_rshd (&q, um - 1);
2255
2256  /* according to HAC this optimization is ok */
2257  if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
2258    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
2259      goto CLEANUP;
2260    }
2261  } else {
2262#ifdef BN_S_MP_MUL_HIGH_DIGS_C
2263    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
2264      goto CLEANUP;
2265    }
2266#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
2267    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
2268      goto CLEANUP;
2269    }
2270#else
2271    {
2272#error mp_reduce would always fail
2273      res = MP_VAL;
2274      goto CLEANUP;
2275    }
2276#endif
2277  }
2278
2279  /* q3 = q2 / b**(k+1) */
2280  mp_rshd (&q, um + 1);
2281
2282  /* x = x mod b**(k+1), quick (no division) */
2283  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
2284    goto CLEANUP;
2285  }
2286
2287  /* q = q * m mod b**(k+1), quick (no division) */
2288  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
2289    goto CLEANUP;
2290  }
2291
2292  /* x = x - q */
2293  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
2294    goto CLEANUP;
2295  }
2296
2297  /* If x < 0, add b**(k+1) to it */
2298  if (mp_cmp_d (x, 0) == MP_LT) {
2299    mp_set (&q, 1);
2300    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY) {
2301      goto CLEANUP;
2302    }
2303    if ((res = mp_add (x, &q, x)) != MP_OKAY) {
2304      goto CLEANUP;
2305    }
2306  }
2307
2308  /* Back off if it's too big */
2309  while (mp_cmp (x, m) != MP_LT) {
2310    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
2311      goto CLEANUP;
2312    }
2313  }
2314
2315CLEANUP:
2316  mp_clear (&q);
2317
2318  return res;
2319}
2320
2321
2322/* multiplies |a| * |b| and only computes upto digs digits of result
2323 * HAC pp. 595, Algorithm 14.12  Modified so you can control how
2324 * many digits of output are created.
2325 */
2326static int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2327{
2328  mp_int  t;
2329  int     res, pa, pb, ix, iy;
2330  mp_digit u;
2331  mp_word r;
2332  mp_digit tmpx, *tmpt, *tmpy;
2333
2334  /* can we use the fast multiplier? */
2335  if (((digs) < MP_WARRAY) &&
2336      MIN (a->used, b->used) <
2337          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2338    return fast_s_mp_mul_digs (a, b, c, digs);
2339  }
2340
2341  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
2342    return res;
2343  }
2344  t.used = digs;
2345
2346  /* compute the digits of the product directly */
2347  pa = a->used;
2348  for (ix = 0; ix < pa; ix++) {
2349    /* set the carry to zero */
2350    u = 0;
2351
2352    /* limit ourselves to making digs digits of output */
2353    pb = MIN (b->used, digs - ix);
2354
2355    /* setup some aliases */
2356    /* copy of the digit from a used within the nested loop */
2357    tmpx = a->dp[ix];
2358
2359    /* an alias for the destination shifted ix places */
2360    tmpt = t.dp + ix;
2361
2362    /* an alias for the digits of b */
2363    tmpy = b->dp;
2364
2365    /* compute the columns of the output and propagate the carry */
2366    for (iy = 0; iy < pb; iy++) {
2367      /* compute the column as a mp_word */
2368      r       = ((mp_word)*tmpt) +
2369                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2370                ((mp_word) u);
2371
2372      /* the new column is the lower part of the result */
2373      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2374
2375      /* get the carry word from the result */
2376      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2377    }
2378    /* set carry if it is placed below digs */
2379    if (ix + iy < digs) {
2380      *tmpt = u;
2381    }
2382  }
2383
2384  mp_clamp (&t);
2385  mp_exch (&t, c);
2386
2387  mp_clear (&t);
2388  return MP_OKAY;
2389}
2390
2391
2392/* Fast (comba) multiplier
2393 *
2394 * This is the fast column-array [comba] multiplier.  It is
2395 * designed to compute the columns of the product first
2396 * then handle the carries afterwards.  This has the effect
2397 * of making the nested loops that compute the columns very
2398 * simple and schedulable on super-scalar processors.
2399 *
2400 * This has been modified to produce a variable number of
2401 * digits of output so if say only a half-product is required
2402 * you don't have to compute the upper half (a feature
2403 * required for fast Barrett reduction).
2404 *
2405 * Based on Algorithm 14.12 on pp.595 of HAC.
2406 *
2407 */
2408static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2409{
2410  int     olduse, res, pa, ix, iz;
2411  mp_digit W[MP_WARRAY];
2412  register mp_word  _W;
2413
2414  /* grow the destination as required */
2415  if (c->alloc < digs) {
2416    if ((res = mp_grow (c, digs)) != MP_OKAY) {
2417      return res;
2418    }
2419  }
2420
2421  /* number of output digits to produce */
2422  pa = MIN(digs, a->used + b->used);
2423
2424  /* clear the carry */
2425  _W = 0;
2426  for (ix = 0; ix < pa; ix++) {
2427      int      tx, ty;
2428      int      iy;
2429      mp_digit *tmpx, *tmpy;
2430
2431      /* get offsets into the two bignums */
2432      ty = MIN(b->used-1, ix);
2433      tx = ix - ty;
2434
2435      /* setup temp aliases */
2436      tmpx = a->dp + tx;
2437      tmpy = b->dp + ty;
2438
2439      /* this is the number of times the loop will iterrate, essentially
2440         while (tx++ < a->used && ty-- >= 0) { ... }
2441       */
2442      iy = MIN(a->used-tx, ty+1);
2443
2444      /* execute loop */
2445      for (iz = 0; iz < iy; ++iz) {
2446         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2447
2448      }
2449
2450      /* store term */
2451      W[ix] = ((mp_digit)_W) & MP_MASK;
2452
2453      /* make next carry */
2454      _W = _W >> ((mp_word)DIGIT_BIT);
2455 }
2456
2457  /* setup dest */
2458  olduse  = c->used;
2459  c->used = pa;
2460
2461  {
2462    register mp_digit *tmpc;
2463    tmpc = c->dp;
2464    for (ix = 0; ix < pa+1; ix++) {
2465      /* now extract the previous digit [below the carry] */
2466      *tmpc++ = W[ix];
2467    }
2468
2469    /* clear unused digits [that existed in the old copy of c] */
2470    for (; ix < olduse; ix++) {
2471      *tmpc++ = 0;
2472    }
2473  }
2474  mp_clamp (c);
2475  return MP_OKAY;
2476}
2477
2478
2479/* init an mp_init for a given size */
2480static int mp_init_size (mp_int * a, int size)
2481{
2482  int x;
2483
2484  /* pad size so there are always extra digits */
2485  size += (MP_PREC * 2) - (size % MP_PREC);
2486
2487  /* alloc mem */
2488  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
2489  if (a->dp == NULL) {
2490    return MP_MEM;
2491  }
2492
2493  /* set the members */
2494  a->used  = 0;
2495  a->alloc = size;
2496  a->sign  = MP_ZPOS;
2497
2498  /* zero the digits */
2499  for (x = 0; x < size; x++) {
2500      a->dp[x] = 0;
2501  }
2502
2503  return MP_OKAY;
2504}
2505
2506
2507/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
2508static int s_mp_sqr (mp_int * a, mp_int * b)
2509{
2510  mp_int  t;
2511  int     res, ix, iy, pa;
2512  mp_word r;
2513  mp_digit u, tmpx, *tmpt;
2514
2515  pa = a->used;
2516  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
2517    return res;
2518  }
2519
2520  /* default used is maximum possible size */
2521  t.used = 2*pa + 1;
2522
2523  for (ix = 0; ix < pa; ix++) {
2524    /* first calculate the digit at 2*ix */
2525    /* calculate double precision result */
2526    r = ((mp_word) t.dp[2*ix]) +
2527        ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
2528
2529    /* store lower part in result */
2530    t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
2531
2532    /* get the carry */
2533    u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2534
2535    /* left hand side of A[ix] * A[iy] */
2536    tmpx        = a->dp[ix];
2537
2538    /* alias for where to store the results */
2539    tmpt        = t.dp + (2*ix + 1);
2540
2541    for (iy = ix + 1; iy < pa; iy++) {
2542      /* first calculate the product */
2543      r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
2544
2545      /* now calculate the double precision result, note we use
2546       * addition instead of *2 since it's easier to optimize
2547       */
2548      r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
2549
2550      /* store lower part */
2551      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2552
2553      /* get carry */
2554      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2555    }
2556    /* propagate upwards */
2557    while (u != ((mp_digit) 0)) {
2558      r       = ((mp_word) *tmpt) + ((mp_word) u);
2559      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2560      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2561    }
2562  }
2563
2564  mp_clamp (&t);
2565  mp_exch (&t, b);
2566  mp_clear (&t);
2567  return MP_OKAY;
2568}
2569
2570
2571/* multiplies |a| * |b| and does not compute the lower digs digits
2572 * [meant to get the higher part of the product]
2573 */
2574static int s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2575{
2576  mp_int  t;
2577  int     res, pa, pb, ix, iy;
2578  mp_digit u;
2579  mp_word r;
2580  mp_digit tmpx, *tmpt, *tmpy;
2581
2582  /* can we use the fast multiplier? */
2583#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
2584  if (((a->used + b->used + 1) < MP_WARRAY)
2585      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2586    return fast_s_mp_mul_high_digs (a, b, c, digs);
2587  }
2588#endif
2589
2590  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
2591    return res;
2592  }
2593  t.used = a->used + b->used + 1;
2594
2595  pa = a->used;
2596  pb = b->used;
2597  for (ix = 0; ix < pa; ix++) {
2598    /* clear the carry */
2599    u = 0;
2600
2601    /* left hand side of A[ix] * B[iy] */
2602    tmpx = a->dp[ix];
2603
2604    /* alias to the address of where the digits will be stored */
2605    tmpt = &(t.dp[digs]);
2606
2607    /* alias for where to read the right hand side from */
2608    tmpy = b->dp + (digs - ix);
2609
2610    for (iy = digs - ix; iy < pb; iy++) {
2611      /* calculate the double precision result */
2612      r       = ((mp_word)*tmpt) +
2613                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2614                ((mp_word) u);
2615
2616      /* get the lower part */
2617      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2618
2619      /* carry the carry */
2620      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2621    }
2622    *tmpt = u;
2623  }
2624  mp_clamp (&t);
2625  mp_exch (&t, c);
2626  mp_clear (&t);
2627  return MP_OKAY;
2628}
2629
2630
2631#ifdef BN_MP_MONTGOMERY_SETUP_C
2632/* setups the montgomery reduction stuff */
2633static int
2634mp_montgomery_setup (mp_int * n, mp_digit * rho)
2635{
2636  mp_digit x, b;
2637
2638/* fast inversion mod 2**k
2639 *
2640 * Based on the fact that
2641 *
2642 * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
2643 *                    =>  2*X*A - X*X*A*A = 1
2644 *                    =>  2*(1) - (1)     = 1
2645 */
2646  b = n->dp[0];
2647
2648  if ((b & 1) == 0) {
2649    return MP_VAL;
2650  }
2651
2652  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
2653  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
2654#if !defined(MP_8BIT)
2655  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
2656#endif
2657#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
2658  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
2659#endif
2660#ifdef MP_64BIT
2661  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
2662#endif
2663
2664  /* rho = -1/m mod b */
2665  *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
2666
2667  return MP_OKAY;
2668}
2669#endif
2670
2671
2672#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
2673/* computes xR**-1 == x (mod N) via Montgomery Reduction
2674 *
2675 * This is an optimized implementation of montgomery_reduce
2676 * which uses the comba method to quickly calculate the columns of the
2677 * reduction.
2678 *
2679 * Based on Algorithm 14.32 on pp.601 of HAC.
2680*/
2681int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
2682{
2683  int     ix, res, olduse;
2684  mp_word W[MP_WARRAY];
2685
2686  /* get old used count */
2687  olduse = x->used;
2688
2689  /* grow a as required */
2690  if (x->alloc < n->used + 1) {
2691    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
2692      return res;
2693    }
2694  }
2695
2696  /* first we have to get the digits of the input into
2697   * an array of double precision words W[...]
2698   */
2699  {
2700    register mp_word *_W;
2701    register mp_digit *tmpx;
2702
2703    /* alias for the W[] array */
2704    _W   = W;
2705
2706    /* alias for the digits of  x*/
2707    tmpx = x->dp;
2708
2709    /* copy the digits of a into W[0..a->used-1] */
2710    for (ix = 0; ix < x->used; ix++) {
2711      *_W++ = *tmpx++;
2712    }
2713
2714    /* zero the high words of W[a->used..m->used*2] */
2715    for (; ix < n->used * 2 + 1; ix++) {
2716      *_W++ = 0;
2717    }
2718  }
2719
2720  /* now we proceed to zero successive digits
2721   * from the least significant upwards
2722   */
2723  for (ix = 0; ix < n->used; ix++) {
2724    /* mu = ai * m' mod b
2725     *
2726     * We avoid a double precision multiplication (which isn't required)
2727     * by casting the value down to a mp_digit.  Note this requires
2728     * that W[ix-1] have  the carry cleared (see after the inner loop)
2729     */
2730    register mp_digit mu;
2731    mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
2732
2733    /* a = a + mu * m * b**i
2734     *
2735     * This is computed in place and on the fly.  The multiplication
2736     * by b**i is handled by offseting which columns the results
2737     * are added to.
2738     *
2739     * Note the comba method normally doesn't handle carries in the
2740     * inner loop In this case we fix the carry from the previous
2741     * column since the Montgomery reduction requires digits of the
2742     * result (so far) [see above] to work.  This is
2743     * handled by fixing up one carry after the inner loop.  The
2744     * carry fixups are done in order so after these loops the
2745     * first m->used words of W[] have the carries fixed
2746     */
2747    {
2748      register int iy;
2749      register mp_digit *tmpn;
2750      register mp_word *_W;
2751
2752      /* alias for the digits of the modulus */
2753      tmpn = n->dp;
2754
2755      /* Alias for the columns set by an offset of ix */
2756      _W = W + ix;
2757
2758      /* inner loop */
2759      for (iy = 0; iy < n->used; iy++) {
2760          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
2761      }
2762    }
2763
2764    /* now fix carry for next digit, W[ix+1] */
2765    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
2766  }
2767
2768  /* now we have to propagate the carries and
2769   * shift the words downward [all those least
2770   * significant digits we zeroed].
2771   */
2772  {
2773    register mp_digit *tmpx;
2774    register mp_word *_W, *_W1;
2775
2776    /* nox fix rest of carries */
2777
2778    /* alias for current word */
2779    _W1 = W + ix;
2780
2781    /* alias for next word, where the carry goes */
2782    _W = W + ++ix;
2783
2784    for (; ix <= n->used * 2 + 1; ix++) {
2785      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
2786    }
2787
2788    /* copy out, A = A/b**n
2789     *
2790     * The result is A/b**n but instead of converting from an
2791     * array of mp_word to mp_digit than calling mp_rshd
2792     * we just copy them in the right order
2793     */
2794
2795    /* alias for destination word */
2796    tmpx = x->dp;
2797
2798    /* alias for shifted double precision result */
2799    _W = W + n->used;
2800
2801    for (ix = 0; ix < n->used + 1; ix++) {
2802      *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
2803    }
2804
2805    /* zero oldused digits, if the input a was larger than
2806     * m->used+1 we'll have to clear the digits
2807     */
2808    for (; ix < olduse; ix++) {
2809      *tmpx++ = 0;
2810    }
2811  }
2812
2813  /* set the max used and clamp */
2814  x->used = n->used + 1;
2815  mp_clamp (x);
2816
2817  /* if A >= m then A = A - m */
2818  if (mp_cmp_mag (x, n) != MP_LT) {
2819    return s_mp_sub (x, n, x);
2820  }
2821  return MP_OKAY;
2822}
2823#endif
2824
2825
2826#ifdef BN_MP_MUL_2_C
2827/* b = a*2 */
2828static int mp_mul_2(mp_int * a, mp_int * b)
2829{
2830  int     x, res, oldused;
2831
2832  /* grow to accomodate result */
2833  if (b->alloc < a->used + 1) {
2834    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2835      return res;
2836    }
2837  }
2838
2839  oldused = b->used;
2840  b->used = a->used;
2841
2842  {
2843    register mp_digit r, rr, *tmpa, *tmpb;
2844
2845    /* alias for source */
2846    tmpa = a->dp;
2847
2848    /* alias for dest */
2849    tmpb = b->dp;
2850
2851    /* carry */
2852    r = 0;
2853    for (x = 0; x < a->used; x++) {
2854
2855      /* get what will be the *next* carry bit from the
2856       * MSB of the current digit
2857       */
2858      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2859
2860      /* now shift up this digit, add in the carry [from the previous] */
2861      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2862
2863      /* copy the carry that would be from the source
2864       * digit into the next iteration
2865       */
2866      r = rr;
2867    }
2868
2869    /* new leading digit? */
2870    if (r != 0) {
2871      /* add a MSB which is always 1 at this point */
2872      *tmpb = 1;
2873      ++(b->used);
2874    }
2875
2876    /* now zero any excess digits on the destination
2877     * that we didn't write to
2878     */
2879    tmpb = b->dp + b->used;
2880    for (x = b->used; x < oldused; x++) {
2881      *tmpb++ = 0;
2882    }
2883  }
2884  b->sign = a->sign;
2885  return MP_OKAY;
2886}
2887#endif
2888
2889
2890#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
2891/*
2892 * shifts with subtractions when the result is greater than b.
2893 *
2894 * The method is slightly modified to shift B unconditionally upto just under
2895 * the leading bit of b.  This saves alot of multiple precision shifting.
2896 */
2897static int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
2898{
2899  int     x, bits, res;
2900
2901  /* how many bits of last digit does b use */
2902  bits = mp_count_bits (b) % DIGIT_BIT;
2903
2904  if (b->used > 1) {
2905     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
2906        return res;
2907     }
2908  } else {
2909     mp_set(a, 1);
2910     bits = 1;
2911  }
2912
2913
2914  /* now compute C = A * B mod b */
2915  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
2916    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
2917      return res;
2918    }
2919    if (mp_cmp_mag (a, b) != MP_LT) {
2920      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
2921        return res;
2922      }
2923    }
2924  }
2925
2926  return MP_OKAY;
2927}
2928#endif
2929
2930
2931#ifdef BN_MP_EXPTMOD_FAST_C
2932/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
2933 *
2934 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
2935 * The value of k changes based on the size of the exponent.
2936 *
2937 * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
2938 */
2939
2940static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
2941{
2942  mp_int  M[TAB_SIZE], res;
2943  mp_digit buf, mp;
2944  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
2945
2946  /* use a pointer to the reduction algorithm.  This allows us to use
2947   * one of many reduction algorithms without modding the guts of
2948   * the code with if statements everywhere.
2949   */
2950  int     (*redux)(mp_int*,mp_int*,mp_digit);
2951
2952  /* find window size */
2953  x = mp_count_bits (X);
2954  if (x <= 7) {
2955    winsize = 2;
2956  } else if (x <= 36) {
2957    winsize = 3;
2958  } else if (x <= 140) {
2959    winsize = 4;
2960  } else if (x <= 450) {
2961    winsize = 5;
2962  } else if (x <= 1303) {
2963    winsize = 6;
2964  } else if (x <= 3529) {
2965    winsize = 7;
2966  } else {
2967    winsize = 8;
2968  }
2969
2970#ifdef MP_LOW_MEM
2971  if (winsize > 5) {
2972     winsize = 5;
2973  }
2974#endif
2975
2976  /* init M array */
2977  /* init first cell */
2978  if ((err = mp_init(&M[1])) != MP_OKAY) {
2979     return err;
2980  }
2981
2982  /* now init the second half of the array */
2983  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
2984    if ((err = mp_init(&M[x])) != MP_OKAY) {
2985      for (y = 1<<(winsize-1); y < x; y++) {
2986        mp_clear (&M[y]);
2987      }
2988      mp_clear(&M[1]);
2989      return err;
2990    }
2991  }
2992
2993  /* determine and setup reduction code */
2994  if (redmode == 0) {
2995#ifdef BN_MP_MONTGOMERY_SETUP_C
2996     /* now setup montgomery  */
2997     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
2998        goto LBL_M;
2999     }
3000#else
3001     err = MP_VAL;
3002     goto LBL_M;
3003#endif
3004
3005     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
3006#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
3007     if (((P->used * 2 + 1) < MP_WARRAY) &&
3008          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
3009        redux = fast_mp_montgomery_reduce;
3010     } else
3011#endif
3012     {
3013#ifdef BN_MP_MONTGOMERY_REDUCE_C
3014        /* use slower baseline Montgomery method */
3015        redux = mp_montgomery_reduce;
3016#else
3017        err = MP_VAL;
3018        goto LBL_M;
3019#endif
3020     }
3021  } else if (redmode == 1) {
3022#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
3023     /* setup DR reduction for moduli of the form B**k - b */
3024     mp_dr_setup(P, &mp);
3025     redux = mp_dr_reduce;
3026#else
3027     err = MP_VAL;
3028     goto LBL_M;
3029#endif
3030  } else {
3031#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
3032     /* setup DR reduction for moduli of the form 2**k - b */
3033     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
3034        goto LBL_M;
3035     }
3036     redux = mp_reduce_2k;
3037#else
3038     err = MP_VAL;
3039     goto LBL_M;
3040#endif
3041  }
3042
3043  /* setup result */
3044  if ((err = mp_init (&res)) != MP_OKAY) {
3045    goto LBL_M;
3046  }
3047
3048  /* create M table
3049   *
3050
3051   *
3052   * The first half of the table is not computed though accept for M[0] and M[1]
3053   */
3054
3055  if (redmode == 0) {
3056#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
3057     /* now we need R mod m */
3058     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
3059       goto LBL_RES;
3060     }
3061#else
3062     err = MP_VAL;
3063     goto LBL_RES;
3064#endif
3065
3066     /* now set M[1] to G * R mod m */
3067     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
3068       goto LBL_RES;
3069     }
3070  } else {
3071     mp_set(&res, 1);
3072     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
3073        goto LBL_RES;
3074     }
3075  }
3076
3077  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
3078  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
3079    goto LBL_RES;
3080  }
3081
3082  for (x = 0; x < (winsize - 1); x++) {
3083    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
3084      goto LBL_RES;
3085    }
3086    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
3087      goto LBL_RES;
3088    }
3089  }
3090
3091  /* create upper table */
3092  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
3093    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
3094      goto LBL_RES;
3095    }
3096    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
3097      goto LBL_RES;
3098    }
3099  }
3100
3101  /* set initial mode and bit cnt */
3102  mode   = 0;
3103  bitcnt = 1;
3104  buf    = 0;
3105  digidx = X->used - 1;
3106  bitcpy = 0;
3107  bitbuf = 0;
3108
3109  for (;;) {
3110    /* grab next digit as required */
3111    if (--bitcnt == 0) {
3112      /* if digidx == -1 we are out of digits so break */
3113      if (digidx == -1) {
3114        break;
3115      }
3116      /* read next digit and reset bitcnt */
3117      buf    = X->dp[digidx--];
3118      bitcnt = (int)DIGIT_BIT;
3119    }
3120
3121    /* grab the next msb from the exponent */
3122    y     = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
3123    buf <<= (mp_digit)1;
3124
3125    /* if the bit is zero and mode == 0 then we ignore it
3126     * These represent the leading zero bits before the first 1 bit
3127     * in the exponent.  Technically this opt is not required but it
3128     * does lower the # of trivial squaring/reductions used
3129     */
3130    if (mode == 0 && y == 0) {
3131      continue;
3132    }
3133
3134    /* if the bit is zero and mode == 1 then we square */
3135    if (mode == 1 && y == 0) {
3136      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3137        goto LBL_RES;
3138      }
3139      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3140        goto LBL_RES;
3141      }
3142      continue;
3143    }
3144
3145    /* else we add it to the window */
3146    bitbuf |= (y << (winsize - ++bitcpy));
3147    mode    = 2;
3148
3149    if (bitcpy == winsize) {
3150      /* ok window is filled so square as required and multiply  */
3151      /* square first */
3152      for (x = 0; x < winsize; x++) {
3153        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3154          goto LBL_RES;
3155        }
3156        if ((err = redux (&res, P, mp)) != MP_OKAY) {
3157          goto LBL_RES;
3158        }
3159      }
3160
3161      /* then multiply */
3162      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
3163        goto LBL_RES;
3164      }
3165      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3166        goto LBL_RES;
3167      }
3168
3169      /* empty window and reset */
3170      bitcpy = 0;
3171      bitbuf = 0;
3172      mode   = 1;
3173    }
3174  }
3175
3176  /* if bits remain then square/multiply */
3177  if (mode == 2 && bitcpy > 0) {
3178    /* square then multiply if the bit is set */
3179    for (x = 0; x < bitcpy; x++) {
3180      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3181        goto LBL_RES;
3182      }
3183      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3184        goto LBL_RES;
3185      }
3186
3187      /* get next bit of the window */
3188      bitbuf <<= 1;
3189      if ((bitbuf & (1 << winsize)) != 0) {
3190        /* then multiply */
3191        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
3192          goto LBL_RES;
3193        }
3194        if ((err = redux (&res, P, mp)) != MP_OKAY) {
3195          goto LBL_RES;
3196        }
3197      }
3198    }
3199  }
3200
3201  if (redmode == 0) {
3202     /* fixup result if Montgomery reduction is used
3203      * recall that any value in a Montgomery system is
3204      * actually multiplied by R mod n.  So we have
3205      * to reduce one more time to cancel out the factor
3206      * of R.
3207      */
3208     if ((err = redux(&res, P, mp)) != MP_OKAY) {
3209       goto LBL_RES;
3210     }
3211  }
3212
3213  /* swap res with Y */
3214  mp_exch (&res, Y);
3215  err = MP_OKAY;
3216LBL_RES:mp_clear (&res);
3217LBL_M:
3218  mp_clear(&M[1]);
3219  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3220    mp_clear (&M[x]);
3221  }
3222  return err;
3223}
3224#endif
3225
3226
3227#ifdef BN_FAST_S_MP_SQR_C
3228/* the jist of squaring...
3229 * you do like mult except the offset of the tmpx [one that
3230 * starts closer to zero] can't equal the offset of tmpy.
3231 * So basically you set up iy like before then you min it with
3232 * (ty-tx) so that it never happens.  You double all those
3233 * you add in the inner loop
3234
3235After that loop you do the squares and add them in.
3236*/
3237
3238static int fast_s_mp_sqr (mp_int * a, mp_int * b)
3239{
3240  int       olduse, res, pa, ix, iz;
3241  mp_digit   W[MP_WARRAY], *tmpx;
3242  mp_word   W1;
3243
3244  /* grow the destination as required */
3245  pa = a->used + a->used;
3246  if (b->alloc < pa) {
3247    if ((res = mp_grow (b, pa)) != MP_OKAY) {
3248      return res;
3249    }
3250  }
3251
3252  /* number of output digits to produce */
3253  W1 = 0;
3254  for (ix = 0; ix < pa; ix++) {
3255      int      tx, ty, iy;
3256      mp_word  _W;
3257      mp_digit *tmpy;
3258
3259      /* clear counter */
3260      _W = 0;
3261
3262      /* get offsets into the two bignums */
3263      ty = MIN(a->used-1, ix);
3264      tx = ix - ty;
3265
3266      /* setup temp aliases */
3267      tmpx = a->dp + tx;
3268      tmpy = a->dp + ty;
3269
3270      /* this is the number of times the loop will iterrate, essentially
3271         while (tx++ < a->used && ty-- >= 0) { ... }
3272       */
3273      iy = MIN(a->used-tx, ty+1);
3274
3275      /* now for squaring tx can never equal ty
3276       * we halve the distance since they approach at a rate of 2x
3277       * and we have to round because odd cases need to be executed
3278       */
3279      iy = MIN(iy, (ty-tx+1)>>1);
3280
3281      /* execute loop */
3282      for (iz = 0; iz < iy; iz++) {
3283         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
3284      }
3285
3286      /* double the inner product and add carry */
3287      _W = _W + _W + W1;
3288
3289      /* even columns have the square term in them */
3290      if ((ix&1) == 0) {
3291         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
3292      }
3293
3294      /* store it */
3295      W[ix] = (mp_digit)(_W & MP_MASK);
3296
3297      /* make next carry */
3298      W1 = _W >> ((mp_word)DIGIT_BIT);
3299  }
3300
3301  /* setup dest */
3302  olduse  = b->used;
3303  b->used = a->used+a->used;
3304
3305  {
3306    mp_digit *tmpb;
3307    tmpb = b->dp;
3308    for (ix = 0; ix < pa; ix++) {
3309      *tmpb++ = W[ix] & MP_MASK;
3310    }
3311
3312    /* clear unused digits [that existed in the old copy of c] */
3313    for (; ix < olduse; ix++) {
3314      *tmpb++ = 0;
3315    }
3316  }
3317  mp_clamp (b);
3318  return MP_OKAY;
3319}
3320#endif
3321
3322
3323#ifdef BN_MP_MUL_D_C
3324/* multiply by a digit */
3325static int
3326mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
3327{
3328  mp_digit u, *tmpa, *tmpc;
3329  mp_word  r;
3330  int      ix, res, olduse;
3331
3332  /* make sure c is big enough to hold a*b */
3333  if (c->alloc < a->used + 1) {
3334    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
3335      return res;
3336    }
3337  }
3338
3339  /* get the original destinations used count */
3340  olduse = c->used;
3341
3342  /* set the sign */
3343  c->sign = a->sign;
3344
3345  /* alias for a->dp [source] */
3346  tmpa = a->dp;
3347
3348  /* alias for c->dp [dest] */
3349  tmpc = c->dp;
3350
3351  /* zero carry */
3352  u = 0;
3353
3354  /* compute columns */
3355  for (ix = 0; ix < a->used; ix++) {
3356    /* compute product and carry sum for this term */
3357    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
3358
3359    /* mask off higher bits to get a single digit */
3360    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
3361
3362    /* send carry into next iteration */
3363    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
3364  }
3365
3366  /* store final carry [if any] and increment ix offset  */
3367  *tmpc++ = u;
3368  ++ix;
3369
3370  /* now zero digits above the top */
3371  while (ix++ < olduse) {
3372     *tmpc++ = 0;
3373  }
3374
3375  /* set used count */
3376  c->used = a->used + 1;
3377  mp_clamp(c);
3378
3379  return MP_OKAY;
3380}
3381#endif
3382