1/*
2 * Minimal code for RSA support from LibTomMath 0.41
3 * http://libtom.org/
4 * http://libtom.org/files/ltm-0.41.tar.bz2
5 * This library was released in public domain by Tom St Denis.
6 *
7 * The combination in this file may not use all of the optimized algorithms
8 * from LibTomMath and may be considerable slower than the LibTomMath with its
9 * default settings. The main purpose of having this version here is to make it
10 * easier to build bignum.c wrapper without having to install and build an
11 * external library.
12 *
13 * If CONFIG_INTERNAL_LIBTOMMATH is defined, bignum.c includes this
14 * libtommath.c file instead of using the external LibTomMath library.
15 */
16
17#ifndef CHAR_BIT
18#define CHAR_BIT 8
19#endif
20
21#define BN_MP_INVMOD_C
22#define BN_S_MP_EXPTMOD_C /* Note: #undef in tommath_superclass.h; this would
23			   * require BN_MP_EXPTMOD_FAST_C instead */
24#define BN_S_MP_MUL_DIGS_C
25#define BN_MP_INVMOD_SLOW_C
26#define BN_S_MP_SQR_C
27#define BN_S_MP_MUL_HIGH_DIGS_C /* Note: #undef in tommath_superclass.h; this
28				 * would require other than mp_reduce */
29
30#ifdef LTM_FAST
31
32/* Use faster div at the cost of about 1 kB */
33#define BN_MP_MUL_D_C
34
35/* Include faster exptmod (Montgomery) at the cost of about 2.5 kB in code */
36#define BN_MP_EXPTMOD_FAST_C
37#define BN_MP_MONTGOMERY_SETUP_C
38#define BN_FAST_MP_MONTGOMERY_REDUCE_C
39#define BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
40#define BN_MP_MUL_2_C
41
42/* Include faster sqr at the cost of about 0.5 kB in code */
43#define BN_FAST_S_MP_SQR_C
44
45/* About 0.25 kB of code, but ~1.7kB of stack space! */
46#define BN_FAST_S_MP_MUL_DIGS_C
47
48#else /* LTM_FAST */
49
50#define BN_MP_DIV_SMALL
51#define BN_MP_INIT_MULTI_C
52#define BN_MP_CLEAR_MULTI_C
53#define BN_MP_ABS_C
54#endif /* LTM_FAST */
55
56/* Current uses do not require support for negative exponent in exptmod, so we
57 * can save about 1.5 kB in leaving out invmod. */
58#define LTM_NO_NEG_EXP
59
60/* from tommath.h */
61
62#ifndef MIN
63   #define MIN(x,y) ((x)<(y)?(x):(y))
64#endif
65
66#ifndef MAX
67   #define MAX(x,y) ((x)>(y)?(x):(y))
68#endif
69
70#define  OPT_CAST(x)
71
72#ifdef __x86_64__
73typedef unsigned long mp_digit;
74typedef unsigned long mp_word __attribute__((mode(TI)));
75
76#define DIGIT_BIT 60
77#define MP_64BIT
78#else
79typedef unsigned long mp_digit;
80typedef u64 mp_word;
81
82#define DIGIT_BIT          28
83#define MP_28BIT
84#endif
85
86
87#define XMALLOC  os_malloc
88#define XFREE    os_free
89#define XREALLOC os_realloc
90
91
92#define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
93
94#define MP_LT        -1   /* less than */
95#define MP_EQ         0   /* equal to */
96#define MP_GT         1   /* greater than */
97
98#define MP_ZPOS       0   /* positive integer */
99#define MP_NEG        1   /* negative */
100
101#define MP_OKAY       0   /* ok result */
102#define MP_MEM        -2  /* out of mem */
103#define MP_VAL        -3  /* invalid input */
104
105#define MP_YES        1   /* yes response */
106#define MP_NO         0   /* no response */
107
108typedef int           mp_err;
109
110/* define this to use lower memory usage routines (exptmods mostly) */
111#define MP_LOW_MEM
112
113/* default precision */
114#ifndef MP_PREC
115   #ifndef MP_LOW_MEM
116      #define MP_PREC                 32     /* default digits of precision */
117   #else
118      #define MP_PREC                 8      /* default digits of precision */
119   #endif
120#endif
121
122/* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
123#define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1))
124
125/* the infamous mp_int structure */
126typedef struct  {
127    int used, alloc, sign;
128    mp_digit *dp;
129} mp_int;
130
131
132/* ---> Basic Manipulations <--- */
133#define mp_iszero(a) (((a)->used == 0) ? MP_YES : MP_NO)
134#define mp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? MP_YES : MP_NO)
135#define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? MP_YES : MP_NO)
136
137
138/* prototypes for copied functions */
139#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
140static int s_mp_exptmod(mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
141static int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);
142static int s_mp_sqr(mp_int * a, mp_int * b);
143static int s_mp_mul_high_digs(mp_int * a, mp_int * b, mp_int * c, int digs);
144
145#ifdef BN_FAST_S_MP_MUL_DIGS_C
146static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs);
147#endif
148
149#ifdef BN_MP_INIT_MULTI_C
150static int mp_init_multi(mp_int *mp, ...);
151#endif
152#ifdef BN_MP_CLEAR_MULTI_C
153static void mp_clear_multi(mp_int *mp, ...);
154#endif
155static int mp_lshd(mp_int * a, int b);
156static void mp_set(mp_int * a, mp_digit b);
157static void mp_clamp(mp_int * a);
158static void mp_exch(mp_int * a, mp_int * b);
159static void mp_rshd(mp_int * a, int b);
160static void mp_zero(mp_int * a);
161static int mp_mod_2d(mp_int * a, int b, mp_int * c);
162static int mp_div_2d(mp_int * a, int b, mp_int * c, mp_int * d);
163static int mp_init_copy(mp_int * a, mp_int * b);
164static int mp_mul_2d(mp_int * a, int b, mp_int * c);
165#ifndef LTM_NO_NEG_EXP
166static int mp_div_2(mp_int * a, mp_int * b);
167static int mp_invmod(mp_int * a, mp_int * b, mp_int * c);
168static int mp_invmod_slow(mp_int * a, mp_int * b, mp_int * c);
169#endif /* LTM_NO_NEG_EXP */
170static int mp_copy(mp_int * a, mp_int * b);
171static int mp_count_bits(mp_int * a);
172static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d);
173static int mp_mod(mp_int * a, mp_int * b, mp_int * c);
174static int mp_grow(mp_int * a, int size);
175static int mp_cmp_mag(mp_int * a, mp_int * b);
176#ifdef BN_MP_ABS_C
177static int mp_abs(mp_int * a, mp_int * b);
178#endif
179static int mp_sqr(mp_int * a, mp_int * b);
180static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
181static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
182static int mp_2expt(mp_int * a, int b);
183static int mp_reduce_setup(mp_int * a, mp_int * b);
184static int mp_reduce(mp_int * x, mp_int * m, mp_int * mu);
185static int mp_init_size(mp_int * a, int size);
186#ifdef BN_MP_EXPTMOD_FAST_C
187static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
188#endif /* BN_MP_EXPTMOD_FAST_C */
189#ifdef BN_FAST_S_MP_SQR_C
190static int fast_s_mp_sqr (mp_int * a, mp_int * b);
191#endif /* BN_FAST_S_MP_SQR_C */
192#ifdef BN_MP_MUL_D_C
193static int mp_mul_d (mp_int * a, mp_digit b, mp_int * c);
194#endif /* BN_MP_MUL_D_C */
195
196
197
198/* functions from bn_<func name>.c */
199
200
201/* reverse an array, used for radix code */
202static void bn_reverse (unsigned char *s, int len)
203{
204  int     ix, iy;
205  unsigned char t;
206
207  ix = 0;
208  iy = len - 1;
209  while (ix < iy) {
210    t     = s[ix];
211    s[ix] = s[iy];
212    s[iy] = t;
213    ++ix;
214    --iy;
215  }
216}
217
218
219/* low level addition, based on HAC pp.594, Algorithm 14.7 */
220static int s_mp_add (mp_int * a, mp_int * b, mp_int * c)
221{
222  mp_int *x;
223  int     olduse, res, min, max;
224
225  /* find sizes, we let |a| <= |b| which means we have to sort
226   * them.  "x" will point to the input with the most digits
227   */
228  if (a->used > b->used) {
229    min = b->used;
230    max = a->used;
231    x = a;
232  } else {
233    min = a->used;
234    max = b->used;
235    x = b;
236  }
237
238  /* init result */
239  if (c->alloc < max + 1) {
240    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
241      return res;
242    }
243  }
244
245  /* get old used digit count and set new one */
246  olduse = c->used;
247  c->used = max + 1;
248
249  {
250    register mp_digit u, *tmpa, *tmpb, *tmpc;
251    register int i;
252
253    /* alias for digit pointers */
254
255    /* first input */
256    tmpa = a->dp;
257
258    /* second input */
259    tmpb = b->dp;
260
261    /* destination */
262    tmpc = c->dp;
263
264    /* zero the carry */
265    u = 0;
266    for (i = 0; i < min; i++) {
267      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
268      *tmpc = *tmpa++ + *tmpb++ + u;
269
270      /* U = carry bit of T[i] */
271      u = *tmpc >> ((mp_digit)DIGIT_BIT);
272
273      /* take away carry bit from T[i] */
274      *tmpc++ &= MP_MASK;
275    }
276
277    /* now copy higher words if any, that is in A+B
278     * if A or B has more digits add those in
279     */
280    if (min != max) {
281      for (; i < max; i++) {
282        /* T[i] = X[i] + U */
283        *tmpc = x->dp[i] + u;
284
285        /* U = carry bit of T[i] */
286        u = *tmpc >> ((mp_digit)DIGIT_BIT);
287
288        /* take away carry bit from T[i] */
289        *tmpc++ &= MP_MASK;
290      }
291    }
292
293    /* add carry */
294    *tmpc++ = u;
295
296    /* clear digits above oldused */
297    for (i = c->used; i < olduse; i++) {
298      *tmpc++ = 0;
299    }
300  }
301
302  mp_clamp (c);
303  return MP_OKAY;
304}
305
306
307/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
308static int s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
309{
310  int     olduse, res, min, max;
311
312  /* find sizes */
313  min = b->used;
314  max = a->used;
315
316  /* init result */
317  if (c->alloc < max) {
318    if ((res = mp_grow (c, max)) != MP_OKAY) {
319      return res;
320    }
321  }
322  olduse = c->used;
323  c->used = max;
324
325  {
326    register mp_digit u, *tmpa, *tmpb, *tmpc;
327    register int i;
328
329    /* alias for digit pointers */
330    tmpa = a->dp;
331    tmpb = b->dp;
332    tmpc = c->dp;
333
334    /* set carry to zero */
335    u = 0;
336    for (i = 0; i < min; i++) {
337      /* T[i] = A[i] - B[i] - U */
338      *tmpc = *tmpa++ - *tmpb++ - u;
339
340      /* U = carry bit of T[i]
341       * Note this saves performing an AND operation since
342       * if a carry does occur it will propagate all the way to the
343       * MSB.  As a result a single shift is enough to get the carry
344       */
345      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
346
347      /* Clear carry from T[i] */
348      *tmpc++ &= MP_MASK;
349    }
350
351    /* now copy higher words if any, e.g. if A has more digits than B  */
352    for (; i < max; i++) {
353      /* T[i] = A[i] - U */
354      *tmpc = *tmpa++ - u;
355
356      /* U = carry bit of T[i] */
357      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
358
359      /* Clear carry from T[i] */
360      *tmpc++ &= MP_MASK;
361    }
362
363    /* clear digits above used (since we may not have grown result above) */
364    for (i = c->used; i < olduse; i++) {
365      *tmpc++ = 0;
366    }
367  }
368
369  mp_clamp (c);
370  return MP_OKAY;
371}
372
373
374/* init a new mp_int */
375static int mp_init (mp_int * a)
376{
377  int i;
378
379  /* allocate memory required and clear it */
380  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
381  if (a->dp == NULL) {
382    return MP_MEM;
383  }
384
385  /* set the digits to zero */
386  for (i = 0; i < MP_PREC; i++) {
387      a->dp[i] = 0;
388  }
389
390  /* set the used to zero, allocated digits to the default precision
391   * and sign to positive */
392  a->used  = 0;
393  a->alloc = MP_PREC;
394  a->sign  = MP_ZPOS;
395
396  return MP_OKAY;
397}
398
399
400/* clear one (frees)  */
401static void mp_clear (mp_int * a)
402{
403  int i;
404
405  /* only do anything if a hasn't been freed previously */
406  if (a->dp != NULL) {
407    /* first zero the digits */
408    for (i = 0; i < a->used; i++) {
409        a->dp[i] = 0;
410    }
411
412    /* free ram */
413    XFREE(a->dp);
414
415    /* reset members to make debugging easier */
416    a->dp    = NULL;
417    a->alloc = a->used = 0;
418    a->sign  = MP_ZPOS;
419  }
420}
421
422
423/* high level addition (handles signs) */
424static int mp_add (mp_int * a, mp_int * b, mp_int * c)
425{
426  int     sa, sb, res;
427
428  /* get sign of both inputs */
429  sa = a->sign;
430  sb = b->sign;
431
432  /* handle two cases, not four */
433  if (sa == sb) {
434    /* both positive or both negative */
435    /* add their magnitudes, copy the sign */
436    c->sign = sa;
437    res = s_mp_add (a, b, c);
438  } else {
439    /* one positive, the other negative */
440    /* subtract the one with the greater magnitude from */
441    /* the one of the lesser magnitude.  The result gets */
442    /* the sign of the one with the greater magnitude. */
443    if (mp_cmp_mag (a, b) == MP_LT) {
444      c->sign = sb;
445      res = s_mp_sub (b, a, c);
446    } else {
447      c->sign = sa;
448      res = s_mp_sub (a, b, c);
449    }
450  }
451  return res;
452}
453
454
455/* high level subtraction (handles signs) */
456static int mp_sub (mp_int * a, mp_int * b, mp_int * c)
457{
458  int     sa, sb, res;
459
460  sa = a->sign;
461  sb = b->sign;
462
463  if (sa != sb) {
464    /* subtract a negative from a positive, OR */
465    /* subtract a positive from a negative. */
466    /* In either case, ADD their magnitudes, */
467    /* and use the sign of the first number. */
468    c->sign = sa;
469    res = s_mp_add (a, b, c);
470  } else {
471    /* subtract a positive from a positive, OR */
472    /* subtract a negative from a negative. */
473    /* First, take the difference between their */
474    /* magnitudes, then... */
475    if (mp_cmp_mag (a, b) != MP_LT) {
476      /* Copy the sign from the first */
477      c->sign = sa;
478      /* The first has a larger or equal magnitude */
479      res = s_mp_sub (a, b, c);
480    } else {
481      /* The result has the *opposite* sign from */
482      /* the first number. */
483      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
484      /* The second has a larger magnitude */
485      res = s_mp_sub (b, a, c);
486    }
487  }
488  return res;
489}
490
491
492/* high level multiplication (handles sign) */
493static int mp_mul (mp_int * a, mp_int * b, mp_int * c)
494{
495  int     res, neg;
496  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
497
498  /* use Toom-Cook? */
499#ifdef BN_MP_TOOM_MUL_C
500  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
501    res = mp_toom_mul(a, b, c);
502  } else
503#endif
504#ifdef BN_MP_KARATSUBA_MUL_C
505  /* use Karatsuba? */
506  if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
507    res = mp_karatsuba_mul (a, b, c);
508  } else
509#endif
510  {
511    /* can we use the fast multiplier?
512     *
513     * The fast multiplier can be used if the output will
514     * have less than MP_WARRAY digits and the number of
515     * digits won't affect carry propagation
516     */
517#ifdef BN_FAST_S_MP_MUL_DIGS_C
518    int     digs = a->used + b->used + 1;
519
520    if ((digs < MP_WARRAY) &&
521        MIN(a->used, b->used) <=
522        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
523      res = fast_s_mp_mul_digs (a, b, c, digs);
524    } else
525#endif
526#ifdef BN_S_MP_MUL_DIGS_C
527      res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
528#else
529#error mp_mul could fail
530      res = MP_VAL;
531#endif
532
533  }
534  c->sign = (c->used > 0) ? neg : MP_ZPOS;
535  return res;
536}
537
538
539/* d = a * b (mod c) */
540static int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
541{
542  int     res;
543  mp_int  t;
544
545  if ((res = mp_init (&t)) != MP_OKAY) {
546    return res;
547  }
548
549  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
550    mp_clear (&t);
551    return res;
552  }
553  res = mp_mod (&t, c, d);
554  mp_clear (&t);
555  return res;
556}
557
558
559/* c = a mod b, 0 <= c < b */
560static int mp_mod (mp_int * a, mp_int * b, mp_int * c)
561{
562  mp_int  t;
563  int     res;
564
565  if ((res = mp_init (&t)) != MP_OKAY) {
566    return res;
567  }
568
569  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
570    mp_clear (&t);
571    return res;
572  }
573
574  if (t.sign != b->sign) {
575    res = mp_add (b, &t, c);
576  } else {
577    res = MP_OKAY;
578    mp_exch (&t, c);
579  }
580
581  mp_clear (&t);
582  return res;
583}
584
585
586/* this is a shell function that calls either the normal or Montgomery
587 * exptmod functions.  Originally the call to the montgomery code was
588 * embedded in the normal function but that wasted a lot of stack space
589 * for nothing (since 99% of the time the Montgomery code would be called)
590 */
591static int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
592{
593  int dr;
594
595  /* modulus P must be positive */
596  if (P->sign == MP_NEG) {
597     return MP_VAL;
598  }
599
600  /* if exponent X is negative we have to recurse */
601  if (X->sign == MP_NEG) {
602#ifdef LTM_NO_NEG_EXP
603        return MP_VAL;
604#else /* LTM_NO_NEG_EXP */
605#ifdef BN_MP_INVMOD_C
606     mp_int tmpG, tmpX;
607     int err;
608
609     /* first compute 1/G mod P */
610     if ((err = mp_init(&tmpG)) != MP_OKAY) {
611        return err;
612     }
613     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
614        mp_clear(&tmpG);
615        return err;
616     }
617
618     /* now get |X| */
619     if ((err = mp_init(&tmpX)) != MP_OKAY) {
620        mp_clear(&tmpG);
621        return err;
622     }
623     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
624        mp_clear_multi(&tmpG, &tmpX, NULL);
625        return err;
626     }
627
628     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
629     err = mp_exptmod(&tmpG, &tmpX, P, Y);
630     mp_clear_multi(&tmpG, &tmpX, NULL);
631     return err;
632#else
633#error mp_exptmod would always fail
634     /* no invmod */
635     return MP_VAL;
636#endif
637#endif /* LTM_NO_NEG_EXP */
638  }
639
640/* modified diminished radix reduction */
641#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
642  if (mp_reduce_is_2k_l(P) == MP_YES) {
643     return s_mp_exptmod(G, X, P, Y, 1);
644  }
645#endif
646
647#ifdef BN_MP_DR_IS_MODULUS_C
648  /* is it a DR modulus? */
649  dr = mp_dr_is_modulus(P);
650#else
651  /* default to no */
652  dr = 0;
653#endif
654
655#ifdef BN_MP_REDUCE_IS_2K_C
656  /* if not, is it a unrestricted DR modulus? */
657  if (dr == 0) {
658     dr = mp_reduce_is_2k(P) << 1;
659  }
660#endif
661
662  /* if the modulus is odd or dr != 0 use the montgomery method */
663#ifdef BN_MP_EXPTMOD_FAST_C
664  if (mp_isodd (P) == 1 || dr !=  0) {
665    return mp_exptmod_fast (G, X, P, Y, dr);
666  } else {
667#endif
668#ifdef BN_S_MP_EXPTMOD_C
669    /* otherwise use the generic Barrett reduction technique */
670    return s_mp_exptmod (G, X, P, Y, 0);
671#else
672#error mp_exptmod could fail
673    /* no exptmod for evens */
674    return MP_VAL;
675#endif
676#ifdef BN_MP_EXPTMOD_FAST_C
677  }
678#endif
679  if (dr == 0) {
680    /* avoid compiler warnings about possibly unused variable */
681  }
682}
683
684
685/* compare two ints (signed)*/
686static int mp_cmp (mp_int * a, mp_int * b)
687{
688  /* compare based on sign */
689  if (a->sign != b->sign) {
690     if (a->sign == MP_NEG) {
691        return MP_LT;
692     } else {
693        return MP_GT;
694     }
695  }
696
697  /* compare digits */
698  if (a->sign == MP_NEG) {
699     /* if negative compare opposite direction */
700     return mp_cmp_mag(b, a);
701  } else {
702     return mp_cmp_mag(a, b);
703  }
704}
705
706
707/* compare a digit */
708static int mp_cmp_d(mp_int * a, mp_digit b)
709{
710  /* compare based on sign */
711  if (a->sign == MP_NEG) {
712    return MP_LT;
713  }
714
715  /* compare based on magnitude */
716  if (a->used > 1) {
717    return MP_GT;
718  }
719
720  /* compare the only digit of a to b */
721  if (a->dp[0] > b) {
722    return MP_GT;
723  } else if (a->dp[0] < b) {
724    return MP_LT;
725  } else {
726    return MP_EQ;
727  }
728}
729
730
731#ifndef LTM_NO_NEG_EXP
732/* hac 14.61, pp608 */
733static int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
734{
735  /* b cannot be negative */
736  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
737    return MP_VAL;
738  }
739
740#ifdef BN_FAST_MP_INVMOD_C
741  /* if the modulus is odd we can use a faster routine instead */
742  if (mp_isodd (b) == 1) {
743    return fast_mp_invmod (a, b, c);
744  }
745#endif
746
747#ifdef BN_MP_INVMOD_SLOW_C
748  return mp_invmod_slow(a, b, c);
749#endif
750
751#ifndef BN_FAST_MP_INVMOD_C
752#ifndef BN_MP_INVMOD_SLOW_C
753#error mp_invmod would always fail
754#endif
755#endif
756  return MP_VAL;
757}
758#endif /* LTM_NO_NEG_EXP */
759
760
761/* get the size for an unsigned equivalent */
762static int mp_unsigned_bin_size (mp_int * a)
763{
764  int     size = mp_count_bits (a);
765  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
766}
767
768
769#ifndef LTM_NO_NEG_EXP
770/* hac 14.61, pp608 */
771static int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
772{
773  mp_int  x, y, u, v, A, B, C, D;
774  int     res;
775
776  /* b cannot be negative */
777  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
778    return MP_VAL;
779  }
780
781  /* init temps */
782  if ((res = mp_init_multi(&x, &y, &u, &v,
783                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
784     return res;
785  }
786
787  /* x = a, y = b */
788  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
789      goto LBL_ERR;
790  }
791  if ((res = mp_copy (b, &y)) != MP_OKAY) {
792    goto LBL_ERR;
793  }
794
795  /* 2. [modified] if x,y are both even then return an error! */
796  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
797    res = MP_VAL;
798    goto LBL_ERR;
799  }
800
801  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
802  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
803    goto LBL_ERR;
804  }
805  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
806    goto LBL_ERR;
807  }
808  mp_set (&A, 1);
809  mp_set (&D, 1);
810
811top:
812  /* 4.  while u is even do */
813  while (mp_iseven (&u) == 1) {
814    /* 4.1 u = u/2 */
815    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
816      goto LBL_ERR;
817    }
818    /* 4.2 if A or B is odd then */
819    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
820      /* A = (A+y)/2, B = (B-x)/2 */
821      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
822         goto LBL_ERR;
823      }
824      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
825         goto LBL_ERR;
826      }
827    }
828    /* A = A/2, B = B/2 */
829    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
830      goto LBL_ERR;
831    }
832    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
833      goto LBL_ERR;
834    }
835  }
836
837  /* 5.  while v is even do */
838  while (mp_iseven (&v) == 1) {
839    /* 5.1 v = v/2 */
840    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
841      goto LBL_ERR;
842    }
843    /* 5.2 if C or D is odd then */
844    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
845      /* C = (C+y)/2, D = (D-x)/2 */
846      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
847         goto LBL_ERR;
848      }
849      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
850         goto LBL_ERR;
851      }
852    }
853    /* C = C/2, D = D/2 */
854    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
855      goto LBL_ERR;
856    }
857    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
858      goto LBL_ERR;
859    }
860  }
861
862  /* 6.  if u >= v then */
863  if (mp_cmp (&u, &v) != MP_LT) {
864    /* u = u - v, A = A - C, B = B - D */
865    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
866      goto LBL_ERR;
867    }
868
869    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
870      goto LBL_ERR;
871    }
872
873    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
874      goto LBL_ERR;
875    }
876  } else {
877    /* v - v - u, C = C - A, D = D - B */
878    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
879      goto LBL_ERR;
880    }
881
882    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
883      goto LBL_ERR;
884    }
885
886    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
887      goto LBL_ERR;
888    }
889  }
890
891  /* if not zero goto step 4 */
892  if (mp_iszero (&u) == 0)
893    goto top;
894
895  /* now a = C, b = D, gcd == g*v */
896
897  /* if v != 1 then there is no inverse */
898  if (mp_cmp_d (&v, 1) != MP_EQ) {
899    res = MP_VAL;
900    goto LBL_ERR;
901  }
902
903  /* if its too low */
904  while (mp_cmp_d(&C, 0) == MP_LT) {
905      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
906         goto LBL_ERR;
907      }
908  }
909
910  /* too big */
911  while (mp_cmp_mag(&C, b) != MP_LT) {
912      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
913         goto LBL_ERR;
914      }
915  }
916
917  /* C is now the inverse */
918  mp_exch (&C, c);
919  res = MP_OKAY;
920LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
921  return res;
922}
923#endif /* LTM_NO_NEG_EXP */
924
925
926/* compare maginitude of two ints (unsigned) */
927static int mp_cmp_mag (mp_int * a, mp_int * b)
928{
929  int     n;
930  mp_digit *tmpa, *tmpb;
931
932  /* compare based on # of non-zero digits */
933  if (a->used > b->used) {
934    return MP_GT;
935  }
936
937  if (a->used < b->used) {
938    return MP_LT;
939  }
940
941  /* alias for a */
942  tmpa = a->dp + (a->used - 1);
943
944  /* alias for b */
945  tmpb = b->dp + (a->used - 1);
946
947  /* compare based on digits  */
948  for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
949    if (*tmpa > *tmpb) {
950      return MP_GT;
951    }
952
953    if (*tmpa < *tmpb) {
954      return MP_LT;
955    }
956  }
957  return MP_EQ;
958}
959
960
961/* reads a unsigned char array, assumes the msb is stored first [big endian] */
962static int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
963{
964  int     res;
965
966  /* make sure there are at least two digits */
967  if (a->alloc < 2) {
968     if ((res = mp_grow(a, 2)) != MP_OKAY) {
969        return res;
970     }
971  }
972
973  /* zero the int */
974  mp_zero (a);
975
976  /* read the bytes in */
977  while (c-- > 0) {
978    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
979      return res;
980    }
981
982#ifndef MP_8BIT
983      a->dp[0] |= *b++;
984      a->used += 1;
985#else
986      a->dp[0] = (*b & MP_MASK);
987      a->dp[1] |= ((*b++ >> 7U) & 1);
988      a->used += 2;
989#endif
990  }
991  mp_clamp (a);
992  return MP_OKAY;
993}
994
995
996/* store in unsigned [big endian] format */
997static int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
998{
999  int     x, res;
1000  mp_int  t;
1001
1002  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
1003    return res;
1004  }
1005
1006  x = 0;
1007  while (mp_iszero (&t) == 0) {
1008#ifndef MP_8BIT
1009      b[x++] = (unsigned char) (t.dp[0] & 255);
1010#else
1011      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
1012#endif
1013    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
1014      mp_clear (&t);
1015      return res;
1016    }
1017  }
1018  bn_reverse (b, x);
1019  mp_clear (&t);
1020  return MP_OKAY;
1021}
1022
1023
1024/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
1025static int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
1026{
1027  mp_digit D, r, rr;
1028  int     x, res;
1029  mp_int  t;
1030
1031
1032  /* if the shift count is <= 0 then we do no work */
1033  if (b <= 0) {
1034    res = mp_copy (a, c);
1035    if (d != NULL) {
1036      mp_zero (d);
1037    }
1038    return res;
1039  }
1040
1041  if ((res = mp_init (&t)) != MP_OKAY) {
1042    return res;
1043  }
1044
1045  /* get the remainder */
1046  if (d != NULL) {
1047    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
1048      mp_clear (&t);
1049      return res;
1050    }
1051  }
1052
1053  /* copy */
1054  if ((res = mp_copy (a, c)) != MP_OKAY) {
1055    mp_clear (&t);
1056    return res;
1057  }
1058
1059  /* shift by as many digits in the bit count */
1060  if (b >= (int)DIGIT_BIT) {
1061    mp_rshd (c, b / DIGIT_BIT);
1062  }
1063
1064  /* shift any bit count < DIGIT_BIT */
1065  D = (mp_digit) (b % DIGIT_BIT);
1066  if (D != 0) {
1067    register mp_digit *tmpc, mask, shift;
1068
1069    /* mask */
1070    mask = (((mp_digit)1) << D) - 1;
1071
1072    /* shift for lsb */
1073    shift = DIGIT_BIT - D;
1074
1075    /* alias */
1076    tmpc = c->dp + (c->used - 1);
1077
1078    /* carry */
1079    r = 0;
1080    for (x = c->used - 1; x >= 0; x--) {
1081      /* get the lower  bits of this word in a temp */
1082      rr = *tmpc & mask;
1083
1084      /* shift the current word and mix in the carry bits from the previous word */
1085      *tmpc = (*tmpc >> D) | (r << shift);
1086      --tmpc;
1087
1088      /* set the carry to the carry bits of the current word found above */
1089      r = rr;
1090    }
1091  }
1092  mp_clamp (c);
1093  if (d != NULL) {
1094    mp_exch (&t, d);
1095  }
1096  mp_clear (&t);
1097  return MP_OKAY;
1098}
1099
1100
1101static int mp_init_copy (mp_int * a, mp_int * b)
1102{
1103  int     res;
1104
1105  if ((res = mp_init (a)) != MP_OKAY) {
1106    return res;
1107  }
1108  return mp_copy (b, a);
1109}
1110
1111
1112/* set to zero */
1113static void mp_zero (mp_int * a)
1114{
1115  int       n;
1116  mp_digit *tmp;
1117
1118  a->sign = MP_ZPOS;
1119  a->used = 0;
1120
1121  tmp = a->dp;
1122  for (n = 0; n < a->alloc; n++) {
1123     *tmp++ = 0;
1124  }
1125}
1126
1127
1128/* copy, b = a */
1129static int mp_copy (mp_int * a, mp_int * b)
1130{
1131  int     res, n;
1132
1133  /* if dst == src do nothing */
1134  if (a == b) {
1135    return MP_OKAY;
1136  }
1137
1138  /* grow dest */
1139  if (b->alloc < a->used) {
1140     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1141        return res;
1142     }
1143  }
1144
1145  /* zero b and copy the parameters over */
1146  {
1147    register mp_digit *tmpa, *tmpb;
1148
1149    /* pointer aliases */
1150
1151    /* source */
1152    tmpa = a->dp;
1153
1154    /* destination */
1155    tmpb = b->dp;
1156
1157    /* copy all the digits */
1158    for (n = 0; n < a->used; n++) {
1159      *tmpb++ = *tmpa++;
1160    }
1161
1162    /* clear high digits */
1163    for (; n < b->used; n++) {
1164      *tmpb++ = 0;
1165    }
1166  }
1167
1168  /* copy used count and sign */
1169  b->used = a->used;
1170  b->sign = a->sign;
1171  return MP_OKAY;
1172}
1173
1174
1175/* shift right a certain amount of digits */
1176static void mp_rshd (mp_int * a, int b)
1177{
1178  int     x;
1179
1180  /* if b <= 0 then ignore it */
1181  if (b <= 0) {
1182    return;
1183  }
1184
1185  /* if b > used then simply zero it and return */
1186  if (a->used <= b) {
1187    mp_zero (a);
1188    return;
1189  }
1190
1191  {
1192    register mp_digit *bottom, *top;
1193
1194    /* shift the digits down */
1195
1196    /* bottom */
1197    bottom = a->dp;
1198
1199    /* top [offset into digits] */
1200    top = a->dp + b;
1201
1202    /* this is implemented as a sliding window where
1203     * the window is b-digits long and digits from
1204     * the top of the window are copied to the bottom
1205     *
1206     * e.g.
1207
1208     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
1209                 /\                   |      ---->
1210                  \-------------------/      ---->
1211     */
1212    for (x = 0; x < (a->used - b); x++) {
1213      *bottom++ = *top++;
1214    }
1215
1216    /* zero the top digits */
1217    for (; x < a->used; x++) {
1218      *bottom++ = 0;
1219    }
1220  }
1221
1222  /* remove excess digits */
1223  a->used -= b;
1224}
1225
1226
1227/* swap the elements of two integers, for cases where you can't simply swap the
1228 * mp_int pointers around
1229 */
1230static void mp_exch (mp_int * a, mp_int * b)
1231{
1232  mp_int  t;
1233
1234  t  = *a;
1235  *a = *b;
1236  *b = t;
1237}
1238
1239
1240/* trim unused digits
1241 *
1242 * This is used to ensure that leading zero digits are
1243 * trimed and the leading "used" digit will be non-zero
1244 * Typically very fast.  Also fixes the sign if there
1245 * are no more leading digits
1246 */
1247static void mp_clamp (mp_int * a)
1248{
1249  /* decrease used while the most significant digit is
1250   * zero.
1251   */
1252  while (a->used > 0 && a->dp[a->used - 1] == 0) {
1253    --(a->used);
1254  }
1255
1256  /* reset the sign flag if used == 0 */
1257  if (a->used == 0) {
1258    a->sign = MP_ZPOS;
1259  }
1260}
1261
1262
1263/* grow as required */
1264static int mp_grow (mp_int * a, int size)
1265{
1266  int     i;
1267  mp_digit *tmp;
1268
1269  /* if the alloc size is smaller alloc more ram */
1270  if (a->alloc < size) {
1271    /* ensure there are always at least MP_PREC digits extra on top */
1272    size += (MP_PREC * 2) - (size % MP_PREC);
1273
1274    /* reallocate the array a->dp
1275     *
1276     * We store the return in a temporary variable
1277     * in case the operation failed we don't want
1278     * to overwrite the dp member of a.
1279     */
1280    tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
1281    if (tmp == NULL) {
1282      /* reallocation failed but "a" is still valid [can be freed] */
1283      return MP_MEM;
1284    }
1285
1286    /* reallocation succeeded so set a->dp */
1287    a->dp = tmp;
1288
1289    /* zero excess digits */
1290    i        = a->alloc;
1291    a->alloc = size;
1292    for (; i < a->alloc; i++) {
1293      a->dp[i] = 0;
1294    }
1295  }
1296  return MP_OKAY;
1297}
1298
1299
1300#ifdef BN_MP_ABS_C
1301/* b = |a|
1302 *
1303 * Simple function copies the input and fixes the sign to positive
1304 */
1305static int mp_abs (mp_int * a, mp_int * b)
1306{
1307  int     res;
1308
1309  /* copy a to b */
1310  if (a != b) {
1311     if ((res = mp_copy (a, b)) != MP_OKAY) {
1312       return res;
1313     }
1314  }
1315
1316  /* force the sign of b to positive */
1317  b->sign = MP_ZPOS;
1318
1319  return MP_OKAY;
1320}
1321#endif
1322
1323
1324/* set to a digit */
1325static void mp_set (mp_int * a, mp_digit b)
1326{
1327  mp_zero (a);
1328  a->dp[0] = b & MP_MASK;
1329  a->used  = (a->dp[0] != 0) ? 1 : 0;
1330}
1331
1332
1333#ifndef LTM_NO_NEG_EXP
1334/* b = a/2 */
1335static int mp_div_2(mp_int * a, mp_int * b)
1336{
1337  int     x, res, oldused;
1338
1339  /* copy */
1340  if (b->alloc < a->used) {
1341    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1342      return res;
1343    }
1344  }
1345
1346  oldused = b->used;
1347  b->used = a->used;
1348  {
1349    register mp_digit r, rr, *tmpa, *tmpb;
1350
1351    /* source alias */
1352    tmpa = a->dp + b->used - 1;
1353
1354    /* dest alias */
1355    tmpb = b->dp + b->used - 1;
1356
1357    /* carry */
1358    r = 0;
1359    for (x = b->used - 1; x >= 0; x--) {
1360      /* get the carry for the next iteration */
1361      rr = *tmpa & 1;
1362
1363      /* shift the current digit, add in carry and store */
1364      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1365
1366      /* forward carry to next iteration */
1367      r = rr;
1368    }
1369
1370    /* zero excess digits */
1371    tmpb = b->dp + b->used;
1372    for (x = b->used; x < oldused; x++) {
1373      *tmpb++ = 0;
1374    }
1375  }
1376  b->sign = a->sign;
1377  mp_clamp (b);
1378  return MP_OKAY;
1379}
1380#endif /* LTM_NO_NEG_EXP */
1381
1382
1383/* shift left by a certain bit count */
1384static int mp_mul_2d (mp_int * a, int b, mp_int * c)
1385{
1386  mp_digit d;
1387  int      res;
1388
1389  /* copy */
1390  if (a != c) {
1391     if ((res = mp_copy (a, c)) != MP_OKAY) {
1392       return res;
1393     }
1394  }
1395
1396  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
1397     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
1398       return res;
1399     }
1400  }
1401
1402  /* shift by as many digits in the bit count */
1403  if (b >= (int)DIGIT_BIT) {
1404    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
1405      return res;
1406    }
1407  }
1408
1409  /* shift any bit count < DIGIT_BIT */
1410  d = (mp_digit) (b % DIGIT_BIT);
1411  if (d != 0) {
1412    register mp_digit *tmpc, shift, mask, r, rr;
1413    register int x;
1414
1415    /* bitmask for carries */
1416    mask = (((mp_digit)1) << d) - 1;
1417
1418    /* shift for msbs */
1419    shift = DIGIT_BIT - d;
1420
1421    /* alias */
1422    tmpc = c->dp;
1423
1424    /* carry */
1425    r    = 0;
1426    for (x = 0; x < c->used; x++) {
1427      /* get the higher bits of the current word */
1428      rr = (*tmpc >> shift) & mask;
1429
1430      /* shift the current word and OR in the carry */
1431      *tmpc = ((*tmpc << d) | r) & MP_MASK;
1432      ++tmpc;
1433
1434      /* set the carry to the carry bits of the current word */
1435      r = rr;
1436    }
1437
1438    /* set final carry */
1439    if (r != 0) {
1440       c->dp[(c->used)++] = r;
1441    }
1442  }
1443  mp_clamp (c);
1444  return MP_OKAY;
1445}
1446
1447
1448#ifdef BN_MP_INIT_MULTI_C
1449static int mp_init_multi(mp_int *mp, ...)
1450{
1451    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
1452    int n = 0;                 /* Number of ok inits */
1453    mp_int* cur_arg = mp;
1454    va_list args;
1455
1456    va_start(args, mp);        /* init args to next argument from caller */
1457    while (cur_arg != NULL) {
1458        if (mp_init(cur_arg) != MP_OKAY) {
1459            /* Oops - error! Back-track and mp_clear what we already
1460               succeeded in init-ing, then return error.
1461            */
1462            va_list clean_args;
1463
1464            /* end the current list */
1465            va_end(args);
1466
1467            /* now start cleaning up */
1468            cur_arg = mp;
1469            va_start(clean_args, mp);
1470            while (n--) {
1471                mp_clear(cur_arg);
1472                cur_arg = va_arg(clean_args, mp_int*);
1473            }
1474            va_end(clean_args);
1475            return MP_MEM;
1476        }
1477        n++;
1478        cur_arg = va_arg(args, mp_int*);
1479    }
1480    va_end(args);
1481    return res;                /* Assumed ok, if error flagged above. */
1482}
1483#endif
1484
1485
1486#ifdef BN_MP_CLEAR_MULTI_C
1487static void mp_clear_multi(mp_int *mp, ...)
1488{
1489    mp_int* next_mp = mp;
1490    va_list args;
1491    va_start(args, mp);
1492    while (next_mp != NULL) {
1493        mp_clear(next_mp);
1494        next_mp = va_arg(args, mp_int*);
1495    }
1496    va_end(args);
1497}
1498#endif
1499
1500
1501/* shift left a certain amount of digits */
1502static int mp_lshd (mp_int * a, int b)
1503{
1504  int     x, res;
1505
1506  /* if its less than zero return */
1507  if (b <= 0) {
1508    return MP_OKAY;
1509  }
1510
1511  /* grow to fit the new digits */
1512  if (a->alloc < a->used + b) {
1513     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
1514       return res;
1515     }
1516  }
1517
1518  {
1519    register mp_digit *top, *bottom;
1520
1521    /* increment the used by the shift amount then copy upwards */
1522    a->used += b;
1523
1524    /* top */
1525    top = a->dp + a->used - 1;
1526
1527    /* base */
1528    bottom = a->dp + a->used - 1 - b;
1529
1530    /* much like mp_rshd this is implemented using a sliding window
1531     * except the window goes the otherway around.  Copying from
1532     * the bottom to the top.  see bn_mp_rshd.c for more info.
1533     */
1534    for (x = a->used - 1; x >= b; x--) {
1535      *top-- = *bottom--;
1536    }
1537
1538    /* zero the lower digits */
1539    top = a->dp;
1540    for (x = 0; x < b; x++) {
1541      *top++ = 0;
1542    }
1543  }
1544  return MP_OKAY;
1545}
1546
1547
1548/* returns the number of bits in an int */
1549static int mp_count_bits (mp_int * a)
1550{
1551  int     r;
1552  mp_digit q;
1553
1554  /* shortcut */
1555  if (a->used == 0) {
1556    return 0;
1557  }
1558
1559  /* get number of digits and add that */
1560  r = (a->used - 1) * DIGIT_BIT;
1561
1562  /* take the last digit and count the bits in it */
1563  q = a->dp[a->used - 1];
1564  while (q > ((mp_digit) 0)) {
1565    ++r;
1566    q >>= ((mp_digit) 1);
1567  }
1568  return r;
1569}
1570
1571
1572/* calc a value mod 2**b */
1573static int mp_mod_2d (mp_int * a, int b, mp_int * c)
1574{
1575  int     x, res;
1576
1577  /* if b is <= 0 then zero the int */
1578  if (b <= 0) {
1579    mp_zero (c);
1580    return MP_OKAY;
1581  }
1582
1583  /* if the modulus is larger than the value than return */
1584  if (b >= (int) (a->used * DIGIT_BIT)) {
1585    res = mp_copy (a, c);
1586    return res;
1587  }
1588
1589  /* copy */
1590  if ((res = mp_copy (a, c)) != MP_OKAY) {
1591    return res;
1592  }
1593
1594  /* zero digits above the last digit of the modulus */
1595  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
1596    c->dp[x] = 0;
1597  }
1598  /* clear the digit that is not completely outside/inside the modulus */
1599  c->dp[b / DIGIT_BIT] &=
1600    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
1601  mp_clamp (c);
1602  return MP_OKAY;
1603}
1604
1605
1606#ifdef BN_MP_DIV_SMALL
1607
1608/* slower bit-bang division... also smaller */
1609static int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1610{
1611   mp_int ta, tb, tq, q;
1612   int    res, n, n2;
1613
1614  /* is divisor zero ? */
1615  if (mp_iszero (b) == 1) {
1616    return MP_VAL;
1617  }
1618
1619  /* if a < b then q=0, r = a */
1620  if (mp_cmp_mag (a, b) == MP_LT) {
1621    if (d != NULL) {
1622      res = mp_copy (a, d);
1623    } else {
1624      res = MP_OKAY;
1625    }
1626    if (c != NULL) {
1627      mp_zero (c);
1628    }
1629    return res;
1630  }
1631
1632  /* init our temps */
1633  if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL)) != MP_OKAY) {
1634     return res;
1635  }
1636
1637
1638  mp_set(&tq, 1);
1639  n = mp_count_bits(a) - mp_count_bits(b);
1640  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
1641      ((res = mp_abs(b, &tb)) != MP_OKAY) ||
1642      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
1643      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
1644      goto LBL_ERR;
1645  }
1646
1647  while (n-- >= 0) {
1648     if (mp_cmp(&tb, &ta) != MP_GT) {
1649        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
1650            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
1651           goto LBL_ERR;
1652        }
1653     }
1654     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
1655         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
1656           goto LBL_ERR;
1657     }
1658  }
1659
1660  /* now q == quotient and ta == remainder */
1661  n  = a->sign;
1662  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
1663  if (c != NULL) {
1664     mp_exch(c, &q);
1665     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
1666  }
1667  if (d != NULL) {
1668     mp_exch(d, &ta);
1669     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
1670  }
1671LBL_ERR:
1672   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
1673   return res;
1674}
1675
1676#else
1677
1678/* integer signed division.
1679 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1680 * HAC pp.598 Algorithm 14.20
1681 *
1682 * Note that the description in HAC is horribly
1683 * incomplete.  For example, it doesn't consider
1684 * the case where digits are removed from 'x' in
1685 * the inner loop.  It also doesn't consider the
1686 * case that y has fewer than three digits, etc..
1687 *
1688 * The overall algorithm is as described as
1689 * 14.20 from HAC but fixed to treat these cases.
1690*/
1691static int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1692{
1693  mp_int  q, x, y, t1, t2;
1694  int     res, n, t, i, norm, neg;
1695
1696  /* is divisor zero ? */
1697  if (mp_iszero (b) == 1) {
1698    return MP_VAL;
1699  }
1700
1701  /* if a < b then q=0, r = a */
1702  if (mp_cmp_mag (a, b) == MP_LT) {
1703    if (d != NULL) {
1704      res = mp_copy (a, d);
1705    } else {
1706      res = MP_OKAY;
1707    }
1708    if (c != NULL) {
1709      mp_zero (c);
1710    }
1711    return res;
1712  }
1713
1714  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1715    return res;
1716  }
1717  q.used = a->used + 2;
1718
1719  if ((res = mp_init (&t1)) != MP_OKAY) {
1720    goto LBL_Q;
1721  }
1722
1723  if ((res = mp_init (&t2)) != MP_OKAY) {
1724    goto LBL_T1;
1725  }
1726
1727  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1728    goto LBL_T2;
1729  }
1730
1731  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1732    goto LBL_X;
1733  }
1734
1735  /* fix the sign */
1736  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1737  x.sign = y.sign = MP_ZPOS;
1738
1739  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1740  norm = mp_count_bits(&y) % DIGIT_BIT;
1741  if (norm < (int)(DIGIT_BIT-1)) {
1742     norm = (DIGIT_BIT-1) - norm;
1743     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1744       goto LBL_Y;
1745     }
1746     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1747       goto LBL_Y;
1748     }
1749  } else {
1750     norm = 0;
1751  }
1752
1753  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1754  n = x.used - 1;
1755  t = y.used - 1;
1756
1757  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1758  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1759    goto LBL_Y;
1760  }
1761
1762  while (mp_cmp (&x, &y) != MP_LT) {
1763    ++(q.dp[n - t]);
1764    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1765      goto LBL_Y;
1766    }
1767  }
1768
1769  /* reset y by shifting it back down */
1770  mp_rshd (&y, n - t);
1771
1772  /* step 3. for i from n down to (t + 1) */
1773  for (i = n; i >= (t + 1); i--) {
1774    if (i > x.used) {
1775      continue;
1776    }
1777
1778    /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1779     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1780    if (x.dp[i] == y.dp[t]) {
1781      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1782    } else {
1783      mp_word tmp;
1784      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1785      tmp |= ((mp_word) x.dp[i - 1]);
1786      tmp /= ((mp_word) y.dp[t]);
1787      if (tmp > (mp_word) MP_MASK)
1788        tmp = MP_MASK;
1789      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1790    }
1791
1792    /* while (q{i-t-1} * (yt * b + y{t-1})) >
1793             xi * b**2 + xi-1 * b + xi-2
1794
1795       do q{i-t-1} -= 1;
1796    */
1797    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1798    do {
1799      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1800
1801      /* find left hand */
1802      mp_zero (&t1);
1803      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1804      t1.dp[1] = y.dp[t];
1805      t1.used = 2;
1806      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1807        goto LBL_Y;
1808      }
1809
1810      /* find right hand */
1811      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1812      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1813      t2.dp[2] = x.dp[i];
1814      t2.used = 3;
1815    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1816
1817    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1818    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1819      goto LBL_Y;
1820    }
1821
1822    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1823      goto LBL_Y;
1824    }
1825
1826    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1827      goto LBL_Y;
1828    }
1829
1830    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1831    if (x.sign == MP_NEG) {
1832      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1833        goto LBL_Y;
1834      }
1835      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1836        goto LBL_Y;
1837      }
1838      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1839        goto LBL_Y;
1840      }
1841
1842      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1843    }
1844  }
1845
1846  /* now q is the quotient and x is the remainder
1847   * [which we have to normalize]
1848   */
1849
1850  /* get sign before writing to c */
1851  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1852
1853  if (c != NULL) {
1854    mp_clamp (&q);
1855    mp_exch (&q, c);
1856    c->sign = neg;
1857  }
1858
1859  if (d != NULL) {
1860    mp_div_2d (&x, norm, &x, NULL);
1861    mp_exch (&x, d);
1862  }
1863
1864  res = MP_OKAY;
1865
1866LBL_Y:mp_clear (&y);
1867LBL_X:mp_clear (&x);
1868LBL_T2:mp_clear (&t2);
1869LBL_T1:mp_clear (&t1);
1870LBL_Q:mp_clear (&q);
1871  return res;
1872}
1873
1874#endif
1875
1876
1877#ifdef MP_LOW_MEM
1878   #define TAB_SIZE 32
1879#else
1880   #define TAB_SIZE 256
1881#endif
1882
1883static int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
1884{
1885  mp_int  M[TAB_SIZE], res, mu;
1886  mp_digit buf;
1887  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1888  int (*redux)(mp_int*,mp_int*,mp_int*);
1889
1890  /* find window size */
1891  x = mp_count_bits (X);
1892  if (x <= 7) {
1893    winsize = 2;
1894  } else if (x <= 36) {
1895    winsize = 3;
1896  } else if (x <= 140) {
1897    winsize = 4;
1898  } else if (x <= 450) {
1899    winsize = 5;
1900  } else if (x <= 1303) {
1901    winsize = 6;
1902  } else if (x <= 3529) {
1903    winsize = 7;
1904  } else {
1905    winsize = 8;
1906  }
1907
1908#ifdef MP_LOW_MEM
1909    if (winsize > 5) {
1910       winsize = 5;
1911    }
1912#endif
1913
1914  /* init M array */
1915  /* init first cell */
1916  if ((err = mp_init(&M[1])) != MP_OKAY) {
1917     return err;
1918  }
1919
1920  /* now init the second half of the array */
1921  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1922    if ((err = mp_init(&M[x])) != MP_OKAY) {
1923      for (y = 1<<(winsize-1); y < x; y++) {
1924        mp_clear (&M[y]);
1925      }
1926      mp_clear(&M[1]);
1927      return err;
1928    }
1929  }
1930
1931  /* create mu, used for Barrett reduction */
1932  if ((err = mp_init (&mu)) != MP_OKAY) {
1933    goto LBL_M;
1934  }
1935
1936  if (redmode == 0) {
1937     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
1938        goto LBL_MU;
1939     }
1940     redux = mp_reduce;
1941  } else {
1942     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
1943        goto LBL_MU;
1944     }
1945     redux = mp_reduce_2k_l;
1946  }
1947
1948  /* create M table
1949   *
1950   * The M table contains powers of the base,
1951   * e.g. M[x] = G**x mod P
1952   *
1953   * The first half of the table is not
1954   * computed though accept for M[0] and M[1]
1955   */
1956  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
1957    goto LBL_MU;
1958  }
1959
1960  /* compute the value at M[1<<(winsize-1)] by squaring
1961   * M[1] (winsize-1) times
1962   */
1963  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
1964    goto LBL_MU;
1965  }
1966
1967  for (x = 0; x < (winsize - 1); x++) {
1968    /* square it */
1969    if ((err = mp_sqr (&M[1 << (winsize - 1)],
1970                       &M[1 << (winsize - 1)])) != MP_OKAY) {
1971      goto LBL_MU;
1972    }
1973
1974    /* reduce modulo P */
1975    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
1976      goto LBL_MU;
1977    }
1978  }
1979
1980  /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
1981   * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
1982   */
1983  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
1984    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
1985      goto LBL_MU;
1986    }
1987    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
1988      goto LBL_MU;
1989    }
1990  }
1991
1992  /* setup result */
1993  if ((err = mp_init (&res)) != MP_OKAY) {
1994    goto LBL_MU;
1995  }
1996  mp_set (&res, 1);
1997
1998  /* set initial mode and bit cnt */
1999  mode   = 0;
2000  bitcnt = 1;
2001  buf    = 0;
2002  digidx = X->used - 1;
2003  bitcpy = 0;
2004  bitbuf = 0;
2005
2006  for (;;) {
2007    /* grab next digit as required */
2008    if (--bitcnt == 0) {
2009      /* if digidx == -1 we are out of digits */
2010      if (digidx == -1) {
2011        break;
2012      }
2013      /* read next digit and reset the bitcnt */
2014      buf    = X->dp[digidx--];
2015      bitcnt = (int) DIGIT_BIT;
2016    }
2017
2018    /* grab the next msb from the exponent */
2019    y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
2020    buf <<= (mp_digit)1;
2021
2022    /* if the bit is zero and mode == 0 then we ignore it
2023     * These represent the leading zero bits before the first 1 bit
2024     * in the exponent.  Technically this opt is not required but it
2025     * does lower the # of trivial squaring/reductions used
2026     */
2027    if (mode == 0 && y == 0) {
2028      continue;
2029    }
2030
2031    /* if the bit is zero and mode == 1 then we square */
2032    if (mode == 1 && y == 0) {
2033      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2034        goto LBL_RES;
2035      }
2036      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2037        goto LBL_RES;
2038      }
2039      continue;
2040    }
2041
2042    /* else we add it to the window */
2043    bitbuf |= (y << (winsize - ++bitcpy));
2044    mode    = 2;
2045
2046    if (bitcpy == winsize) {
2047      /* ok window is filled so square as required and multiply  */
2048      /* square first */
2049      for (x = 0; x < winsize; x++) {
2050        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2051          goto LBL_RES;
2052        }
2053        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2054          goto LBL_RES;
2055        }
2056      }
2057
2058      /* then multiply */
2059      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
2060        goto LBL_RES;
2061      }
2062      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2063        goto LBL_RES;
2064      }
2065
2066      /* empty window and reset */
2067      bitcpy = 0;
2068      bitbuf = 0;
2069      mode   = 1;
2070    }
2071  }
2072
2073  /* if bits remain then square/multiply */
2074  if (mode == 2 && bitcpy > 0) {
2075    /* square then multiply if the bit is set */
2076    for (x = 0; x < bitcpy; x++) {
2077      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
2078        goto LBL_RES;
2079      }
2080      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2081        goto LBL_RES;
2082      }
2083
2084      bitbuf <<= 1;
2085      if ((bitbuf & (1 << winsize)) != 0) {
2086        /* then multiply */
2087        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
2088          goto LBL_RES;
2089        }
2090        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
2091          goto LBL_RES;
2092        }
2093      }
2094    }
2095  }
2096
2097  mp_exch (&res, Y);
2098  err = MP_OKAY;
2099LBL_RES:mp_clear (&res);
2100LBL_MU:mp_clear (&mu);
2101LBL_M:
2102  mp_clear(&M[1]);
2103  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
2104    mp_clear (&M[x]);
2105  }
2106  return err;
2107}
2108
2109
2110/* computes b = a*a */
2111static int mp_sqr (mp_int * a, mp_int * b)
2112{
2113  int     res;
2114
2115#ifdef BN_MP_TOOM_SQR_C
2116  /* use Toom-Cook? */
2117  if (a->used >= TOOM_SQR_CUTOFF) {
2118    res = mp_toom_sqr(a, b);
2119  /* Karatsuba? */
2120  } else
2121#endif
2122#ifdef BN_MP_KARATSUBA_SQR_C
2123if (a->used >= KARATSUBA_SQR_CUTOFF) {
2124    res = mp_karatsuba_sqr (a, b);
2125  } else
2126#endif
2127  {
2128#ifdef BN_FAST_S_MP_SQR_C
2129    /* can we use the fast comba multiplier? */
2130    if ((a->used * 2 + 1) < MP_WARRAY &&
2131         a->used <
2132         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
2133      res = fast_s_mp_sqr (a, b);
2134    } else
2135#endif
2136#ifdef BN_S_MP_SQR_C
2137      res = s_mp_sqr (a, b);
2138#else
2139#error mp_sqr could fail
2140      res = MP_VAL;
2141#endif
2142  }
2143  b->sign = MP_ZPOS;
2144  return res;
2145}
2146
2147
2148/* reduces a modulo n where n is of the form 2**p - d
2149   This differs from reduce_2k since "d" can be larger
2150   than a single digit.
2151*/
2152static int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
2153{
2154   mp_int q;
2155   int    p, res;
2156
2157   if ((res = mp_init(&q)) != MP_OKAY) {
2158      return res;
2159   }
2160
2161   p = mp_count_bits(n);
2162top:
2163   /* q = a/2**p, a = a mod 2**p */
2164   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
2165      goto ERR;
2166   }
2167
2168   /* q = q * d */
2169   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
2170      goto ERR;
2171   }
2172
2173   /* a = a + q */
2174   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
2175      goto ERR;
2176   }
2177
2178   if (mp_cmp_mag(a, n) != MP_LT) {
2179      s_mp_sub(a, n, a);
2180      goto top;
2181   }
2182
2183ERR:
2184   mp_clear(&q);
2185   return res;
2186}
2187
2188
2189/* determines the setup value */
2190static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
2191{
2192   int    res;
2193   mp_int tmp;
2194
2195   if ((res = mp_init(&tmp)) != MP_OKAY) {
2196      return res;
2197   }
2198
2199   if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
2200      goto ERR;
2201   }
2202
2203   if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
2204      goto ERR;
2205   }
2206
2207ERR:
2208   mp_clear(&tmp);
2209   return res;
2210}
2211
2212
2213/* computes a = 2**b
2214 *
2215 * Simple algorithm which zeroes the int, grows it then just sets one bit
2216 * as required.
2217 */
2218static int mp_2expt (mp_int * a, int b)
2219{
2220  int     res;
2221
2222  /* zero a as per default */
2223  mp_zero (a);
2224
2225  /* grow a to accommodate the single bit */
2226  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
2227    return res;
2228  }
2229
2230  /* set the used count of where the bit will go */
2231  a->used = b / DIGIT_BIT + 1;
2232
2233  /* put the single bit in its place */
2234  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
2235
2236  return MP_OKAY;
2237}
2238
2239
2240/* pre-calculate the value required for Barrett reduction
2241 * For a given modulus "b" it calulates the value required in "a"
2242 */
2243static int mp_reduce_setup (mp_int * a, mp_int * b)
2244{
2245  int     res;
2246
2247  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
2248    return res;
2249  }
2250  return mp_div (a, b, a, NULL);
2251}
2252
2253
2254/* reduces x mod m, assumes 0 < x < m**2, mu is
2255 * precomputed via mp_reduce_setup.
2256 * From HAC pp.604 Algorithm 14.42
2257 */
2258static int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
2259{
2260  mp_int  q;
2261  int     res, um = m->used;
2262
2263  /* q = x */
2264  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
2265    return res;
2266  }
2267
2268  /* q1 = x / b**(k-1)  */
2269  mp_rshd (&q, um - 1);
2270
2271  /* according to HAC this optimization is ok */
2272  if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
2273    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
2274      goto CLEANUP;
2275    }
2276  } else {
2277#ifdef BN_S_MP_MUL_HIGH_DIGS_C
2278    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
2279      goto CLEANUP;
2280    }
2281#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
2282    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
2283      goto CLEANUP;
2284    }
2285#else
2286    {
2287#error mp_reduce would always fail
2288      res = MP_VAL;
2289      goto CLEANUP;
2290    }
2291#endif
2292  }
2293
2294  /* q3 = q2 / b**(k+1) */
2295  mp_rshd (&q, um + 1);
2296
2297  /* x = x mod b**(k+1), quick (no division) */
2298  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
2299    goto CLEANUP;
2300  }
2301
2302  /* q = q * m mod b**(k+1), quick (no division) */
2303  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
2304    goto CLEANUP;
2305  }
2306
2307  /* x = x - q */
2308  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
2309    goto CLEANUP;
2310  }
2311
2312  /* If x < 0, add b**(k+1) to it */
2313  if (mp_cmp_d (x, 0) == MP_LT) {
2314    mp_set (&q, 1);
2315    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY) {
2316      goto CLEANUP;
2317    }
2318    if ((res = mp_add (x, &q, x)) != MP_OKAY) {
2319      goto CLEANUP;
2320    }
2321  }
2322
2323  /* Back off if it's too big */
2324  while (mp_cmp (x, m) != MP_LT) {
2325    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
2326      goto CLEANUP;
2327    }
2328  }
2329
2330CLEANUP:
2331  mp_clear (&q);
2332
2333  return res;
2334}
2335
2336
2337/* multiplies |a| * |b| and only computes up to digs digits of result
2338 * HAC pp. 595, Algorithm 14.12  Modified so you can control how
2339 * many digits of output are created.
2340 */
2341static int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2342{
2343  mp_int  t;
2344  int     res, pa, pb, ix, iy;
2345  mp_digit u;
2346  mp_word r;
2347  mp_digit tmpx, *tmpt, *tmpy;
2348
2349#ifdef BN_FAST_S_MP_MUL_DIGS_C
2350  /* can we use the fast multiplier? */
2351  if (((digs) < MP_WARRAY) &&
2352      MIN (a->used, b->used) <
2353          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2354    return fast_s_mp_mul_digs (a, b, c, digs);
2355  }
2356#endif
2357
2358  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
2359    return res;
2360  }
2361  t.used = digs;
2362
2363  /* compute the digits of the product directly */
2364  pa = a->used;
2365  for (ix = 0; ix < pa; ix++) {
2366    /* set the carry to zero */
2367    u = 0;
2368
2369    /* limit ourselves to making digs digits of output */
2370    pb = MIN (b->used, digs - ix);
2371
2372    /* setup some aliases */
2373    /* copy of the digit from a used within the nested loop */
2374    tmpx = a->dp[ix];
2375
2376    /* an alias for the destination shifted ix places */
2377    tmpt = t.dp + ix;
2378
2379    /* an alias for the digits of b */
2380    tmpy = b->dp;
2381
2382    /* compute the columns of the output and propagate the carry */
2383    for (iy = 0; iy < pb; iy++) {
2384      /* compute the column as a mp_word */
2385      r       = ((mp_word)*tmpt) +
2386                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2387                ((mp_word) u);
2388
2389      /* the new column is the lower part of the result */
2390      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2391
2392      /* get the carry word from the result */
2393      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2394    }
2395    /* set carry if it is placed below digs */
2396    if (ix + iy < digs) {
2397      *tmpt = u;
2398    }
2399  }
2400
2401  mp_clamp (&t);
2402  mp_exch (&t, c);
2403
2404  mp_clear (&t);
2405  return MP_OKAY;
2406}
2407
2408
2409#ifdef BN_FAST_S_MP_MUL_DIGS_C
2410/* Fast (comba) multiplier
2411 *
2412 * This is the fast column-array [comba] multiplier.  It is
2413 * designed to compute the columns of the product first
2414 * then handle the carries afterwards.  This has the effect
2415 * of making the nested loops that compute the columns very
2416 * simple and schedulable on super-scalar processors.
2417 *
2418 * This has been modified to produce a variable number of
2419 * digits of output so if say only a half-product is required
2420 * you don't have to compute the upper half (a feature
2421 * required for fast Barrett reduction).
2422 *
2423 * Based on Algorithm 14.12 on pp.595 of HAC.
2424 *
2425 */
2426static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2427{
2428  int     olduse, res, pa, ix, iz;
2429  mp_digit W[MP_WARRAY];
2430  register mp_word  _W;
2431
2432  /* grow the destination as required */
2433  if (c->alloc < digs) {
2434    if ((res = mp_grow (c, digs)) != MP_OKAY) {
2435      return res;
2436    }
2437  }
2438
2439  /* number of output digits to produce */
2440  pa = MIN(digs, a->used + b->used);
2441
2442  /* clear the carry */
2443  _W = 0;
2444  for (ix = 0; ix < pa; ix++) {
2445      int      tx, ty;
2446      int      iy;
2447      mp_digit *tmpx, *tmpy;
2448
2449      /* get offsets into the two bignums */
2450      ty = MIN(b->used-1, ix);
2451      tx = ix - ty;
2452
2453      /* setup temp aliases */
2454      tmpx = a->dp + tx;
2455      tmpy = b->dp + ty;
2456
2457      /* this is the number of times the loop will iterrate, essentially
2458         while (tx++ < a->used && ty-- >= 0) { ... }
2459       */
2460      iy = MIN(a->used-tx, ty+1);
2461
2462      /* execute loop */
2463      for (iz = 0; iz < iy; ++iz) {
2464         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2465
2466      }
2467
2468      /* store term */
2469      W[ix] = ((mp_digit)_W) & MP_MASK;
2470
2471      /* make next carry */
2472      _W = _W >> ((mp_word)DIGIT_BIT);
2473 }
2474
2475  /* setup dest */
2476  olduse  = c->used;
2477  c->used = pa;
2478
2479  {
2480    register mp_digit *tmpc;
2481    tmpc = c->dp;
2482    for (ix = 0; ix < pa+1; ix++) {
2483      /* now extract the previous digit [below the carry] */
2484      *tmpc++ = W[ix];
2485    }
2486
2487    /* clear unused digits [that existed in the old copy of c] */
2488    for (; ix < olduse; ix++) {
2489      *tmpc++ = 0;
2490    }
2491  }
2492  mp_clamp (c);
2493  return MP_OKAY;
2494}
2495#endif /* BN_FAST_S_MP_MUL_DIGS_C */
2496
2497
2498/* init an mp_init for a given size */
2499static int mp_init_size (mp_int * a, int size)
2500{
2501  int x;
2502
2503  /* pad size so there are always extra digits */
2504  size += (MP_PREC * 2) - (size % MP_PREC);
2505
2506  /* alloc mem */
2507  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
2508  if (a->dp == NULL) {
2509    return MP_MEM;
2510  }
2511
2512  /* set the members */
2513  a->used  = 0;
2514  a->alloc = size;
2515  a->sign  = MP_ZPOS;
2516
2517  /* zero the digits */
2518  for (x = 0; x < size; x++) {
2519      a->dp[x] = 0;
2520  }
2521
2522  return MP_OKAY;
2523}
2524
2525
2526/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
2527static int s_mp_sqr (mp_int * a, mp_int * b)
2528{
2529  mp_int  t;
2530  int     res, ix, iy, pa;
2531  mp_word r;
2532  mp_digit u, tmpx, *tmpt;
2533
2534  pa = a->used;
2535  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
2536    return res;
2537  }
2538
2539  /* default used is maximum possible size */
2540  t.used = 2*pa + 1;
2541
2542  for (ix = 0; ix < pa; ix++) {
2543    /* first calculate the digit at 2*ix */
2544    /* calculate double precision result */
2545    r = ((mp_word) t.dp[2*ix]) +
2546        ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
2547
2548    /* store lower part in result */
2549    t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
2550
2551    /* get the carry */
2552    u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2553
2554    /* left hand side of A[ix] * A[iy] */
2555    tmpx        = a->dp[ix];
2556
2557    /* alias for where to store the results */
2558    tmpt        = t.dp + (2*ix + 1);
2559
2560    for (iy = ix + 1; iy < pa; iy++) {
2561      /* first calculate the product */
2562      r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
2563
2564      /* now calculate the double precision result, note we use
2565       * addition instead of *2 since it's easier to optimize
2566       */
2567      r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
2568
2569      /* store lower part */
2570      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2571
2572      /* get carry */
2573      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2574    }
2575    /* propagate upwards */
2576    while (u != ((mp_digit) 0)) {
2577      r       = ((mp_word) *tmpt) + ((mp_word) u);
2578      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2579      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2580    }
2581  }
2582
2583  mp_clamp (&t);
2584  mp_exch (&t, b);
2585  mp_clear (&t);
2586  return MP_OKAY;
2587}
2588
2589
2590/* multiplies |a| * |b| and does not compute the lower digs digits
2591 * [meant to get the higher part of the product]
2592 */
2593static int s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2594{
2595  mp_int  t;
2596  int     res, pa, pb, ix, iy;
2597  mp_digit u;
2598  mp_word r;
2599  mp_digit tmpx, *tmpt, *tmpy;
2600
2601  /* can we use the fast multiplier? */
2602#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
2603  if (((a->used + b->used + 1) < MP_WARRAY)
2604      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2605    return fast_s_mp_mul_high_digs (a, b, c, digs);
2606  }
2607#endif
2608
2609  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
2610    return res;
2611  }
2612  t.used = a->used + b->used + 1;
2613
2614  pa = a->used;
2615  pb = b->used;
2616  for (ix = 0; ix < pa; ix++) {
2617    /* clear the carry */
2618    u = 0;
2619
2620    /* left hand side of A[ix] * B[iy] */
2621    tmpx = a->dp[ix];
2622
2623    /* alias to the address of where the digits will be stored */
2624    tmpt = &(t.dp[digs]);
2625
2626    /* alias for where to read the right hand side from */
2627    tmpy = b->dp + (digs - ix);
2628
2629    for (iy = digs - ix; iy < pb; iy++) {
2630      /* calculate the double precision result */
2631      r       = ((mp_word)*tmpt) +
2632                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2633                ((mp_word) u);
2634
2635      /* get the lower part */
2636      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2637
2638      /* carry the carry */
2639      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2640    }
2641    *tmpt = u;
2642  }
2643  mp_clamp (&t);
2644  mp_exch (&t, c);
2645  mp_clear (&t);
2646  return MP_OKAY;
2647}
2648
2649
2650#ifdef BN_MP_MONTGOMERY_SETUP_C
2651/* setups the montgomery reduction stuff */
2652static int
2653mp_montgomery_setup (mp_int * n, mp_digit * rho)
2654{
2655  mp_digit x, b;
2656
2657/* fast inversion mod 2**k
2658 *
2659 * Based on the fact that
2660 *
2661 * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
2662 *                    =>  2*X*A - X*X*A*A = 1
2663 *                    =>  2*(1) - (1)     = 1
2664 */
2665  b = n->dp[0];
2666
2667  if ((b & 1) == 0) {
2668    return MP_VAL;
2669  }
2670
2671  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
2672  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
2673#if !defined(MP_8BIT)
2674  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
2675#endif
2676#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
2677  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
2678#endif
2679#ifdef MP_64BIT
2680  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
2681#endif
2682
2683  /* rho = -1/m mod b */
2684  *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
2685
2686  return MP_OKAY;
2687}
2688#endif
2689
2690
2691#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
2692/* computes xR**-1 == x (mod N) via Montgomery Reduction
2693 *
2694 * This is an optimized implementation of montgomery_reduce
2695 * which uses the comba method to quickly calculate the columns of the
2696 * reduction.
2697 *
2698 * Based on Algorithm 14.32 on pp.601 of HAC.
2699*/
2700static int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
2701{
2702  int     ix, res, olduse;
2703  mp_word W[MP_WARRAY];
2704
2705  /* get old used count */
2706  olduse = x->used;
2707
2708  /* grow a as required */
2709  if (x->alloc < n->used + 1) {
2710    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
2711      return res;
2712    }
2713  }
2714
2715  /* first we have to get the digits of the input into
2716   * an array of double precision words W[...]
2717   */
2718  {
2719    register mp_word *_W;
2720    register mp_digit *tmpx;
2721
2722    /* alias for the W[] array */
2723    _W   = W;
2724
2725    /* alias for the digits of  x*/
2726    tmpx = x->dp;
2727
2728    /* copy the digits of a into W[0..a->used-1] */
2729    for (ix = 0; ix < x->used; ix++) {
2730      *_W++ = *tmpx++;
2731    }
2732
2733    /* zero the high words of W[a->used..m->used*2] */
2734    for (; ix < n->used * 2 + 1; ix++) {
2735      *_W++ = 0;
2736    }
2737  }
2738
2739  /* now we proceed to zero successive digits
2740   * from the least significant upwards
2741   */
2742  for (ix = 0; ix < n->used; ix++) {
2743    /* mu = ai * m' mod b
2744     *
2745     * We avoid a double precision multiplication (which isn't required)
2746     * by casting the value down to a mp_digit.  Note this requires
2747     * that W[ix-1] have  the carry cleared (see after the inner loop)
2748     */
2749    register mp_digit mu;
2750    mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
2751
2752    /* a = a + mu * m * b**i
2753     *
2754     * This is computed in place and on the fly.  The multiplication
2755     * by b**i is handled by offseting which columns the results
2756     * are added to.
2757     *
2758     * Note the comba method normally doesn't handle carries in the
2759     * inner loop In this case we fix the carry from the previous
2760     * column since the Montgomery reduction requires digits of the
2761     * result (so far) [see above] to work.  This is
2762     * handled by fixing up one carry after the inner loop.  The
2763     * carry fixups are done in order so after these loops the
2764     * first m->used words of W[] have the carries fixed
2765     */
2766    {
2767      register int iy;
2768      register mp_digit *tmpn;
2769      register mp_word *_W;
2770
2771      /* alias for the digits of the modulus */
2772      tmpn = n->dp;
2773
2774      /* Alias for the columns set by an offset of ix */
2775      _W = W + ix;
2776
2777      /* inner loop */
2778      for (iy = 0; iy < n->used; iy++) {
2779          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
2780      }
2781    }
2782
2783    /* now fix carry for next digit, W[ix+1] */
2784    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
2785  }
2786
2787  /* now we have to propagate the carries and
2788   * shift the words downward [all those least
2789   * significant digits we zeroed].
2790   */
2791  {
2792    register mp_digit *tmpx;
2793    register mp_word *_W, *_W1;
2794
2795    /* nox fix rest of carries */
2796
2797    /* alias for current word */
2798    _W1 = W + ix;
2799
2800    /* alias for next word, where the carry goes */
2801    _W = W + ++ix;
2802
2803    for (; ix <= n->used * 2 + 1; ix++) {
2804      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
2805    }
2806
2807    /* copy out, A = A/b**n
2808     *
2809     * The result is A/b**n but instead of converting from an
2810     * array of mp_word to mp_digit than calling mp_rshd
2811     * we just copy them in the right order
2812     */
2813
2814    /* alias for destination word */
2815    tmpx = x->dp;
2816
2817    /* alias for shifted double precision result */
2818    _W = W + n->used;
2819
2820    for (ix = 0; ix < n->used + 1; ix++) {
2821      *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
2822    }
2823
2824    /* zero oldused digits, if the input a was larger than
2825     * m->used+1 we'll have to clear the digits
2826     */
2827    for (; ix < olduse; ix++) {
2828      *tmpx++ = 0;
2829    }
2830  }
2831
2832  /* set the max used and clamp */
2833  x->used = n->used + 1;
2834  mp_clamp (x);
2835
2836  /* if A >= m then A = A - m */
2837  if (mp_cmp_mag (x, n) != MP_LT) {
2838    return s_mp_sub (x, n, x);
2839  }
2840  return MP_OKAY;
2841}
2842#endif
2843
2844
2845#ifdef BN_MP_MUL_2_C
2846/* b = a*2 */
2847static int mp_mul_2(mp_int * a, mp_int * b)
2848{
2849  int     x, res, oldused;
2850
2851  /* grow to accommodate result */
2852  if (b->alloc < a->used + 1) {
2853    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2854      return res;
2855    }
2856  }
2857
2858  oldused = b->used;
2859  b->used = a->used;
2860
2861  {
2862    register mp_digit r, rr, *tmpa, *tmpb;
2863
2864    /* alias for source */
2865    tmpa = a->dp;
2866
2867    /* alias for dest */
2868    tmpb = b->dp;
2869
2870    /* carry */
2871    r = 0;
2872    for (x = 0; x < a->used; x++) {
2873
2874      /* get what will be the *next* carry bit from the
2875       * MSB of the current digit
2876       */
2877      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2878
2879      /* now shift up this digit, add in the carry [from the previous] */
2880      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2881
2882      /* copy the carry that would be from the source
2883       * digit into the next iteration
2884       */
2885      r = rr;
2886    }
2887
2888    /* new leading digit? */
2889    if (r != 0) {
2890      /* add a MSB which is always 1 at this point */
2891      *tmpb = 1;
2892      ++(b->used);
2893    }
2894
2895    /* now zero any excess digits on the destination
2896     * that we didn't write to
2897     */
2898    tmpb = b->dp + b->used;
2899    for (x = b->used; x < oldused; x++) {
2900      *tmpb++ = 0;
2901    }
2902  }
2903  b->sign = a->sign;
2904  return MP_OKAY;
2905}
2906#endif
2907
2908
2909#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
2910/*
2911 * shifts with subtractions when the result is greater than b.
2912 *
2913 * The method is slightly modified to shift B unconditionally up to just under
2914 * the leading bit of b.  This saves a lot of multiple precision shifting.
2915 */
2916static int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
2917{
2918  int     x, bits, res;
2919
2920  /* how many bits of last digit does b use */
2921  bits = mp_count_bits (b) % DIGIT_BIT;
2922
2923  if (b->used > 1) {
2924     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
2925        return res;
2926     }
2927  } else {
2928     mp_set(a, 1);
2929     bits = 1;
2930  }
2931
2932
2933  /* now compute C = A * B mod b */
2934  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
2935    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
2936      return res;
2937    }
2938    if (mp_cmp_mag (a, b) != MP_LT) {
2939      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
2940        return res;
2941      }
2942    }
2943  }
2944
2945  return MP_OKAY;
2946}
2947#endif
2948
2949
2950#ifdef BN_MP_EXPTMOD_FAST_C
2951/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
2952 *
2953 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
2954 * The value of k changes based on the size of the exponent.
2955 *
2956 * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
2957 */
2958
2959static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
2960{
2961  mp_int  M[TAB_SIZE], res;
2962  mp_digit buf, mp;
2963  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
2964
2965  /* use a pointer to the reduction algorithm.  This allows us to use
2966   * one of many reduction algorithms without modding the guts of
2967   * the code with if statements everywhere.
2968   */
2969  int     (*redux)(mp_int*,mp_int*,mp_digit);
2970
2971  /* find window size */
2972  x = mp_count_bits (X);
2973  if (x <= 7) {
2974    winsize = 2;
2975  } else if (x <= 36) {
2976    winsize = 3;
2977  } else if (x <= 140) {
2978    winsize = 4;
2979  } else if (x <= 450) {
2980    winsize = 5;
2981  } else if (x <= 1303) {
2982    winsize = 6;
2983  } else if (x <= 3529) {
2984    winsize = 7;
2985  } else {
2986    winsize = 8;
2987  }
2988
2989#ifdef MP_LOW_MEM
2990  if (winsize > 5) {
2991     winsize = 5;
2992  }
2993#endif
2994
2995  /* init M array */
2996  /* init first cell */
2997  if ((err = mp_init(&M[1])) != MP_OKAY) {
2998     return err;
2999  }
3000
3001  /* now init the second half of the array */
3002  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3003    if ((err = mp_init(&M[x])) != MP_OKAY) {
3004      for (y = 1<<(winsize-1); y < x; y++) {
3005        mp_clear (&M[y]);
3006      }
3007      mp_clear(&M[1]);
3008      return err;
3009    }
3010  }
3011
3012  /* determine and setup reduction code */
3013  if (redmode == 0) {
3014#ifdef BN_MP_MONTGOMERY_SETUP_C
3015     /* now setup montgomery  */
3016     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
3017        goto LBL_M;
3018     }
3019#else
3020     err = MP_VAL;
3021     goto LBL_M;
3022#endif
3023
3024     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
3025#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
3026     if (((P->used * 2 + 1) < MP_WARRAY) &&
3027          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
3028        redux = fast_mp_montgomery_reduce;
3029     } else
3030#endif
3031     {
3032#ifdef BN_MP_MONTGOMERY_REDUCE_C
3033        /* use slower baseline Montgomery method */
3034        redux = mp_montgomery_reduce;
3035#else
3036        err = MP_VAL;
3037        goto LBL_M;
3038#endif
3039     }
3040  } else if (redmode == 1) {
3041#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
3042     /* setup DR reduction for moduli of the form B**k - b */
3043     mp_dr_setup(P, &mp);
3044     redux = mp_dr_reduce;
3045#else
3046     err = MP_VAL;
3047     goto LBL_M;
3048#endif
3049  } else {
3050#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
3051     /* setup DR reduction for moduli of the form 2**k - b */
3052     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
3053        goto LBL_M;
3054     }
3055     redux = mp_reduce_2k;
3056#else
3057     err = MP_VAL;
3058     goto LBL_M;
3059#endif
3060  }
3061
3062  /* setup result */
3063  if ((err = mp_init (&res)) != MP_OKAY) {
3064    goto LBL_M;
3065  }
3066
3067  /* create M table
3068   *
3069
3070   *
3071   * The first half of the table is not computed though accept for M[0] and M[1]
3072   */
3073
3074  if (redmode == 0) {
3075#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
3076     /* now we need R mod m */
3077     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
3078       goto LBL_RES;
3079     }
3080#else
3081     err = MP_VAL;
3082     goto LBL_RES;
3083#endif
3084
3085     /* now set M[1] to G * R mod m */
3086     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
3087       goto LBL_RES;
3088     }
3089  } else {
3090     mp_set(&res, 1);
3091     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
3092        goto LBL_RES;
3093     }
3094  }
3095
3096  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
3097  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
3098    goto LBL_RES;
3099  }
3100
3101  for (x = 0; x < (winsize - 1); x++) {
3102    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
3103      goto LBL_RES;
3104    }
3105    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
3106      goto LBL_RES;
3107    }
3108  }
3109
3110  /* create upper table */
3111  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
3112    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
3113      goto LBL_RES;
3114    }
3115    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
3116      goto LBL_RES;
3117    }
3118  }
3119
3120  /* set initial mode and bit cnt */
3121  mode   = 0;
3122  bitcnt = 1;
3123  buf    = 0;
3124  digidx = X->used - 1;
3125  bitcpy = 0;
3126  bitbuf = 0;
3127
3128  for (;;) {
3129    /* grab next digit as required */
3130    if (--bitcnt == 0) {
3131      /* if digidx == -1 we are out of digits so break */
3132      if (digidx == -1) {
3133        break;
3134      }
3135      /* read next digit and reset bitcnt */
3136      buf    = X->dp[digidx--];
3137      bitcnt = (int)DIGIT_BIT;
3138    }
3139
3140    /* grab the next msb from the exponent */
3141    y     = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
3142    buf <<= (mp_digit)1;
3143
3144    /* if the bit is zero and mode == 0 then we ignore it
3145     * These represent the leading zero bits before the first 1 bit
3146     * in the exponent.  Technically this opt is not required but it
3147     * does lower the # of trivial squaring/reductions used
3148     */
3149    if (mode == 0 && y == 0) {
3150      continue;
3151    }
3152
3153    /* if the bit is zero and mode == 1 then we square */
3154    if (mode == 1 && y == 0) {
3155      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3156        goto LBL_RES;
3157      }
3158      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3159        goto LBL_RES;
3160      }
3161      continue;
3162    }
3163
3164    /* else we add it to the window */
3165    bitbuf |= (y << (winsize - ++bitcpy));
3166    mode    = 2;
3167
3168    if (bitcpy == winsize) {
3169      /* ok window is filled so square as required and multiply  */
3170      /* square first */
3171      for (x = 0; x < winsize; x++) {
3172        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3173          goto LBL_RES;
3174        }
3175        if ((err = redux (&res, P, mp)) != MP_OKAY) {
3176          goto LBL_RES;
3177        }
3178      }
3179
3180      /* then multiply */
3181      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
3182        goto LBL_RES;
3183      }
3184      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3185        goto LBL_RES;
3186      }
3187
3188      /* empty window and reset */
3189      bitcpy = 0;
3190      bitbuf = 0;
3191      mode   = 1;
3192    }
3193  }
3194
3195  /* if bits remain then square/multiply */
3196  if (mode == 2 && bitcpy > 0) {
3197    /* square then multiply if the bit is set */
3198    for (x = 0; x < bitcpy; x++) {
3199      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3200        goto LBL_RES;
3201      }
3202      if ((err = redux (&res, P, mp)) != MP_OKAY) {
3203        goto LBL_RES;
3204      }
3205
3206      /* get next bit of the window */
3207      bitbuf <<= 1;
3208      if ((bitbuf & (1 << winsize)) != 0) {
3209        /* then multiply */
3210        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
3211          goto LBL_RES;
3212        }
3213        if ((err = redux (&res, P, mp)) != MP_OKAY) {
3214          goto LBL_RES;
3215        }
3216      }
3217    }
3218  }
3219
3220  if (redmode == 0) {
3221     /* fixup result if Montgomery reduction is used
3222      * recall that any value in a Montgomery system is
3223      * actually multiplied by R mod n.  So we have
3224      * to reduce one more time to cancel out the factor
3225      * of R.
3226      */
3227     if ((err = redux(&res, P, mp)) != MP_OKAY) {
3228       goto LBL_RES;
3229     }
3230  }
3231
3232  /* swap res with Y */
3233  mp_exch (&res, Y);
3234  err = MP_OKAY;
3235LBL_RES:mp_clear (&res);
3236LBL_M:
3237  mp_clear(&M[1]);
3238  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3239    mp_clear (&M[x]);
3240  }
3241  return err;
3242}
3243#endif
3244
3245
3246#ifdef BN_FAST_S_MP_SQR_C
3247/* the jist of squaring...
3248 * you do like mult except the offset of the tmpx [one that
3249 * starts closer to zero] can't equal the offset of tmpy.
3250 * So basically you set up iy like before then you min it with
3251 * (ty-tx) so that it never happens.  You double all those
3252 * you add in the inner loop
3253
3254After that loop you do the squares and add them in.
3255*/
3256
3257static int fast_s_mp_sqr (mp_int * a, mp_int * b)
3258{
3259  int       olduse, res, pa, ix, iz;
3260  mp_digit   W[MP_WARRAY], *tmpx;
3261  mp_word   W1;
3262
3263  /* grow the destination as required */
3264  pa = a->used + a->used;
3265  if (b->alloc < pa) {
3266    if ((res = mp_grow (b, pa)) != MP_OKAY) {
3267      return res;
3268    }
3269  }
3270
3271  /* number of output digits to produce */
3272  W1 = 0;
3273  for (ix = 0; ix < pa; ix++) {
3274      int      tx, ty, iy;
3275      mp_word  _W;
3276      mp_digit *tmpy;
3277
3278      /* clear counter */
3279      _W = 0;
3280
3281      /* get offsets into the two bignums */
3282      ty = MIN(a->used-1, ix);
3283      tx = ix - ty;
3284
3285      /* setup temp aliases */
3286      tmpx = a->dp + tx;
3287      tmpy = a->dp + ty;
3288
3289      /* this is the number of times the loop will iterrate, essentially
3290         while (tx++ < a->used && ty-- >= 0) { ... }
3291       */
3292      iy = MIN(a->used-tx, ty+1);
3293
3294      /* now for squaring tx can never equal ty
3295       * we halve the distance since they approach at a rate of 2x
3296       * and we have to round because odd cases need to be executed
3297       */
3298      iy = MIN(iy, (ty-tx+1)>>1);
3299
3300      /* execute loop */
3301      for (iz = 0; iz < iy; iz++) {
3302         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
3303      }
3304
3305      /* double the inner product and add carry */
3306      _W = _W + _W + W1;
3307
3308      /* even columns have the square term in them */
3309      if ((ix&1) == 0) {
3310         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
3311      }
3312
3313      /* store it */
3314      W[ix] = (mp_digit)(_W & MP_MASK);
3315
3316      /* make next carry */
3317      W1 = _W >> ((mp_word)DIGIT_BIT);
3318  }
3319
3320  /* setup dest */
3321  olduse  = b->used;
3322  b->used = a->used+a->used;
3323
3324  {
3325    mp_digit *tmpb;
3326    tmpb = b->dp;
3327    for (ix = 0; ix < pa; ix++) {
3328      *tmpb++ = W[ix] & MP_MASK;
3329    }
3330
3331    /* clear unused digits [that existed in the old copy of c] */
3332    for (; ix < olduse; ix++) {
3333      *tmpb++ = 0;
3334    }
3335  }
3336  mp_clamp (b);
3337  return MP_OKAY;
3338}
3339#endif
3340
3341
3342#ifdef BN_MP_MUL_D_C
3343/* multiply by a digit */
3344static int
3345mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
3346{
3347  mp_digit u, *tmpa, *tmpc;
3348  mp_word  r;
3349  int      ix, res, olduse;
3350
3351  /* make sure c is big enough to hold a*b */
3352  if (c->alloc < a->used + 1) {
3353    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
3354      return res;
3355    }
3356  }
3357
3358  /* get the original destinations used count */
3359  olduse = c->used;
3360
3361  /* set the sign */
3362  c->sign = a->sign;
3363
3364  /* alias for a->dp [source] */
3365  tmpa = a->dp;
3366
3367  /* alias for c->dp [dest] */
3368  tmpc = c->dp;
3369
3370  /* zero carry */
3371  u = 0;
3372
3373  /* compute columns */
3374  for (ix = 0; ix < a->used; ix++) {
3375    /* compute product and carry sum for this term */
3376    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
3377
3378    /* mask off higher bits to get a single digit */
3379    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
3380
3381    /* send carry into next iteration */
3382    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
3383  }
3384
3385  /* store final carry [if any] and increment ix offset  */
3386  *tmpc++ = u;
3387  ++ix;
3388
3389  /* now zero digits above the top */
3390  while (ix++ < olduse) {
3391     *tmpc++ = 0;
3392  }
3393
3394  /* set used count */
3395  c->used = a->used + 1;
3396  mp_clamp(c);
3397
3398  return MP_OKAY;
3399}
3400#endif
3401