176 files changed, 6707 insertions, 8043 deletions
diff --git a/libtommath/TODO b/libtommath/TODO
deleted file mode 100644
index deffba1..0000000
--- a/libtommath/TODO
+++ /dev/null
@@ -1,16 +0,0 @@
-things for book in order of importance...
-
-- Fix up pseudo-code [only] for combas that are not consistent with source
-- Start in chapter 3 [basics] and work up...
-   - re-write to prose [less abrupt]
-   - clean up pseudo code [spacing]
-   - more examples where appropriate and figures
-
-Goal:
-   - Get sync done by mid January [roughly 8-12 hours work]
-   - Finish ch3-6 by end of January [roughly 12-16 hours of work]
-   - Finish ch7-end by mid Feb [roughly 20-24 hours of work].
-
-Goal isn't "first edition" but merely cleaner to read.
-
-
diff --git a/libtommath/bn.pdf b/libtommath/bn.pdf
index 9b873e1..392b649 100644
--- a/libtommath/bn.pdf
+++ b/libtommath/bn.pdf
diff --git a/libtommath/bn.tex b/libtommath/bn.tex
index 962d6ea..e8eb994 100644
--- a/libtommath/bn.tex
+++ b/libtommath/bn.tex
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.33}
+\title{LibTomMath User Manual \\ v0.39}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
@@ -263,12 +263,12 @@ are the pros and cons of LibTomMath by comparing it to the math routines from Gn
 \begin{center}
 \begin{tabular}{|l|c|c|l|}
 \hline \textbf{Criteria} & \textbf{Pro} & \textbf{Con} & \textbf{Notes} \\
-\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 76.04$ \\
+\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 71.97$ \\
 \hline Commented function prototypes & X && GnuPG function names are cryptic. \\
 \hline Speed && X & LibTomMath is slower.  \\
 \hline Totally free & X & & GPL has unfavourable restrictions.\\
 \hline Large function base & X & & GnuPG is barebones. \\
-\hline Four modular reduction algorithms & X & & Faster modular exponentiation. \\
+\hline Five modular reduction algorithms & X & & Faster modular exponentiation for a variety of moduli. \\
 \hline Portable & X & & GnuPG requires configuration to build. \\
 \hline
 \end{tabular}
@@ -284,9 +284,12 @@ would require when working with large integers.
 So it may feel tempting to just rip the math code out of GnuPG (or GnuMP where it was taken from originally) in your
 own application but I think there are reasons not to.  While LibTomMath is slower than libraries such as GnuMP it is
 not normally significantly slower.  On x86 machines the difference is normally a factor of two when performing modular
-exponentiations.
+exponentiations.  It depends largely on the processor, compiler and the moduli being used.
 
-Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.
+Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.  However,
+on the other side of the coin LibTomMath offers you a totally free (public domain) well structured math library
+that is very flexible, complete and performs well in resource constrained environments.  Fast RSA for example can
+be performed with as little as 8KB of RAM for data (again depending on build options).
 
 \chapter{Getting Started with LibTomMath}
 \section{Building Programs}
@@ -809,7 +812,7 @@ mp\_int variables based on their digits only.
 
 \index{mp\_cmp\_mag}
 \begin{alltt}
-int mp_cmp(mp_int * a, mp_int * b);
+int mp_cmp_mag(mp_int * a, mp_int * b);
 \end{alltt}
 This will compare $a$ to $b$ placing $a$ to the left of $b$.  This function cannot fail and will return one of the
 three compare codes listed in figure \ref{fig:CMP}.
@@ -1220,12 +1223,13 @@ int mp_sqr (mp_int * a, mp_int * b);
 \end{alltt}
 
 Will square $a$ and store it in $b$.  Like the case of multiplication there are four different squaring
-algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms.
+algorithms all of which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms because
+of the speed difference.  
 
 \section{Tuning Polynomial Basis Routines}
 
 Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
-the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectfully they require 
+the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectively they require 
 considerably less work.  For example, a 10000-digit multiplication would take roughly 724,000 single precision
 multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor
 of 138).
@@ -1297,14 +1301,14 @@ of $b$.  This algorithm accepts an input $a$ of any range and is not limited by
 \section{Barrett Reduction}
 
 Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve
-a decent speedup over straight division.  First a $mu$ value must be precomputed with the following function.
+a decent speedup over straight division.  First a $\mu$ value must be precomputed with the following function.
 
 \index{mp\_reduce\_setup}
 \begin{alltt}
 int mp_reduce_setup(mp_int *a, mp_int *b);
 \end{alltt}
 
-Given a modulus in $b$ this produces the required $mu$ value in $a$.  For any given modulus this only has to
+Given a modulus in $b$ this produces the required $\mu$ value in $a$.  For any given modulus this only has to
 be computed once.  Modular reduction can now be performed with the following.
 
 \index{mp\_reduce}
@@ -1312,7 +1316,7 @@ be computed once.  Modular reduction can now be performed with the following.
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 \end{alltt}
 
-This will reduce $a$ in place modulo $b$ with the precomputed $mu$ value in $c$.  $a$ must be in the range
+This will reduce $a$ in place modulo $b$ with the precomputed $\mu$ value in $c$.  $a$ must be in the range
 $0 \le a < b^2$.
 
 \begin{alltt}
@@ -1578,7 +1582,8 @@ will return $-2$.
 This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly.  Since
 the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large
 values of $b$.  If particularly large roots are required then a factor method could be used instead.  For example,
-$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$.
+$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$ or simply 
+$\left ( \left ( \left ( a^{1/2} \right )^{1/2} \right )^{1/2} \right )^{1/2}$.
 
 \chapter{Prime Numbers}
 \section{Trial Division}
diff --git a/libtommath/bn_error.c b/libtommath/bn_error.c
index 1546784..6393bb0 100644
--- a/libtommath/bn_error.c
+++ b/libtommath/bn_error.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 static const struct {
diff --git a/libtommath/bn_fast_mp_invmod.c b/libtommath/bn_fast_mp_invmod.c
index b5b9f10..fafd9dc 100644
--- a/libtommath/bn_fast_mp_invmod.c
+++ b/libtommath/bn_fast_mp_invmod.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes the modular inverse via binary extended euclidean algorithm, 
@@ -21,8 +21,7 @@
  * Based on slow invmod except this is optimized for the case where b is 
  * odd as per HAC Note 14.64 on pp. 610
  */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
   mp_int  x, y, u, v, B, D;
   int     res, neg;
@@ -43,7 +42,7 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
     goto LBL_ERR;
   }
 
diff --git a/libtommath/bn_fast_mp_montgomery_reduce.c b/libtommath/bn_fast_mp_montgomery_reduce.c
index 7373ae6..e941dc2 100644
--- a/libtommath/bn_fast_mp_montgomery_reduce.c
+++ b/libtommath/bn_fast_mp_montgomery_reduce.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes xR**-1 == x (mod N) via Montgomery Reduction
@@ -23,8 +23,7 @@
  *
  * Based on Algorithm 14.32 on pp.601 of HAC.
 */
-int
-fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
   int     ix, res, olduse;
   mp_word W[MP_WARRAY];
diff --git a/libtommath/bn_fast_s_mp_mul_digs.c b/libtommath/bn_fast_s_mp_mul_digs.c
index e1ff5f3..ab157b9 100644
--- a/libtommath/bn_fast_s_mp_mul_digs.c
+++ b/libtommath/bn_fast_s_mp_mul_digs.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Fast (comba) multiplier
@@ -31,8 +31,7 @@
  * Based on Algorithm 14.12 on pp.595 of HAC.
  *
  */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -63,7 +62,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       tmpx = a->dp + tx;
       tmpy = b->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -71,6 +70,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       /* execute loop */
       for (iz = 0; iz < iy; ++iz) {
          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+
       }
 
       /* store term */
@@ -78,19 +78,16 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
       /* make next carry */
       _W = _W >> ((mp_word)DIGIT_BIT);
-  }
-
-  /* store final carry */
-  W[ix] = _W;
+ }
 
   /* setup dest */
   olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
 
   {
     register mp_digit *tmpc;
     tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
diff --git a/libtommath/bn_fast_s_mp_mul_high_digs.c b/libtommath/bn_fast_s_mp_mul_high_digs.c
index 064a9dd..ec9f58a 100644
--- a/libtommath/bn_fast_s_mp_mul_high_digs.c
+++ b/libtommath/bn_fast_s_mp_mul_high_digs.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* this is a modified version of fast_s_mul_digs that only produces
@@ -24,8 +24,7 @@
  *
  * Based on Algorithm 14.12 on pp.595 of HAC.
  */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -71,9 +70,6 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
   
-  /* store final carry */
-  W[ix] = _W;
-
   /* setup dest */
   olduse  = c->used;
   c->used = pa;
@@ -82,7 +78,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     register mp_digit *tmpc;
 
     tmpc = c->dp + digs;
-    for (ix = digs; ix <= pa; ix++) {
+    for (ix = digs; ix < pa; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
diff --git a/libtommath/bn_fast_s_mp_sqr.c b/libtommath/bn_fast_s_mp_sqr.c
index d6014ab..1abf24b 100644
--- a/libtommath/bn_fast_s_mp_sqr.c
+++ b/libtommath/bn_fast_s_mp_sqr.c
@@ -12,36 +12,17 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
 
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
 
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -76,7 +57,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -101,7 +82,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
       }
 
       /* store it */
-      W[ix] = _W;
+      W[ix] = (mp_digit)(_W & MP_MASK);
 
       /* make next carry */
       W1 = _W >> ((mp_word)DIGIT_BIT);
diff --git a/libtommath/bn_mp_2expt.c b/libtommath/bn_mp_2expt.c
index 45a6818..a32572d 100644
--- a/libtommath/bn_mp_2expt.c
+++ b/libtommath/bn_mp_2expt.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes a = 2**b 
diff --git a/libtommath/bn_mp_abs.c b/libtommath/bn_mp_abs.c
index 34f810f..dc51884 100644
--- a/libtommath/bn_mp_abs.c
+++ b/libtommath/bn_mp_abs.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* b = |a| 
diff --git a/libtommath/bn_mp_add.c b/libtommath/bn_mp_add.c
index 554b7f7..d9b8fa5 100644
--- a/libtommath/bn_mp_add.c
+++ b/libtommath/bn_mp_add.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* high level addition (handles signs) */
diff --git a/libtommath/bn_mp_add_d.c b/libtommath/bn_mp_add_d.c
index bdd0280..5281ad4 100644
--- a/libtommath/bn_mp_add_d.c
+++ b/libtommath/bn_mp_add_d.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* single digit addition */
@@ -37,8 +37,12 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
      /* c = |a| - b */
      res = mp_sub_d(a, b, c);
 
-     /* fix sign  */
-     a->sign = c->sign = MP_NEG;
+     /* fix signs  */
+     a->sign = MP_NEG;
+     c->sign = (c->used) ? MP_NEG : MP_ZPOS;
+
+     /* clamp */
+     mp_clamp(c);
 
      return res;
   }
diff --git a/libtommath/bn_mp_addmod.c b/libtommath/bn_mp_addmod.c
index 13eb33f..bff193f 100644
--- a/libtommath/bn_mp_addmod.c
+++ b/libtommath/bn_mp_addmod.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* d = a + b (mod c) */
diff --git a/libtommath/bn_mp_and.c b/libtommath/bn_mp_and.c
index 61dc386..02bef18 100644
--- a/libtommath/bn_mp_and.c
+++ b/libtommath/bn_mp_and.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* AND two ints together */
diff --git a/libtommath/bn_mp_clamp.c b/libtommath/bn_mp_clamp.c
index c172611..74887bb 100644
--- a/libtommath/bn_mp_clamp.c
+++ b/libtommath/bn_mp_clamp.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* trim unused digits 
diff --git a/libtommath/bn_mp_clear.c b/libtommath/bn_mp_clear.c
index 5342648..bd07e76 100644
--- a/libtommath/bn_mp_clear.c
+++ b/libtommath/bn_mp_clear.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* clear one (frees)  */
diff --git a/libtommath/bn_mp_clear_multi.c b/libtommath/bn_mp_clear_multi.c
index 24cbe73..c3ad7a8 100644
--- a/libtommath/bn_mp_clear_multi.c
+++ b/libtommath/bn_mp_clear_multi.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 #include <stdarg.h>
 
diff --git a/libtommath/bn_mp_cmp.c b/libtommath/bn_mp_cmp.c
index 583b5f8..943249d 100644
--- a/libtommath/bn_mp_cmp.c
+++ b/libtommath/bn_mp_cmp.c
@@ -12,12 +12,12 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* compare two ints (signed)*/
 int
-mp_cmp (mp_int * a, mp_int * b)
+mp_cmp (const mp_int * a, const mp_int * b)
 {
   /* compare based on sign */
   if (a->sign != b->sign) {
diff --git a/libtommath/bn_mp_cmp_d.c b/libtommath/bn_mp_cmp_d.c
index 882b1c9..ecec091 100644
--- a/libtommath/bn_mp_cmp_d.c
+++ b/libtommath/bn_mp_cmp_d.c
@@ -12,11 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* compare a digit */
-int mp_cmp_d(mp_int * a, mp_digit b)
+int mp_cmp_d(const mp_int * a, mp_digit b)
 {
   /* compare based on sign */
   if (a->sign == MP_NEG) {
diff --git a/libtommath/bn_mp_cmp_mag.c b/libtommath/bn_mp_cmp_mag.c
index a0f351c..b23a191 100644
--- a/libtommath/bn_mp_cmp_mag.c
+++ b/libtommath/bn_mp_cmp_mag.c
@@ -12,11 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* compare maginitude of two ints (unsigned) */
-int mp_cmp_mag (mp_int * a, mp_int * b)
+int mp_cmp_mag (const mp_int * a, const mp_int * b)
 {
   int     n;
   mp_digit *tmpa, *tmpb;
diff --git a/libtommath/bn_mp_cnt_lsb.c b/libtommath/bn_mp_cnt_lsb.c
index 571f03f..f205e8c 100644
--- a/libtommath/bn_mp_cnt_lsb.c
+++ b/libtommath/bn_mp_cnt_lsb.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 static const int lnz[16] = { 
@@ -20,7 +20,7 @@ static const int lnz[16] = {
 };
 
 /* Counts the number of lsbs which are zero before the first zero bit */
-int mp_cnt_lsb(mp_int *a)
+int mp_cnt_lsb(const mp_int *a)
 {
    int x;
    mp_digit q, qq;
diff --git a/libtommath/bn_mp_copy.c b/libtommath/bn_mp_copy.c
index 183ec9b..ffbc0d4 100644
--- a/libtommath/bn_mp_copy.c
+++ b/libtommath/bn_mp_copy.c
@@ -12,12 +12,12 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* copy, b = a */
 int
-mp_copy (mp_int * a, mp_int * b)
+mp_copy (const mp_int * a, mp_int * b)
 {
   int     res, n;
 
diff --git a/libtommath/bn_mp_count_bits.c b/libtommath/bn_mp_count_bits.c
index f3f85ac..00d364e 100644
--- a/libtommath/bn_mp_count_bits.c
+++ b/libtommath/bn_mp_count_bits.c
@@ -12,12 +12,12 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* returns the number of bits in an int */
 int
-mp_count_bits (mp_int * a)
+mp_count_bits (const mp_int * a)
 {
   int     r;
   mp_digit q;
diff --git a/libtommath/bn_mp_div.c b/libtommath/bn_mp_div.c
index 6b2b8f0..de4ca04 100644
--- a/libtommath/bn_mp_div.c
+++ b/libtommath/bn_mp_div.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 #ifdef BN_MP_DIV_SMALL
diff --git a/libtommath/bn_mp_div_2.c b/libtommath/bn_mp_div_2.c
index 5777997..186a959 100644
--- a/libtommath/bn_mp_div_2.c
+++ b/libtommath/bn_mp_div_2.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* b = a/2 */
diff --git a/libtommath/bn_mp_div_2d.c b/libtommath/bn_mp_div_2d.c
index cf103f2..d7b7e05 100644
--- a/libtommath/bn_mp_div_2d.c
+++ b/libtommath/bn_mp_div_2d.c
@@ -12,11 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
-int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+int mp_div_2d (const mp_int * a, int b, mp_int * c, mp_int * d)
 {
   mp_digit D, r, rr;
   int     x, res;
diff --git a/libtommath/bn_mp_div_3.c b/libtommath/bn_mp_div_3.c
index 7cbafc1..79a9816 100644
--- a/libtommath/bn_mp_div_3.c
+++ b/libtommath/bn_mp_div_3.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* divide by three (based on routine from MPI and the GMP manual) */
diff --git a/libtommath/bn_mp_div_d.c b/libtommath/bn_mp_div_d.c
index 9b58aa6..af18d0a 100644
--- a/libtommath/bn_mp_div_d.c
+++ b/libtommath/bn_mp_div_d.c
@@ -12,13 +12,17 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 static int s_is_power_of_two(mp_digit b, int *p)
 {
    int x;
 
+   /* quick out - if (b & (b-1)) isn't zero, b isn't a power of two */
+   if ((b==0) || (b & (b-1))) {
+       return 0;
+   }
    for (x = 1; x < DIGIT_BIT; x++) {
       if (b == (((mp_digit)1)<<x)) {
          *p = x;
diff --git a/libtommath/bn_mp_dr_is_modulus.c b/libtommath/bn_mp_dr_is_modulus.c
index 5ef78a3..8ad31dc 100644
--- a/libtommath/bn_mp_dr_is_modulus.c
+++ b/libtommath/bn_mp_dr_is_modulus.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines if a number is a valid DR modulus */
diff --git a/libtommath/bn_mp_dr_reduce.c b/libtommath/bn_mp_dr_reduce.c
index 9bb7ad7..8337591 100644
--- a/libtommath/bn_mp_dr_reduce.c
+++ b/libtommath/bn_mp_dr_reduce.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
diff --git a/libtommath/bn_mp_dr_setup.c b/libtommath/bn_mp_dr_setup.c
index 029d310..de00e2d 100644
--- a/libtommath/bn_mp_dr_setup.c
+++ b/libtommath/bn_mp_dr_setup.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines the setup value */
diff --git a/libtommath/bn_mp_exch.c b/libtommath/bn_mp_exch.c
index 0ef485a..b7bd186 100644
--- a/libtommath/bn_mp_exch.c
+++ b/libtommath/bn_mp_exch.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* swap the elements of two integers, for cases where you can't simply swap the 
diff --git a/libtommath/bn_mp_expt_d.c b/libtommath/bn_mp_expt_d.c
index fdb8bd9..132f480 100644
--- a/libtommath/bn_mp_expt_d.c
+++ b/libtommath/bn_mp_expt_d.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* calculate c = a**b  using a square-multiply algorithm */
diff --git a/libtommath/bn_mp_exptmod.c b/libtommath/bn_mp_exptmod.c
index 7309170..b7d9fb7 100644
--- a/libtommath/bn_mp_exptmod.c
+++ b/libtommath/bn_mp_exptmod.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 
@@ -65,21 +65,29 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 #endif
   }
 
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
+
 #ifdef BN_MP_DR_IS_MODULUS_C
   /* is it a DR modulus? */
   dr = mp_dr_is_modulus(P);
 #else
+  /* default to no */
   dr = 0;
 #endif
 
 #ifdef BN_MP_REDUCE_IS_2K_C
-  /* if not, is it a uDR modulus? */
+  /* if not, is it an unrestricted DR modulus? */
   if (dr == 0) {
      dr = mp_reduce_is_2k(P) << 1;
   }
 #endif
     
-  /* if the modulus is odd or dr != 0 use the fast method */
+  /* if the modulus is odd or dr != 0 use the montgomery method */
 #ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == 1 || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
@@ -87,7 +95,7 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 #endif
 #ifdef BN_S_MP_EXPTMOD_C
     /* otherwise use the generic Barrett reduction technique */
-    return s_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y, 0);
 #else
     /* no exptmod for evens */
     return MP_VAL;
diff --git a/libtommath/bn_mp_exptmod_fast.c b/libtommath/bn_mp_exptmod_fast.c
index 255e9d9..1902e79 100644
--- a/libtommath/bn_mp_exptmod_fast.c
+++ b/libtommath/bn_mp_exptmod_fast.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
@@ -29,8 +29,7 @@
    #define TAB_SIZE 256
 #endif
 
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
@@ -315,4 +314,3 @@ LBL_M:
   return err;
 }
 #endif
-
diff --git a/libtommath/bn_mp_exteuclid.c b/libtommath/bn_mp_exteuclid.c
index 545450b..2e69ce1 100644
--- a/libtommath/bn_mp_exteuclid.c
+++ b/libtommath/bn_mp_exteuclid.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Extended euclidean algorithm of (a, b) produces 
@@ -59,6 +59,13 @@ int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
        if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
    }
 
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
    /* copy result out */
    if (U1 != NULL) { mp_exch(U1, &u1); }
    if (U2 != NULL) { mp_exch(U2, &u2); }
diff --git a/libtommath/bn_mp_fread.c b/libtommath/bn_mp_fread.c
index 293df3f..44e1ea8 100644
--- a/libtommath/bn_mp_fread.c
+++ b/libtommath/bn_mp_fread.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* read a bigint from a file stream in ASCII */
diff --git a/libtommath/bn_mp_fwrite.c b/libtommath/bn_mp_fwrite.c
index 8fa3129..b0ec29e 100644
--- a/libtommath/bn_mp_fwrite.c
+++ b/libtommath/bn_mp_fwrite.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 int mp_fwrite(mp_int *a, int radix, FILE *stream)
diff --git a/libtommath/bn_mp_gcd.c b/libtommath/bn_mp_gcd.c
index 6265df1..68cfa03 100644
--- a/libtommath/bn_mp_gcd.c
+++ b/libtommath/bn_mp_gcd.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Greatest Common Divisor using the binary method */
@@ -22,21 +22,13 @@ int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   int     k, u_lsb, v_lsb, res;
 
   /* either zero than gcd is the largest */
-  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
+  if (mp_iszero (a) == MP_YES) {
     return mp_abs (b, c);
   }
-  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
+  if (mp_iszero (b) == MP_YES) {
     return mp_abs (a, c);
   }
 
-  /* optimized.  At this point if a == 0 then
-   * b must equal zero too
-   */
-  if (mp_iszero (a) == 1) {
-    mp_zero(c);
-    return MP_OKAY;
-  }
-
   /* get copies of a and b we can modify */
   if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
     return res;
diff --git a/libtommath/bn_mp_get_int.c b/libtommath/bn_mp_get_int.c
index 034467b..762cb23 100644
--- a/libtommath/bn_mp_get_int.c
+++ b/libtommath/bn_mp_get_int.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* get the lower 32-bits of an mp_int */
diff --git a/libtommath/bn_mp_grow.c b/libtommath/bn_mp_grow.c
index 12a78a8..b5b2407 100644
--- a/libtommath/bn_mp_grow.c
+++ b/libtommath/bn_mp_grow.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* grow as required */
diff --git a/libtommath/bn_mp_init.c b/libtommath/bn_mp_init.c
index 9d70554..ddb2d07 100644
--- a/libtommath/bn_mp_init.c
+++ b/libtommath/bn_mp_init.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* init a new mp_int */
diff --git a/libtommath/bn_mp_init_copy.c b/libtommath/bn_mp_init_copy.c
index b1b0fa2..2410a9f 100644
--- a/libtommath/bn_mp_init_copy.c
+++ b/libtommath/bn_mp_init_copy.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* creates "a" then copies b into it */
diff --git a/libtommath/bn_mp_init_multi.c b/libtommath/bn_mp_init_multi.c
index 8cb123a..44e3fe6 100644
--- a/libtommath/bn_mp_init_multi.c
+++ b/libtommath/bn_mp_init_multi.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 #include <stdarg.h>
 
diff --git a/libtommath/bn_mp_init_set.c b/libtommath/bn_mp_init_set.c
index 0251e61..dc08867 100644
--- a/libtommath/bn_mp_init_set.c
+++ b/libtommath/bn_mp_init_set.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* initialize and set a digit */
diff --git a/libtommath/bn_mp_init_set_int.c b/libtommath/bn_mp_init_set_int.c
index f59fd19..56b27e0 100644
--- a/libtommath/bn_mp_init_set_int.c
+++ b/libtommath/bn_mp_init_set_int.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* initialize and set a digit */
diff --git a/libtommath/bn_mp_init_size.c b/libtommath/bn_mp_init_size.c
index 845ce2c..8ed2c2a 100644
--- a/libtommath/bn_mp_init_size.c
+++ b/libtommath/bn_mp_init_size.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* init an mp_init for a given size */
diff --git a/libtommath/bn_mp_invmod.c b/libtommath/bn_mp_invmod.c
index 46118ad..fdb6c88 100644
--- a/libtommath/bn_mp_invmod.c
+++ b/libtommath/bn_mp_invmod.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* hac 14.61, pp608 */
diff --git a/libtommath/bn_mp_invmod_slow.c b/libtommath/bn_mp_invmod_slow.c
index c1884c0..e079819 100644
--- a/libtommath/bn_mp_invmod_slow.c
+++ b/libtommath/bn_mp_invmod_slow.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* hac 14.61, pp608 */
@@ -33,8 +33,8 @@ int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto LBL_ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
     goto LBL_ERR;
diff --git a/libtommath/bn_mp_is_square.c b/libtommath/bn_mp_is_square.c
index 969d237..926b449 100644
--- a/libtommath/bn_mp_is_square.c
+++ b/libtommath/bn_mp_is_square.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Check if remainders are possible squares - fast exclude non-squares */
diff --git a/libtommath/bn_mp_jacobi.c b/libtommath/bn_mp_jacobi.c
index 74cbbf3..1644698 100644
--- a/libtommath/bn_mp_jacobi.c
+++ b/libtommath/bn_mp_jacobi.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes the jacobi c = (a | n) (or Legendre if n is prime)
diff --git a/libtommath/bn_mp_karatsuba_mul.c b/libtommath/bn_mp_karatsuba_mul.c
index daa78c7..0d62b9b 100644
--- a/libtommath/bn_mp_karatsuba_mul.c
+++ b/libtommath/bn_mp_karatsuba_mul.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* c = |a| * |b| using Karatsuba Multiplication using 
@@ -26,12 +26,12 @@
  * b = b1 * B**n + b0
  *
  * Then, a * b => 
-   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+   a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B**n + a0b0
  *
  * Note that a1b1 and a0b0 are used twice and only need to be 
  * computed once.  So in total three half size (half # of 
  * digit) multiplications are performed, a0b0, a1b1 and 
- * (a1-b1)(a0-b0)
+ * (a1+a0)(b1+b0)
  *
  * Note that a multiplication of half the digits requires
  * 1/4th the number of single precision multiplications so in 
@@ -122,19 +122,19 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
     goto X1Y1;          /* x1y1 = x1*y1 */
 
-  /* now calc x1-x0 and y1-y0 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+  /* now calc x1+x0 and y1+y0 */
+  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
     goto X1Y1;          /* t1 = x1 - x0 */
-  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+  if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
     goto X1Y1;          /* t2 = y1 - y0 */
   if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+    goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
 
   /* add x0y0 */
   if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
     goto X1Y1;          /* t2 = x0y0 + x1y1 */
-  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+  if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
diff --git a/libtommath/bn_mp_karatsuba_sqr.c b/libtommath/bn_mp_karatsuba_sqr.c
index 315ceab..829405a 100644
--- a/libtommath/bn_mp_karatsuba_sqr.c
+++ b/libtommath/bn_mp_karatsuba_sqr.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Karatsuba squaring, computes b = a*a using three 
@@ -80,8 +80,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b)
   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
     goto X1X1;           /* x1x1 = x1*x1 */
 
-  /* now calc (x1-x0)**2 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+  /* now calc (x1+x0)**2 */
+  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
     goto X1X1;           /* t1 = x1 - x0 */
   if (mp_sqr (&t1, &t1) != MP_OKAY)
     goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
@@ -89,8 +89,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b)
   /* add x0y0 */
   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
     goto X1X1;           /* t2 = x0x0 + x1x1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+  if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
diff --git a/libtommath/bn_mp_lcm.c b/libtommath/bn_mp_lcm.c
index 8e3a759..1d53921 100644
--- a/libtommath/bn_mp_lcm.c
+++ b/libtommath/bn_mp_lcm.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes least common multiple as |a*b|/(a, b) */
diff --git a/libtommath/bn_mp_lshd.c b/libtommath/bn_mp_lshd.c
index 398b648..ce1e63b 100644
--- a/libtommath/bn_mp_lshd.c
+++ b/libtommath/bn_mp_lshd.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shift left a certain amount of digits */
diff --git a/libtommath/bn_mp_mod.c b/libtommath/bn_mp_mod.c
index 75779bb..98e155e 100644
--- a/libtommath/bn_mp_mod.c
+++ b/libtommath/bn_mp_mod.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* c = a mod b, 0 <= c < b */
diff --git a/libtommath/bn_mp_mod_2d.c b/libtommath/bn_mp_mod_2d.c
index 589e4ba..0170f65 100644
--- a/libtommath/bn_mp_mod_2d.c
+++ b/libtommath/bn_mp_mod_2d.c
@@ -12,12 +12,12 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* calc a value mod 2**b */
 int
-mp_mod_2d (mp_int * a, int b, mp_int * c)
+mp_mod_2d (const mp_int * a, int b, mp_int * c)
 {
   int     x, res;
 
diff --git a/libtommath/bn_mp_mod_d.c b/libtommath/bn_mp_mod_d.c
index 8a2ad24..f642ee8 100644
--- a/libtommath/bn_mp_mod_d.c
+++ b/libtommath/bn_mp_mod_d.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 int
diff --git a/libtommath/bn_mp_montgomery_calc_normalization.c b/libtommath/bn_mp_montgomery_calc_normalization.c
index 0a760cf..0748762 100644
--- a/libtommath/bn_mp_montgomery_calc_normalization.c
+++ b/libtommath/bn_mp_montgomery_calc_normalization.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /*
@@ -28,7 +28,6 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-
   if (b->used > 1) {
      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
         return res;
diff --git a/libtommath/bn_mp_montgomery_reduce.c b/libtommath/bn_mp_montgomery_reduce.c
index 3095fa7..bc6abb8 100644
--- a/libtommath/bn_mp_montgomery_reduce.c
+++ b/libtommath/bn_mp_montgomery_reduce.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
diff --git a/libtommath/bn_mp_montgomery_setup.c b/libtommath/bn_mp_montgomery_setup.c
index 9dfc087..b8e1887 100644
--- a/libtommath/bn_mp_montgomery_setup.c
+++ b/libtommath/bn_mp_montgomery_setup.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* setups the montgomery reduction stuff */
@@ -48,7 +48,7 @@ mp_montgomery_setup (mp_int * n, mp_digit * rho)
 #endif
 
   /* rho = -1/m mod b */
-  *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
+  *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
 
   return MP_OKAY;
 }
diff --git a/libtommath/bn_mp_mul.c b/libtommath/bn_mp_mul.c
index f9cfa09..fc024be 100644
--- a/libtommath/bn_mp_mul.c
+++ b/libtommath/bn_mp_mul.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* high level multiplication (handles sign) */
diff --git a/libtommath/bn_mp_mul_2.c b/libtommath/bn_mp_mul_2.c
index 6936681..2ca6022 100644
--- a/libtommath/bn_mp_mul_2.c
+++ b/libtommath/bn_mp_mul_2.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* b = a*2 */
diff --git a/libtommath/bn_mp_mul_2d.c b/libtommath/bn_mp_mul_2d.c
index 04cb8dd..4ac2e4e 100644
--- a/libtommath/bn_mp_mul_2d.c
+++ b/libtommath/bn_mp_mul_2d.c
@@ -12,11 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shift left by a certain bit count */
-int mp_mul_2d (mp_int * a, int b, mp_int * c)
+int mp_mul_2d (const mp_int * a, int b, mp_int * c)
 {
   mp_digit d;
   int      res;
diff --git a/libtommath/bn_mp_mul_d.c b/libtommath/bn_mp_mul_d.c
index f936361..ba45a0c 100644
--- a/libtommath/bn_mp_mul_d.c
+++ b/libtommath/bn_mp_mul_d.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* multiply by a digit */
@@ -57,8 +57,9 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
   }
 
-  /* store final carry [if any] */
+  /* store final carry [if any] and increment ix offset  */
   *tmpc++ = u;
+  ++ix;
 
   /* now zero digits above the top */
   while (ix++ < olduse) {
diff --git a/libtommath/bn_mp_mulmod.c b/libtommath/bn_mp_mulmod.c
index d34e90a..649b717 100644
--- a/libtommath/bn_mp_mulmod.c
+++ b/libtommath/bn_mp_mulmod.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* d = a * b (mod c) */
-int
-mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
   int     res;
   mp_int  t;
diff --git a/libtommath/bn_mp_n_root.c b/libtommath/bn_mp_n_root.c
index 7b11aa2..b2700a8 100644
--- a/libtommath/bn_mp_n_root.c
+++ b/libtommath/bn_mp_n_root.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* find the n'th root of an integer 
diff --git a/libtommath/bn_mp_neg.c b/libtommath/bn_mp_neg.c
index 3a991db..07fb148 100644
--- a/libtommath/bn_mp_neg.c
+++ b/libtommath/bn_mp_neg.c
@@ -12,19 +12,25 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* b = -a */
-int mp_neg (mp_int * a, mp_int * b)
+int mp_neg (const mp_int * a, mp_int * b)
 {
   int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
   }
+
   if (mp_iszero(b) != MP_YES) {
      b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  } else {
+     b->sign = MP_ZPOS;
   }
+
   return MP_OKAY;
 }
 #endif
diff --git a/libtommath/bn_mp_or.c b/libtommath/bn_mp_or.c
index dccee7e..aa5b1bd 100644
--- a/libtommath/bn_mp_or.c
+++ b/libtommath/bn_mp_or.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* OR two ints together */
diff --git a/libtommath/bn_mp_prime_fermat.c b/libtommath/bn_mp_prime_fermat.c
index fd74dbe..7b9b12e 100644
--- a/libtommath/bn_mp_prime_fermat.c
+++ b/libtommath/bn_mp_prime_fermat.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* performs one Fermat test.
diff --git a/libtommath/bn_mp_prime_is_divisible.c b/libtommath/bn_mp_prime_is_divisible.c
index f85fe7c..710c967 100644
--- a/libtommath/bn_mp_prime_is_divisible.c
+++ b/libtommath/bn_mp_prime_is_divisible.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines if an integers is divisible by one 
diff --git a/libtommath/bn_mp_prime_is_prime.c b/libtommath/bn_mp_prime_is_prime.c
index 188053a..ce225a3 100644
--- a/libtommath/bn_mp_prime_is_prime.c
+++ b/libtommath/bn_mp_prime_is_prime.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* performs a variable number of rounds of Miller-Rabin
diff --git a/libtommath/bn_mp_prime_miller_rabin.c b/libtommath/bn_mp_prime_miller_rabin.c
index 758a2c3..c5185b8 100644
--- a/libtommath/bn_mp_prime_miller_rabin.c
+++ b/libtommath/bn_mp_prime_miller_rabin.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Miller-Rabin test of "a" to the base of "b" as described in 
diff --git a/libtommath/bn_mp_prime_next_prime.c b/libtommath/bn_mp_prime_next_prime.c
index 24f93c4..2433e8c 100644
--- a/libtommath/bn_mp_prime_next_prime.c
+++ b/libtommath/bn_mp_prime_next_prime.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* finds the next prime after the number "a" using "t" trials
@@ -143,7 +143,7 @@ int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
 
       /* is this prime? */
       for (x = 0; x < t; x++) {
-          mp_set(&b, ltm_prime_tab[t]);
+          mp_set(&b, ltm_prime_tab[x]);
           if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
              goto LBL_ERR;
           }
diff --git a/libtommath/bn_mp_prime_rabin_miller_trials.c b/libtommath/bn_mp_prime_rabin_miller_trials.c
index d1d0867..e57a43c 100644
--- a/libtommath/bn_mp_prime_rabin_miller_trials.c
+++ b/libtommath/bn_mp_prime_rabin_miller_trials.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 
diff --git a/libtommath/bn_mp_prime_random_ex.c b/libtommath/bn_mp_prime_random_ex.c
index 2010ebe..a37477e 100644
--- a/libtommath/bn_mp_prime_random_ex.c
+++ b/libtommath/bn_mp_prime_random_ex.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* makes a truly random prime of a given size (bits),
@@ -60,15 +60,13 @@ int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback
 
    /* calc the maskOR_msb */
    maskOR_msb        = 0;
-   maskOR_msb_offset = (size - 2) >> 3;
+   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
    if (flags & LTM_PRIME_2MSB_ON) {
-      maskOR_msb     |= 1 << ((size - 2) & 7);
-   } else if (flags & LTM_PRIME_2MSB_OFF) {
-      maskAND        &= ~(1 << ((size - 2) & 7));
-   } 
+      maskOR_msb       |= 0x80 >> ((9 - size) & 7);
+   }  
 
    /* get the maskOR_lsb */
-   maskOR_lsb         = 0;
+   maskOR_lsb         = 1;
    if (flags & LTM_PRIME_BBS) {
       maskOR_lsb     |= 3;
    }
diff --git a/libtommath/bn_mp_radix_size.c b/libtommath/bn_mp_radix_size.c
index 30b78d9..40c4d04 100644
--- a/libtommath/bn_mp_radix_size.c
+++ b/libtommath/bn_mp_radix_size.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* returns size of ASCII reprensentation */
@@ -35,22 +35,29 @@ int mp_radix_size (mp_int * a, int radix, int *size)
     return MP_VAL;
   }
 
-  /* init a copy of the input */
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
+  if (mp_iszero(a) == MP_YES) {
+    *size = 2;
+    return MP_OKAY;
   }
 
   /* digs is the digit count */
   digs = 0;
 
   /* if it's negative add one for the sign */
-  if (t.sign == MP_NEG) {
+  if (a->sign == MP_NEG) {
     ++digs;
-    t.sign = MP_ZPOS;
   }
 
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
+
   /* fetch out all of the digits */
-  while (mp_iszero (&t) == 0) {
+  while (mp_iszero (&t) == MP_NO) {
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
@@ -59,8 +66,17 @@ int mp_radix_size (mp_int * a, int radix, int *size)
   }
   mp_clear (&t);
 
-  /* return digs + 1, the 1 is for the NULL byte that would be required. */
-  *size = digs + 1;
+  /* 
+   * return digs + 1, the 1 is for the NULL byte that would be required.
+   * mp_toradix_n requires a minimum of 3 bytes, so never report less than
+   * that.
+   */
+
+  if ( digs >= 2 ) {
+      *size = digs + 1;
+  } else {
+      *size = 3;
+  }
   return MP_OKAY;
 }
 
diff --git a/libtommath/bn_mp_radix_smap.c b/libtommath/bn_mp_radix_smap.c
index bc7517d..7aeb375 100644
--- a/libtommath/bn_mp_radix_smap.c
+++ b/libtommath/bn_mp_radix_smap.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* chars used in radix conversions */
diff --git a/libtommath/bn_mp_rand.c b/libtommath/bn_mp_rand.c
index 1cc47f1..17c1fbe 100644
--- a/libtommath/bn_mp_rand.c
+++ b/libtommath/bn_mp_rand.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* makes a pseudo-random int of a given size */
@@ -29,14 +29,14 @@ mp_rand (mp_int * a, int digits)
 
   /* first place a random non-zero digit */
   do {
-    d = ((mp_digit) abs (rand ()));
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
   } while (d == 0);
 
   if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
     return res;
   }
 
-  while (digits-- > 0) {
+  while (--digits > 0) {
     if ((res = mp_lshd (a, 1)) != MP_OKAY) {
       return res;
     }
diff --git a/libtommath/bn_mp_read_radix.c b/libtommath/bn_mp_read_radix.c
index 704bd0f..4b92589 100644
--- a/libtommath/bn_mp_read_radix.c
+++ b/libtommath/bn_mp_read_radix.c
@@ -12,15 +12,18 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* read a string [ASCII] in a given radix */
-int mp_read_radix (mp_int * a, char *str, int radix)
+int mp_read_radix (mp_int * a, const char *str, int radix)
 {
   int     y, res, neg;
   char    ch;
 
+  /* zero the digit bignum */
+  mp_zero(a);
+
   /* make sure the radix is ok */
   if (radix < 2 || radix > 64) {
     return MP_VAL;
@@ -45,7 +48,7 @@ int mp_read_radix (mp_int * a, char *str, int radix)
      * this allows numbers like 1AB and 1ab to represent the same  value
      * [e.g. in hex]
      */
-    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    ch = (char) ((radix < 36) ? toupper ((unsigned char) *str) : *str);
     for (y = 0; y < 64; y++) {
       if (ch == mp_s_rmap[y]) {
          break;
@@ -69,6 +72,13 @@ int mp_read_radix (mp_int * a, char *str, int radix)
     ++str;
   }
   
+  /* if an illegal character was found, fail. */
+
+  if ( *str != '\0' ) {
+      mp_zero( a );
+      return MP_VAL;
+  }
+
   /* set the sign only if a != 0 */
   if (mp_iszero(a) != 1) {
      a->sign = neg;
diff --git a/libtommath/bn_mp_read_signed_bin.c b/libtommath/bn_mp_read_signed_bin.c
index 814d6c1..3ee8556 100644
--- a/libtommath/bn_mp_read_signed_bin.c
+++ b/libtommath/bn_mp_read_signed_bin.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* read signed bin, big endian, first byte is 0==positive or 1==negative */
-int
-mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+int mp_read_signed_bin (mp_int * a, const unsigned char *b, int c)
 {
   int     res;
 
diff --git a/libtommath/bn_mp_read_unsigned_bin.c b/libtommath/bn_mp_read_unsigned_bin.c
index 946457d..caf5be0 100644
--- a/libtommath/bn_mp_read_unsigned_bin.c
+++ b/libtommath/bn_mp_read_unsigned_bin.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reads a unsigned char array, assumes the msb is stored first [big endian] */
-int
-mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
 {
   int     res;
 
diff --git a/libtommath/bn_mp_reduce.c b/libtommath/bn_mp_reduce.c
index cfcb55a..4375e4e 100644
--- a/libtommath/bn_mp_reduce.c
+++ b/libtommath/bn_mp_reduce.c
@@ -12,15 +12,14 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reduces x mod m, assumes 0 < x < m**2, mu is 
  * precomputed via mp_reduce_setup.
  * From HAC pp.604 Algorithm 14.42
  */
-int
-mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
 {
   mp_int  q;
   int     res, um = m->used;
@@ -40,11 +39,11 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
     }
   } else {
 #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #else 
diff --git a/libtommath/bn_mp_reduce_2k.c b/libtommath/bn_mp_reduce_2k.c
index a5a9c74..428f2ff 100644
--- a/libtommath/bn_mp_reduce_2k.c
+++ b/libtommath/bn_mp_reduce_2k.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reduces a modulo n where n is of the form 2**p - d */
-int
-mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
 {
    mp_int q;
    int    p, res;
diff --git a/libtommath/bn_mp_reduce_2k_l.c b/libtommath/bn_mp_reduce_2k_l.c
new file mode 100644
index 0000000..8e52efa
--- /dev/null
+++ b/libtommath/bn_mp_reduce_2k_l.c
@@ -0,0 +1,58 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* reduces a modulo n (in place) where n is of the form 2**p - d; unlike
+ * mp_reduce_2k, "d" is an mp_int and can be larger than a single digit.
+ * Returns MP_OKAY, or the error code of whichever step failed. */
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
+{
+   mp_int q;
+   int    p, res;
+
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+
+   p = mp_count_bits(n);
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   /* q = q * d */
+   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   if (mp_cmp_mag(a, n) != MP_LT) {      /* |a| >= |n|: take off one n, retry */
+      if ((res = s_mp_sub(a, n, a)) != MP_OKAY) {
+         goto ERR;
+      }
+      goto top;
+   }
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_reduce_2k_setup.c b/libtommath/bn_mp_reduce_2k_setup.c
index 5e1fb6e..ac043f6 100644
--- a/libtommath/bn_mp_reduce_2k_setup.c
+++ b/libtommath/bn_mp_reduce_2k_setup.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines the setup value */
-int 
-mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 {
    int res, p;
    mp_int tmp;
diff --git a/libtommath/bn_mp_reduce_2k_setup_l.c b/libtommath/bn_mp_reduce_2k_setup_l.c
new file mode 100644
index 0000000..b59a1ed
--- /dev/null
+++ b/libtommath/bn_mp_reduce_2k_setup_l.c
@@ -0,0 +1,40 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* determines the setup value: d = 2**p - a, where p = mp_count_bits(a) */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
+{
+   mp_int pow2;
+   int    err;
+
+   err = mp_init(&pow2);
+   if (err != MP_OKAY) {
+      return err;
+   }
+
+   /* pow2 = 2**p */
+   err = mp_2expt(&pow2, mp_count_bits(a));
+   if (err == MP_OKAY) {
+      /* d = pow2 - a (low level subtraction) */
+      err = s_mp_sub(&pow2, a, d);
+   }
+
+   /* the temporary is released on success and on failure alike */
+   mp_clear(&pow2);
+   return err;
+}
+#endif
diff --git a/libtommath/bn_mp_reduce_is_2k.c b/libtommath/bn_mp_reduce_is_2k.c
index fc81397..4655fcf 100644
--- a/libtommath/bn_mp_reduce_is_2k.c
+++ b/libtommath/bn_mp_reduce_is_2k.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines if mp_reduce_2k can be used */
@@ -22,9 +22,9 @@ int mp_reduce_is_2k(mp_int *a)
    mp_digit iz;
    
    if (a->used == 0) {
-      return 0;
+      return MP_NO;
    } else if (a->used == 1) {
-      return 1;
+      return MP_YES;
    } else if (a->used > 1) {
       iy = mp_count_bits(a);
       iz = 1;
@@ -33,7 +33,7 @@ int mp_reduce_is_2k(mp_int *a)
       /* Test every bit from the second digit up, must be 1 */
       for (ix = DIGIT_BIT; ix < iy; ix++) {
           if ((a->dp[iw] & iz) == 0) {
-             return 0;
+             return MP_NO;
           }
           iz <<= 1;
           if (iz > (mp_digit)MP_MASK) {
@@ -42,7 +42,7 @@ int mp_reduce_is_2k(mp_int *a)
           }
       }
    }
-   return 1;
+   return MP_YES;
 }
 
 #endif
diff --git a/libtommath/bn_mp_reduce_is_2k_l.c b/libtommath/bn_mp_reduce_is_2k_l.c
new file mode 100644
index 0000000..7b57865
--- /dev/null
+++ b/libtommath/bn_mp_reduce_is_2k_l.c
@@ -0,0 +1,40 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* determines if reduce_2k_l can be used; answers MP_YES or MP_NO */
+int mp_reduce_is_2k_l(mp_int *a)
+{
+   int idx, ones = 0;
+
+   /* no digits in use: never accepted */
+   if (a->used <= 0) {
+      return MP_NO;
+   }
+   /* a single digit is always accepted */
+   if (a->used == 1) {
+      return MP_YES;
+   }
+
+   /* tally the digits that equal MP_MASK (the "-1" digits) */
+   for (idx = 0; idx < a->used; idx++) {
+      ones += (a->dp[idx] == MP_MASK) ? 1 : 0;
+   }
+   /* accept when they number at least used/2 (integer division) */
+   return (ones >= (a->used / 2)) ? MP_YES : MP_NO;
+}
+
+#endif
diff --git a/libtommath/bn_mp_reduce_setup.c b/libtommath/bn_mp_reduce_setup.c
index 99f158a..d8cefd9 100644
--- a/libtommath/bn_mp_reduce_setup.c
+++ b/libtommath/bn_mp_reduce_setup.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* pre-calculate the value required for Barrett reduction
diff --git a/libtommath/bn_mp_rshd.c b/libtommath/bn_mp_rshd.c
index 913dda6..e6095b3 100644
--- a/libtommath/bn_mp_rshd.c
+++ b/libtommath/bn_mp_rshd.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shift right a certain amount of digits */
diff --git a/libtommath/bn_mp_set.c b/libtommath/bn_mp_set.c
index 078fd5f..c32fc42 100644
--- a/libtommath/bn_mp_set.c
+++ b/libtommath/bn_mp_set.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* set to a digit */
diff --git a/libtommath/bn_mp_set_int.c b/libtommath/bn_mp_set_int.c
index bd47136..b0fc344 100644
--- a/libtommath/bn_mp_set_int.c
+++ b/libtommath/bn_mp_set_int.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* set a 32-bit const */
diff --git a/libtommath/bn_mp_shrink.c b/libtommath/bn_mp_shrink.c
index b31f9d2..bfdf93a 100644
--- a/libtommath/bn_mp_shrink.c
+++ b/libtommath/bn_mp_shrink.c
@@ -12,19 +12,24 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shrink a bignum */
 int mp_shrink (mp_int * a)
 {
   mp_digit *tmp;
-  if (a->alloc != a->used && a->used > 0) {
-    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+  int used = 1;
+  
+  if(a->used > 0)
+    used = a->used;
+  
+  if (a->alloc != used) {
+    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * used)) == NULL) {
       return MP_MEM;
     }
     a->dp    = tmp;
-    a->alloc = a->used;
+    a->alloc = used;
   }
   return MP_OKAY;
 }
diff --git a/libtommath/bn_mp_signed_bin_size.c b/libtommath/bn_mp_signed_bin_size.c
index 30048cb..8f88e76 100644
--- a/libtommath/bn_mp_signed_bin_size.c
+++ b/libtommath/bn_mp_signed_bin_size.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* get the size for an signed equivalent */
diff --git a/libtommath/bn_mp_sqr.c b/libtommath/bn_mp_sqr.c
index b1fdb57..3938537 100644
--- a/libtommath/bn_mp_sqr.c
+++ b/libtommath/bn_mp_sqr.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes b = a*a */
diff --git a/libtommath/bn_mp_sqrmod.c b/libtommath/bn_mp_sqrmod.c
index 1923be4..6f90772 100644
--- a/libtommath/bn_mp_sqrmod.c
+++ b/libtommath/bn_mp_sqrmod.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* c = a * a (mod b) */
diff --git a/libtommath/bn_mp_sqrt.c b/libtommath/bn_mp_sqrt.c
index 76cec87..016b8ba 100644
--- a/libtommath/bn_mp_sqrt.c
+++ b/libtommath/bn_mp_sqrt.c
@@ -1,4 +1,5 @@
 #include <tommath.h>
+
 #ifdef BN_MP_SQRT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -12,14 +13,23 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
+#ifndef NO_FLOATING_POINT
+#include <math.h>
+#endif
+
 /* this function is less generic than mp_n_root, simpler and faster */
 int mp_sqrt(mp_int *arg, mp_int *ret) 
 {
   int res;
   mp_int t1,t2;
+  int i, j, k;
+#ifndef NO_FLOATING_POINT
+  volatile double d;
+  mp_digit dig;
+#endif
 
   /* must be positive */
   if (arg->sign == MP_NEG) {
@@ -31,17 +41,72 @@ int mp_sqrt(mp_int *arg, mp_int *ret)
     mp_zero(ret);
     return MP_OKAY;
   }
-
-  if ((res = mp_init_copy(&t1, arg)) != MP_OKAY) {
-    return res;
+  
+  i = (arg->used / 2) - 1;
+  j = 2 * i;
+  if ((res = mp_init_size(&t1, i+2)) != MP_OKAY) {
+      return res;
   }
-
+  
   if ((res = mp_init(&t2)) != MP_OKAY) {
     goto E2;
   }
 
-  /* First approx. (not very bad for large arg) */
-  mp_rshd (&t1,t1.used/2);
+  for (k = 0; k < i; ++k) {
+      t1.dp[k] = (mp_digit) 0;
+  }
+      
+#ifndef NO_FLOATING_POINT
+
+  /* Estimate the square root using the hardware floating point unit. */
+
+  d = 0.0;
+  for (k = arg->used-1; k >= j; --k) {
+      d = ldexp(d, DIGIT_BIT) + (double) (arg->dp[k]);
+  }
+
+  /* 
+   * At this point, d is the nearest floating point number to the most
+   * significant 1 or 2 mp_digits of arg. Extract its square root.
+   */
+     
+  d = sqrt(d);
+
+  /* dig is the most significant mp_digit of the square root */
+
+  dig = (mp_digit) ldexp(d, -DIGIT_BIT);
+
+  /* 
+   * If the most significant digit is nonzero, find the next digit down
+   * by subtracting 2**DIGIT_BIT times the most significant digit.
+   * Subtract one from the result so that our initial estimate is always
+   * low.
+   */
+
+  if (dig) {
+      t1.used = i+2;
+      d -= ldexp((double) dig, DIGIT_BIT);
+      if (d >= 1.0) {
+	  t1.dp[i+1] = dig;
+	  t1.dp[i] = ((mp_digit) d) - 1;
+      } else {
+	  t1.dp[i+1] = dig-1;
+	  t1.dp[i] = MP_DIGIT_MAX;
+      }
+  } else {
+      t1.used = i+1;
+      t1.dp[i] = ((mp_digit) d) - 1;
+  }
+
+#else
+
+  /* Estimate the square root as having 1 in the most significant place. */
+
+  t1.used = i + 2;
+  t1.dp[i+1] = (mp_digit) 1;
+  t1.dp[i] = (mp_digit) 0;
+
+#endif
 
   /* t1 > 0  */ 
   if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
diff --git a/libtommath/bn_mp_sub.c b/libtommath/bn_mp_sub.c
index 97495f4..13cb43e 100644
--- a/libtommath/bn_mp_sub.c
+++ b/libtommath/bn_mp_sub.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* high level subtraction (handles signs) */
diff --git a/libtommath/bn_mp_sub_d.c b/libtommath/bn_mp_sub_d.c
index 4923dde..b1e4e3f 100644
--- a/libtommath/bn_mp_sub_d.c
+++ b/libtommath/bn_mp_sub_d.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* single digit subtraction */
@@ -36,6 +36,10 @@ mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
      a->sign = MP_ZPOS;
      res     = mp_add_d(a, b, c);
      a->sign = c->sign = MP_NEG;
+
+     /* clamp */
+     mp_clamp(c);
+
      return res;
   }
 
diff --git a/libtommath/bn_mp_submod.c b/libtommath/bn_mp_submod.c
index b999c85..7461678 100644
--- a/libtommath/bn_mp_submod.c
+++ b/libtommath/bn_mp_submod.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* d = a - b (mod c) */
diff --git a/libtommath/bn_mp_to_signed_bin.c b/libtommath/bn_mp_to_signed_bin.c
index 0e40d0f..7871921 100644
--- a/libtommath/bn_mp_to_signed_bin.c
+++ b/libtommath/bn_mp_to_signed_bin.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* store in signed [big endian] format */
-int
-mp_to_signed_bin (mp_int * a, unsigned char *b)
+int mp_to_signed_bin (mp_int * a, unsigned char *b)
 {
   int     res;
 
diff --git a/libtommath/bn_mp_to_signed_bin_n.c b/libtommath/bn_mp_to_signed_bin_n.c
new file mode 100644
index 0000000..8da9961
--- /dev/null
+++ b/libtommath/bn_mp_to_signed_bin_n.c
@@ -0,0 +1,27 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* store in signed [big endian] format; *outlen is capacity in, size out */
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen >= (unsigned long)mp_signed_bin_size(a)) {
+      *outlen = mp_signed_bin_size(a);
+      return mp_to_signed_bin(a, b);
+   }
+   return MP_VAL;   /* buffer too small; *outlen is left untouched */
+}
+#endif
diff --git a/libtommath/bn_mp_to_unsigned_bin.c b/libtommath/bn_mp_to_unsigned_bin.c
index 763e346..9496398 100644
--- a/libtommath/bn_mp_to_unsigned_bin.c
+++ b/libtommath/bn_mp_to_unsigned_bin.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* store in unsigned [big endian] format */
-int
-mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 {
   int     x, res;
   mp_int  t;
diff --git a/libtommath/bn_mp_to_unsigned_bin_n.c b/libtommath/bn_mp_to_unsigned_bin_n.c
new file mode 100644
index 0000000..4f2a31d
--- /dev/null
+++ b/libtommath/bn_mp_to_unsigned_bin_n.c
@@ -0,0 +1,27 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* store in unsigned [big endian] format; *outlen is capacity in, size out */
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen >= (unsigned long)mp_unsigned_bin_size(a)) {
+      *outlen = mp_unsigned_bin_size(a);
+      return mp_to_unsigned_bin(a, b);
+   }
+   return MP_VAL;   /* buffer too small; *outlen is left untouched */
+}
+#endif
diff --git a/libtommath/bn_mp_toom_mul.c b/libtommath/bn_mp_toom_mul.c
index 2d66779..9daefbd 100644
--- a/libtommath/bn_mp_toom_mul.c
+++ b/libtommath/bn_mp_toom_mul.c
@@ -12,14 +12,15 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* multiplication using the Toom-Cook 3-way algorithm 
  *
- * Much more complicated than Karatsuba but has a lower asymptotic running time of 
- * O(N**1.464).  This algorithm is only particularly useful on VERY large
- * inputs (we're talking 1000s of digits here...).
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
 */
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
diff --git a/libtommath/bn_mp_toom_sqr.c b/libtommath/bn_mp_toom_sqr.c
index 8c46fea..9e3f79c 100644
--- a/libtommath/bn_mp_toom_sqr.c
+++ b/libtommath/bn_mp_toom_sqr.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* squaring using Toom-Cook 3-way algorithm */
diff --git a/libtommath/bn_mp_toradix.c b/libtommath/bn_mp_toradix.c
index a206d5e..132743e 100644
--- a/libtommath/bn_mp_toradix.c
+++ b/libtommath/bn_mp_toradix.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* stores a bignum as a ASCII string in a given radix (2..64) */
diff --git a/libtommath/bn_mp_toradix_n.c b/libtommath/bn_mp_toradix_n.c
index 7d43558..dedce71 100644
--- a/libtommath/bn_mp_toradix_n.c
+++ b/libtommath/bn_mp_toradix_n.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* stores a bignum as a ASCII string in a given radix (2..64) 
@@ -27,12 +27,12 @@ int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
   char   *_s = str;
 
   /* check range of the maxlen, radix */
-  if (maxlen < 3 || radix < 2 || radix > 64) {
+  if (maxlen < 2 || radix < 2 || radix > 64) {
     return MP_VAL;
   }
 
   /* quick out if its zero */
-  if (mp_iszero(a) == 1) {
+  if (mp_iszero(a) == MP_YES) {
      *str++ = '0';
      *str = '\0';
      return MP_OKAY;
@@ -57,21 +57,20 @@ int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
 
   digs = 0;
   while (mp_iszero (&t) == 0) {
+    if (--maxlen < 1) {
+       /* no more room */
+       break;
+    }
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
     }
     *str++ = mp_s_rmap[d];
     ++digs;
-
-    if (--maxlen == 1) {
-       /* no more room */
-       break;
-    }
   }
 
   /* reverse the digits of the string.  In this case _s points
-   * to the first digit [exluding the sign] of the number]
+   * to the first digit [excluding the sign] of the number
    */
   bn_reverse ((unsigned char *)_s, digs);
 
diff --git a/libtommath/bn_mp_unsigned_bin_size.c b/libtommath/bn_mp_unsigned_bin_size.c
index 80da415..58c18fb 100644
--- a/libtommath/bn_mp_unsigned_bin_size.c
+++ b/libtommath/bn_mp_unsigned_bin_size.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* get the size for an unsigned equivalent */
-int
-mp_unsigned_bin_size (mp_int * a)
+int mp_unsigned_bin_size (mp_int * a)
 {
   int     size = mp_count_bits (a);
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
diff --git a/libtommath/bn_mp_xor.c b/libtommath/bn_mp_xor.c
index 192aacc..432f42e 100644
--- a/libtommath/bn_mp_xor.c
+++ b/libtommath/bn_mp_xor.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* XOR two ints together */
@@ -37,7 +37,7 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c)
   }
 
   for (ix = 0; ix < px; ix++) {
-
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
diff --git a/libtommath/bn_mp_zero.c b/libtommath/bn_mp_zero.c
index 0097598..d697a60 100644
--- a/libtommath/bn_mp_zero.c
+++ b/libtommath/bn_mp_zero.c
@@ -12,15 +12,21 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* set to zero */
-void
-mp_zero (mp_int * a)
+void mp_zero (mp_int * a)
 {
+  int       n;
+  mp_digit *tmp;
+
   a->sign = MP_ZPOS;
   a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+
+  tmp = a->dp;
+  for (n = 0; n < a->alloc; n++) {
+     *tmp++ = 0;
+  }
 }
 #endif
diff --git a/libtommath/bn_prime_tab.c b/libtommath/bn_prime_tab.c
index 14306c2..c47c8bd 100644
--- a/libtommath/bn_prime_tab.c
+++ b/libtommath/bn_prime_tab.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 const mp_digit ltm_prime_tab[] = {
   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
diff --git a/libtommath/bn_reverse.c b/libtommath/bn_reverse.c
index 851a6e8..9d7fd29 100644
--- a/libtommath/bn_reverse.c
+++ b/libtommath/bn_reverse.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reverse an array, used for radix code */
diff --git a/libtommath/bn_s_mp_add.c b/libtommath/bn_s_mp_add.c
index 2b378ae..7527bf8 100644
--- a/libtommath/bn_s_mp_add.c
+++ b/libtommath/bn_s_mp_add.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
diff --git a/libtommath/bn_s_mp_exptmod.c b/libtommath/bn_s_mp_exptmod.c
index 01a766f..ff6bd54 100644
--- a/libtommath/bn_s_mp_exptmod.c
+++ b/libtommath/bn_s_mp_exptmod.c
@@ -12,20 +12,20 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
-
 #ifdef MP_LOW_MEM
    #define TAB_SIZE 32
 #else
    #define TAB_SIZE 256
 #endif
 
-int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res, mu;
   mp_digit buf;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int (*redux)(mp_int*,mp_int*,mp_int*);
 
   /* find window size */
   x = mp_count_bits (X);
@@ -72,9 +72,18 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   if ((err = mp_init (&mu)) != MP_OKAY) {
     goto LBL_M;
   }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto LBL_MU;
-  }
+  
+  if (redmode == 0) {
+     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce;
+  } else {
+     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce_2k_l;
+  }    
 
   /* create M table
    *
@@ -96,11 +105,14 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   }
 
   for (x = 0; x < (winsize - 1); x++) {
+    /* square it */
     if ((err = mp_sqr (&M[1 << (winsize - 1)], 
                        &M[1 << (winsize - 1)])) != MP_OKAY) {
       goto LBL_MU;
     }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+
+    /* reduce modulo P */
+    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
       goto LBL_MU;
     }
   }
@@ -112,7 +124,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
       goto LBL_MU;
     }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
       goto LBL_MU;
     }
   }
@@ -161,7 +173,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
         goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
         goto LBL_RES;
       }
       continue;
@@ -178,7 +190,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
           goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
           goto LBL_RES;
         }
       }
@@ -187,7 +199,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
         goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
         goto LBL_RES;
       }
 
@@ -205,7 +217,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
         goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
         goto LBL_RES;
       }
 
@@ -215,7 +227,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
           goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
           goto LBL_RES;
         }
       }
diff --git a/libtommath/bn_s_mp_mul_digs.c b/libtommath/bn_s_mp_mul_digs.c
index d9f0a56..401f32e 100644
--- a/libtommath/bn_s_mp_mul_digs.c
+++ b/libtommath/bn_s_mp_mul_digs.c
@@ -12,15 +12,14 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* multiplies |a| * |b| and only computes upto digs digits of result
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
  * many digits of output are created.
  */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   mp_int  t;
   int     res, pa, pb, ix, iy;
diff --git a/libtommath/bn_s_mp_mul_high_digs.c b/libtommath/bn_s_mp_mul_high_digs.c
index a060248..f4dca76 100644
--- a/libtommath/bn_s_mp_mul_high_digs.c
+++ b/libtommath/bn_s_mp_mul_high_digs.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* multiplies |a| * |b| and does not compute the lower digs digits
diff --git a/libtommath/bn_s_mp_sqr.c b/libtommath/bn_s_mp_sqr.c
index 4d12804..464663f 100644
--- a/libtommath/bn_s_mp_sqr.c
+++ b/libtommath/bn_s_mp_sqr.c
@@ -12,12 +12,11 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
+int s_mp_sqr (mp_int * a, mp_int * b)
 {
   mp_int  t;
   int     res, ix, iy, pa;
diff --git a/libtommath/bn_s_mp_sub.c b/libtommath/bn_s_mp_sub.c
index 5b7aef9..328c9e5 100644
--- a/libtommath/bn_s_mp_sub.c
+++ b/libtommath/bn_s_mp_sub.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
diff --git a/libtommath/bncore.c b/libtommath/bncore.c
index cf8a15a..eb95a2e 100644
--- a/libtommath/bncore.c
+++ b/libtommath/bncore.c
@@ -12,7 +12,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Known optimal configurations
@@ -20,11 +20,12 @@
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ AMD Athlon64           /GCC v3.4.4   /        80/       120/LTM 0.35
  
 */
 
-int     KARATSUBA_MUL_CUTOFF = 88,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 128,     /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 80,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 120,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400; 
diff --git a/libtommath/booker.pl b/libtommath/booker.pl
index 5c77e53..df8b30d 100644
--- a/libtommath/booker.pl
+++ b/libtommath/booker.pl
@@ -89,6 +89,9 @@ while (<IN>) {
       
       $inline = 0;
       while (<SRC>) {
+      next if ($_ =~ /\$Source/);
+      next if ($_ =~ /\$Revision/);
+      next if ($_ =~ /\$Date/);
          $text[$line++] = $_;
          ++$inline;
          chomp($_);
@@ -218,7 +221,7 @@ while (<IN>) {
                      $str = "chapter eight";
                   } elsif ($a == 9) {
                      $str = "chapter nine";
-                  } elsif ($a == 2) {
+                  } elsif ($a == 10) {
                      $str = "chapter ten";
                   }
                } else {
diff --git a/libtommath/callgraph.txt b/libtommath/callgraph.txt
index 4dc4cba..2efcf24 100644
--- a/libtommath/callgraph.txt
+++ b/libtommath/callgraph.txt
@@ -907,7 +907,64 @@ BN_MP_EXPTMOD_C
 |   |   |   +--->BN_MP_CLEAR_C
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_SET_C
 |   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_DIV_2_C
@@ -938,6 +995,66 @@ BN_MP_EXPTMOD_C
 |   +--->BN_MP_INVMOD_SLOW_C
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_SET_C
@@ -973,93 +1090,63 @@ BN_MP_EXPTMOD_C
 |   +--->BN_MP_COPY_C
 |   |   +--->BN_MP_GROW_C
 +--->BN_MP_CLEAR_MULTI_C
-+--->BN_MP_DR_IS_MODULUS_C
-+--->BN_MP_REDUCE_IS_2K_C
-|   +--->BN_MP_REDUCE_2K_C
-|   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_DIV_2D_C
-|   |   |   +--->BN_MP_COPY_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_MUL_D_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_S_MP_ADD_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_COUNT_BITS_C
-+--->BN_MP_EXPTMOD_FAST_C
++--->BN_MP_REDUCE_IS_2K_L_C
++--->BN_S_MP_EXPTMOD_C
 |   +--->BN_MP_COUNT_BITS_C
-|   +--->BN_MP_MONTGOMERY_SETUP_C
-|   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
-|   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   +--->BN_MP_MONTGOMERY_REDUCE_C
-|   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   +--->BN_MP_DR_SETUP_C
-|   +--->BN_MP_DR_REDUCE_C
-|   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   +--->BN_MP_REDUCE_2K_SETUP_C
+|   +--->BN_MP_REDUCE_SETUP_C
 |   |   +--->BN_MP_2EXPT_C
 |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_REDUCE_2K_C
-|   |   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_MUL_D_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_S_MP_ADD_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
-|   |   +--->BN_MP_2EXPT_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_SET_C
+|   +--->BN_MP_REDUCE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
 |   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_MUL_2_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_MULMOD_C
 |   |   +--->BN_MP_MUL_C
 |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -1070,8 +1157,6 @@ BN_MP_EXPTMOD_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ADD_C
@@ -1123,8 +1208,6 @@ BN_MP_EXPTMOD_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_CLAMP_C
@@ -1132,62 +1215,150 @@ BN_MP_EXPTMOD_C
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_MOD_C
-|   |   |   +--->BN_MP_DIV_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_D_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_L_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_SUB_C
 |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   +--->BN_MP_SET_C
-|   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
 |   +--->BN_MP_MOD_C
 |   |   +--->BN_MP_DIV_C
 |   |   |   +--->BN_MP_CMP_MAG_C
@@ -1195,6 +1366,7 @@ BN_MP_EXPTMOD_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
 |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_LSHD_C
@@ -1379,57 +1551,224 @@ BN_MP_EXPTMOD_C
 |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
 |   +--->BN_MP_EXCH_C
-+--->BN_S_MP_EXPTMOD_C
++--->BN_MP_DR_IS_MODULUS_C
++--->BN_MP_REDUCE_IS_2K_C
+|   +--->BN_MP_REDUCE_2K_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
 |   +--->BN_MP_COUNT_BITS_C
-|   +--->BN_MP_REDUCE_SETUP_C
++--->BN_MP_EXPTMOD_FAST_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MONTGOMERY_SETUP_C
+|   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_DR_SETUP_C
+|   +--->BN_MP_DR_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_REDUCE_2K_SETUP_C
 |   |   +--->BN_MP_2EXPT_C
 |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_DIV_C
-|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_C
+|   |   +--->BN_MP_DIV_2D_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MULMOD_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
 |   +--->BN_MP_MOD_C
 |   |   +--->BN_MP_DIV_C
 |   |   |   +--->BN_MP_CMP_MAG_C
@@ -1437,7 +1776,6 @@ BN_MP_EXPTMOD_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   +--->BN_MP_SET_C
 |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_LSHD_C
@@ -1554,120 +1892,6 @@ BN_MP_EXPTMOD_C
 |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_EXCH_C
-|   +--->BN_MP_REDUCE_C
-|   |   +--->BN_MP_INIT_COPY_C
-|   |   +--->BN_MP_RSHD_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_MUL_C
-|   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MOD_2D_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_SUB_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_D_C
-|   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_LSHD_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_ADD_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
 |   +--->BN_MP_MUL_C
 |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -1736,8 +1960,6 @@ BN_MP_EXPTMOD_C
 |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_EXCH_C
-|   +--->BN_MP_SET_C
-|   |   +--->BN_MP_ZERO_C
 |   +--->BN_MP_EXCH_C
 
 
@@ -1769,7 +1991,64 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_DIV_2_C
@@ -1799,6 +2078,66 @@ BN_MP_PRIME_FERMAT_C
 |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_SET_C
@@ -1833,93 +2172,63 @@ BN_MP_PRIME_FERMAT_C
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
 |   +--->BN_MP_CLEAR_MULTI_C
-|   +--->BN_MP_DR_IS_MODULUS_C
-|   +--->BN_MP_REDUCE_IS_2K_C
-|   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_COUNT_BITS_C
-|   +--->BN_MP_EXPTMOD_FAST_C
+|   +--->BN_MP_REDUCE_IS_2K_L_C
+|   +--->BN_S_MP_EXPTMOD_C
 |   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_MONTGOMERY_SETUP_C
-|   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   +--->BN_MP_MONTGOMERY_REDUCE_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   +--->BN_MP_DR_SETUP_C
-|   |   +--->BN_MP_DR_REDUCE_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   +--->BN_MP_REDUCE_SETUP_C
 |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
-|   |   |   +--->BN_MP_2EXPT_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_REDUCE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MULMOD_C
 |   |   |   +--->BN_MP_MUL_C
 |   |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -1930,8 +2239,6 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_ADD_C
@@ -1983,8 +2290,6 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -1992,62 +2297,149 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_MOD_C
-|   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_SUB_C
 |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_MOD_C
 |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -2055,6 +2447,7 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
@@ -2239,57 +2632,224 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_EXCH_C
-|   +--->BN_S_MP_EXPTMOD_C
+|   +--->BN_MP_DR_IS_MODULUS_C
+|   +--->BN_MP_REDUCE_IS_2K_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_REDUCE_SETUP_C
+|   +--->BN_MP_EXPTMOD_FAST_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_DR_SETUP_C
+|   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_C
 |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_DIV_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_DIV_2D_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MULMOD_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_DIV_2D_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_MOD_C
 |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -2297,7 +2857,6 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
@@ -2414,119 +2973,6 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_REDUCE_C
-|   |   |   +--->BN_MP_INIT_COPY_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MUL_C
-|   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_MUL_C
 |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -2595,8 +3041,6 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_EXCH_C
 +--->BN_MP_CMP_C
 |   +--->BN_MP_CMP_MAG_C
@@ -2901,7 +3345,65 @@ BN_MP_INVMOD_C
 |   |   +--->BN_MP_CLEAR_C
 |   +--->BN_MP_COPY_C
 |   |   +--->BN_MP_GROW_C
-|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_ZERO_C
 |   +--->BN_MP_DIV_2_C
@@ -2933,6 +3435,67 @@ BN_MP_INVMOD_C
 |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_INIT_C
 |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_COPY_C
 |   |   +--->BN_MP_GROW_C
 |   +--->BN_MP_SET_C
@@ -2985,7 +3548,65 @@ BN_FAST_MP_INVMOD_C
 |   +--->BN_MP_CLEAR_C
 +--->BN_MP_COPY_C
 |   +--->BN_MP_GROW_C
-+--->BN_MP_ABS_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
 +--->BN_MP_SET_C
 |   +--->BN_MP_ZERO_C
 +--->BN_MP_DIV_2_C
@@ -3473,7 +4094,55 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -3501,6 +4170,57 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_DIV_2_C
@@ -3533,73 +4253,52 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_CLEAR_MULTI_C
-|   |   |   +--->BN_MP_DR_IS_MODULUS_C
-|   |   |   +--->BN_MP_REDUCE_IS_2K_C
-|   |   |   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   |   +--->BN_S_MP_EXPTMOD_C
 |   |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
-|   |   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_DR_SETUP_C
-|   |   |   |   +--->BN_MP_DR_REDUCE_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   |   +--->BN_MP_REDUCE_SETUP_C
 |   |   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
-|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_MUL_C
 |   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -3610,8 +4309,6 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_ADD_C
@@ -3663,8 +4360,6 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -3672,52 +4367,138 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_MOD_C
-|   |   |   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
 |   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_MOD_C
 |   |   |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -3903,48 +4684,191 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
 |   |   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_MOD_C
 |   |   |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -4061,116 +4985,6 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_REDUCE_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_MUL_C
-|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_MUL_C
 |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -4753,7 +5567,55 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -4781,6 +5643,57 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_DIV_2_C
@@ -4813,73 +5726,52 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_CLEAR_MULTI_C
-|   |   +--->BN_MP_DR_IS_MODULUS_C
-|   |   +--->BN_MP_REDUCE_IS_2K_C
-|   |   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   +--->BN_S_MP_EXPTMOD_C
 |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
-|   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_DR_SETUP_C
-|   |   |   +--->BN_MP_DR_REDUCE_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   +--->BN_MP_REDUCE_SETUP_C
 |   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
-|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MULMOD_C
+|   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_MUL_C
 |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -4890,8 +5782,6 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_ADD_C
@@ -4943,8 +5833,6 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -4952,52 +5840,138 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_MOD_C
-|   |   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
 |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_MOD_C
 |   |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -5183,48 +6157,191 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_S_MP_EXPTMOD_C
+|   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
 |   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_MOD_C
 |   |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -5341,116 +6458,6 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_REDUCE_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MUL_C
-|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_MUL_C
 |   |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -6496,7 +7503,55 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -6524,6 +7579,57 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_DIV_2_C
@@ -6556,73 +7662,52 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_CLEAR_MULTI_C
-|   |   +--->BN_MP_DR_IS_MODULUS_C
-|   |   +--->BN_MP_REDUCE_IS_2K_C
-|   |   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   +--->BN_S_MP_EXPTMOD_C
 |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
-|   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_DR_SETUP_C
-|   |   |   +--->BN_MP_DR_REDUCE_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   +--->BN_MP_REDUCE_SETUP_C
 |   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
-|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MULMOD_C
+|   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_MUL_C
 |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -6633,8 +7718,6 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_ADD_C
@@ -6686,8 +7769,6 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -6695,52 +7776,138 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_MOD_C
-|   |   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
 |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_MOD_C
 |   |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -6926,48 +8093,191 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_S_MP_EXPTMOD_C
+|   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
 |   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_MOD_C
 |   |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -7084,116 +8394,6 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_REDUCE_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_MUL_C
-|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   +--->BN_MP_MUL_C
 |   |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -7406,6 +8606,67 @@ BN_MP_INVMOD_SLOW_C
 +--->BN_MP_INIT_MULTI_C
 |   +--->BN_MP_INIT_C
 |   +--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
 +--->BN_MP_COPY_C
 |   +--->BN_MP_GROW_C
 +--->BN_MP_SET_C
@@ -7604,6 +8865,107 @@ BN_MP_LCM_C
 |   +--->BN_MP_CLEAR_C
 
 
+BN_MP_REDUCE_2K_L_C
++--->BN_MP_INIT_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
 BN_REVERSE_C
 
 
@@ -7671,6 +9033,18 @@ BN_MP_GCD_C
 +--->BN_MP_CLEAR_C
 
 
+BN_MP_REDUCE_2K_SETUP_L_C
++--->BN_MP_INIT_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_COUNT_BITS_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
 BN_MP_READ_RADIX_C
 +--->BN_MP_ZERO_C
 +--->BN_MP_MUL_D_C
@@ -7983,6 +9357,226 @@ BN_S_MP_EXPTMOD_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_2K_SETUP_L_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_2K_L_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
 +--->BN_MP_MOD_C
 |   +--->BN_MP_DIV_C
 |   |   +--->BN_MP_CMP_MAG_C
@@ -8110,121 +9704,6 @@ BN_S_MP_EXPTMOD_C
 |   |   +--->BN_MP_INIT_SIZE_C
 |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_EXCH_C
-+--->BN_MP_REDUCE_C
-|   +--->BN_MP_INIT_COPY_C
-|   +--->BN_MP_RSHD_C
-|   |   +--->BN_MP_ZERO_C
-|   +--->BN_MP_MUL_C
-|   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLEAR_MULTI_C
-|   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_EXCH_C
-|   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_INIT_SIZE_C
-|   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_EXCH_C
-|   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_MOD_2D_C
-|   |   +--->BN_MP_ZERO_C
-|   |   +--->BN_MP_CLAMP_C
-|   +--->BN_S_MP_MUL_DIGS_C
-|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_INIT_SIZE_C
-|   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_EXCH_C
-|   +--->BN_MP_SUB_C
-|   |   +--->BN_S_MP_ADD_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_CMP_D_C
-|   +--->BN_MP_SET_C
-|   |   +--->BN_MP_ZERO_C
-|   +--->BN_MP_LSHD_C
-|   |   +--->BN_MP_GROW_C
-|   +--->BN_MP_ADD_C
-|   |   +--->BN_S_MP_ADD_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   |   +--->BN_S_MP_SUB_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   +--->BN_MP_CMP_C
-|   |   +--->BN_MP_CMP_MAG_C
-|   +--->BN_S_MP_SUB_C
-|   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_CLAMP_C
 +--->BN_MP_MUL_C
 |   +--->BN_MP_TOOM_MUL_C
 |   |   +--->BN_MP_INIT_MULTI_C
@@ -8529,6 +10008,31 @@ BN_MP_ADD_C
 |   +--->BN_MP_CLAMP_C
 
 
+BN_MP_TO_SIGNED_BIN_N_C
++--->BN_MP_SIGNED_BIN_SIZE_C
+|   +--->BN_MP_UNSIGNED_BIN_SIZE_C
+|   |   +--->BN_MP_COUNT_BITS_C
++--->BN_MP_TO_SIGNED_BIN_C
+|   +--->BN_MP_TO_UNSIGNED_BIN_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+
+
+BN_MP_REDUCE_IS_2K_L_C
+
+
 BN_MP_RAND_C
 +--->BN_MP_ZERO_C
 +--->BN_MP_ADD_D_C
@@ -8556,6 +10060,26 @@ BN_MP_RSHD_C
 BN_MP_SHRINK_C
 
 
+BN_MP_TO_UNSIGNED_BIN_N_C
++--->BN_MP_UNSIGNED_BIN_SIZE_C
+|   +--->BN_MP_COUNT_BITS_C
++--->BN_MP_TO_UNSIGNED_BIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
 BN_MP_REDUCE_C
 +--->BN_MP_REDUCE_SETUP_C
 |   +--->BN_MP_2EXPT_C
@@ -9062,6 +10586,7 @@ BN_MP_EXTEUCLID_C
 |   +--->BN_S_MP_SUB_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_CLAMP_C
++--->BN_MP_NEG_C
 +--->BN_MP_EXCH_C
 +--->BN_MP_CLEAR_MULTI_C
 |   +--->BN_MP_CLEAR_C
@@ -9269,7 +10794,56 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_DIV_2_C
@@ -9299,6 +10873,58 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_SET_C
@@ -9333,75 +10959,53 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
 |   +--->BN_MP_CLEAR_MULTI_C
-|   +--->BN_MP_DR_IS_MODULUS_C
-|   +--->BN_MP_REDUCE_IS_2K_C
-|   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   +--->BN_MP_COUNT_BITS_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_COUNT_BITS_C
-|   +--->BN_MP_EXPTMOD_FAST_C
+|   +--->BN_MP_REDUCE_IS_2K_L_C
+|   +--->BN_S_MP_EXPTMOD_C
 |   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_MONTGOMERY_SETUP_C
-|   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   +--->BN_MP_MONTGOMERY_REDUCE_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   +--->BN_MP_DR_SETUP_C
-|   |   +--->BN_MP_DR_REDUCE_C
-|   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   +--->BN_MP_REDUCE_2K_SETUP_C
-|   |   |   +--->BN_MP_2EXPT_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_REDUCE_2K_C
-|   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   +--->BN_MP_REDUCE_SETUP_C
 |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   +--->BN_MP_MULMOD_C
+|   |   +--->BN_MP_REDUCE_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_MUL_C
 |   |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -9412,8 +11016,6 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_ADD_C
@@ -9465,8 +11067,6 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -9474,55 +11074,140 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_MOD_C
-|   |   |   |   +--->BN_MP_DIV_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_SUB_C
 |   |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_MOD_C
 |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -9530,6 +11215,7 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
@@ -9707,50 +11393,199 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_EXCH_C
-|   +--->BN_S_MP_EXPTMOD_C
+|   +--->BN_MP_DR_IS_MODULUS_C
+|   +--->BN_MP_REDUCE_IS_2K_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_COUNT_BITS_C
-|   |   +--->BN_MP_REDUCE_SETUP_C
+|   +--->BN_MP_EXPTMOD_FAST_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_DR_SETUP_C
+|   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_C
 |   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_DIV_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_MP_COPY_C
-|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   +--->BN_MP_2EXPT_C
 |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MULMOD_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_ADD_C
 |   |   |   |   |   +--->BN_S_MP_ADD_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
 |   |   |   |   |   +--->BN_S_MP_SUB_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_MOD_C
 |   |   |   +--->BN_MP_DIV_C
 |   |   |   |   +--->BN_MP_CMP_MAG_C
@@ -9758,7 +11593,6 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_MUL_2D_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_LSHD_C
@@ -9868,118 +11702,6 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_REDUCE_C
-|   |   |   +--->BN_MP_RSHD_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_MUL_C
-|   |   |   |   +--->BN_MP_TOOM_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
-|   |   |   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_2_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_2D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_MUL_D_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_DIV_3_C
-|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_SUB_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ADD_C
-|   |   |   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_MOD_2D_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_S_MP_MUL_DIGS_C
-|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_INIT_SIZE_C
-|   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_EXCH_C
-|   |   |   +--->BN_MP_SUB_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_SET_C
-|   |   |   |   +--->BN_MP_ZERO_C
-|   |   |   +--->BN_MP_LSHD_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ADD_C
-|   |   |   |   +--->BN_S_MP_ADD_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_CLAMP_C
-|   |   |   +--->BN_MP_CMP_C
-|   |   |   |   +--->BN_MP_CMP_MAG_C
-|   |   |   +--->BN_S_MP_SUB_C
-|   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_CLAMP_C
 |   |   +--->BN_MP_MUL_C
 |   |   |   +--->BN_MP_TOOM_MUL_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
@@ -10048,8 +11770,6 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   +--->BN_MP_INIT_SIZE_C
 |   |   |   |   +--->BN_MP_CLAMP_C
 |   |   |   |   +--->BN_MP_EXCH_C
-|   |   +--->BN_MP_SET_C
-|   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_EXCH_C
 +--->BN_MP_CMP_C
 |   +--->BN_MP_CMP_MAG_C
diff --git a/libtommath/changes.txt b/libtommath/changes.txt
index 0d1ec2e..4fc0913 100644
--- a/libtommath/changes.txt
+++ b/libtommath/changes.txt
@@ -1,3 +1,69 @@
+July 23rd, 2010
+v0.42.0
+       -- Fix for mp_prime_next_prime() bug when checking generated prime
+       -- allow mp_shrink to shrink initialized, but empty MPIs
+       -- Added project and solution files for Visual Studio 2005 and Visual Studio 2008. 
+
+March 10th, 2007
+v0.41  -- Wolfgang Ehrhardt suggested a quick fix to mp_div_d() which makes the detection of powers of two quicker. 
+       -- [CRI] Added libtommath.dsp for Visual C++ users.
+
+December 24th, 2006
+v0.40  -- Updated makefile to properly support LIBNAME
+       -- Fixed bug in fast_s_mp_mul_high_digs() which overflowed (line 83), thanks Valgrind!
+
+April 4th, 2006
+v0.39  -- Jim Wigginton pointed out my Montgomery examples in figures 6.4 and 6.6 were off by one, k should be 9 not 8
+       -- Bruce Guenter suggested I use --tag=CC for libtool builds where the compiler may think it's C++.
+       -- "mm" from sci.crypt pointed out that my mp_gcd was sub-optimal (I also updated and corrected the book)
+       -- updated some of the @@ tags in tommath.src to reflect source changes.
+       -- updated email and url info in all source files
+
+Jan 26th, 2006
+v0.38  -- broken makefile.shared fixed
+       -- removed some carry stores that were not required [updated text]
+       
+November 18th, 2005
+v0.37  -- [Don Porter] reported on a TCL list [HEY SEND ME BUGREPORTS ALREADY!!!] that mp_add_d() would compute -0 with some inputs.  Fixed.
+       -- [rinick@gmail.com] reported the makefile.bcc was messed up.  Fixed.
+       -- [Kevin Kenny] reported some issues with mp_toradix_n().  Now it doesn't require a min of 3 chars of output.  
+       -- Made the make command renamable.  Wee
+
+August 1st, 2005
+v0.36  -- LTM_PRIME_2MSB_ON was fixed and the "OFF" flag was removed.
+       -- [Peter LaDow] found a typo in the XREALLOC macro
+       -- [Peter LaDow] pointed out that mp_read_(un)signed_bin should have "const" on the input
+       -- Ported LTC patch to fix the prime_random_ex() function to get the bitsize correct [and the maskOR flags]
+       -- Kevin Kenny pointed out a stray //
+       -- David Hulton pointed out a typo in the textbook [mp_montgomery_setup() pseudo-code]
+       -- Neal Hamilton (Elliptic Semiconductor) pointed out that my Karatsuba notation was backwards and that I could use 
+          unsigned operations in the routine.  
+       -- Paul Schmidt pointed out a linking error in mp_exptmod() when BN_S_MP_EXPTMOD_C is undefined (and another for read_radix)
+       -- Updated makefiles to be way more flexible
+
+March 12th, 2005
+v0.35  -- Stupid XOR function missing line again... oops.
+       -- Fixed bug in invmod not handling negative inputs correctly [Wolfgang Ehrhardt]
+       -- Made exteuclid always give positive u3 output...[ Wolfgang Ehrhardt ]
+       -- [Wolfgang Ehrhardt] Suggested a fix for mp_reduce() which avoided underruns.  ;-)
+       -- mp_rand() would emit one too many digits and it was possible to get a 0 out of it ... oops
+       -- Added montgomery to the testing to make sure it handles 1..10 digit moduli correctly
+       -- Fixed bug in comba that would lead to possible erroneous outputs when "pa < digs" 
+       -- Fixed bug in mp_toradix_size for "0" [Kevin Kenny]
+       -- Updated chapters 1-5 of the textbook ;-) It now talks about the new comba code!
+
+February 12th, 2005
+v0.34  -- Fixed two more small errors in mp_prime_random_ex()
+       -- Fixed overflow in mp_mul_d() [Kevin Kenny]
+       -- Added mp_to_(un)signed_bin_n() functions which do bounds checking for ya [and report the size]
+       -- Added "large" diminished radix support.  Speeds up things like DSA where the modulus is of the form 2^k - P for some P < 2^(k/2) or so
+          Actually is faster than Montgomery on my AMD64 (and probably much faster on a P4)
+       -- Updated the manual a bit
+       -- Ok so I haven't done the textbook work yet... My current freelance gig has landed me in France till the 
+          end of Feb/05.  Once I get back I'll have tons of free time and I plan to go to town on the book.
+          As of this release the API will freeze.  At least until the book catches up with all the changes.  I welcome
+          bug reports but new algorithms will have to wait.
+
 December 23rd, 2004
 v0.33  -- Fixed "small" variant for mp_div() which would munge with negative dividends...
        -- Fixed bug in mp_prime_random_ex() which would set the most significant byte to zero when
diff --git a/libtommath/demo/demo.c b/libtommath/demo/demo.c
index 62615cd..e1f8a5e 100644
--- a/libtommath/demo/demo.c
+++ b/libtommath/demo/demo.c
@@ -9,15 +9,16 @@
 
 #include "tommath.h"
 
-void ndraw(mp_int *a, char *name)
+void ndraw(mp_int * a, char *name)
 {
    char buf[16000];
+
    printf("%s: ", name);
    mp_toradix(a, buf, 10);
    printf("%s\n", buf);
 }
 
-static void draw(mp_int *a)
+static void draw(mp_int * a)
 {
    ndraw(a, "");
 }
@@ -39,20 +40,23 @@ int lbit(void)
 int myrng(unsigned char *dst, int len, void *dat)
 {
    int x;
-   for (x = 0; x < len; x++) dst[x] = rand() & 0xFF;
+
+   for (x = 0; x < len; x++)
+      dst[x] = rand() & 0xFF;
    return len;
 }
 
 
 
-   char cmd[4096], buf[4096];
+char cmd[4096], buf[4096];
 int main(void)
 {
    mp_int a, b, c, d, e, f;
-   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
-                 div2_n, mul2_n, add_d_n, sub_d_n, t;
+   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n,
+      gcd_n, lcm_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n, t;
    unsigned rr;
    int i, n, err, cnt, ix, old_kara_m, old_kara_s;
+   mp_digit mp;
 
 
    mp_init(&a);
@@ -65,108 +69,152 @@ int main(void)
    srand(time(NULL));
 
 #if 0
-  // test mp_get_int
-  printf("Testing: mp_get_int\n");
-  for(i=0;i<1000;++i) {
-    t = ((unsigned long)rand()*rand()+1)&0xFFFFFFFF;
-    mp_set_int(&a,t);
-    if (t!=mp_get_int(&a)) { 
+   // test montgomery 
+   printf("Testing montgomery...\n");
+   for (i = 1; i < 10; i++) {
+      printf("Testing digit size: %d\n", i);
+      for (n = 0; n < 1000; n++) {
+         mp_rand(&a, i);
+         a.dp[0] |= 1;
+
+         // let's see if R is right
+         mp_montgomery_calc_normalization(&b, &a);
+         mp_montgomery_setup(&a, &mp);
+
+         // now test a random reduction 
+         for (ix = 0; ix < 100; ix++) {
+             mp_rand(&c, 1 + abs(rand()) % (2*i));
+             mp_copy(&c, &d);
+             mp_copy(&c, &e);
+
+             mp_mod(&d, &a, &d);
+             mp_montgomery_reduce(&c, &a, mp);
+             mp_mulmod(&c, &b, &a, &c);
+
+             if (mp_cmp(&c, &d) != MP_EQ) { 
+printf("d = e mod a, c = e MOD a\n");
+mp_todecimal(&a, buf); printf("a = %s\n", buf);
+mp_todecimal(&e, buf); printf("e = %s\n", buf);
+mp_todecimal(&d, buf); printf("d = %s\n", buf);
+mp_todecimal(&c, buf); printf("c = %s\n", buf);
+printf("compare no compare!\n"); exit(EXIT_FAILURE); }
+         }
+      }
+   }
+   printf("done\n");
+
+   // test mp_get_int
+   printf("Testing: mp_get_int\n");
+   for (i = 0; i < 1000; ++i) {
+      t = ((unsigned long) rand() * rand() + 1) & 0xFFFFFFFF;
+      mp_set_int(&a, t);
+      if (t != mp_get_int(&a)) {
+	 printf("mp_get_int() bad result!\n");
+	 return 1;
+      }
+   }
+   mp_set_int(&a, 0);
+   if (mp_get_int(&a) != 0) {
       printf("mp_get_int() bad result!\n");
       return 1;
-    }
-  }
-  mp_set_int(&a,0);
-  if (mp_get_int(&a)!=0)
-  { printf("mp_get_int() bad result!\n");
-    return 1;
-  }
-  mp_set_int(&a,0xffffffff);
-  if (mp_get_int(&a)!=0xffffffff)
-  { printf("mp_get_int() bad result!\n");
-    return 1;
-  }
-
-  // test mp_sqrt
-  printf("Testing: mp_sqrt\n");
-  for (i=0;i<1000;++i) { 
-    printf("%6d\r", i); fflush(stdout);
-    n = (rand()&15)+1;
-    mp_rand(&a,n);
-    if (mp_sqrt(&a,&b) != MP_OKAY)
-    { printf("mp_sqrt() error!\n");
-      return 1;
-    }
-    mp_n_root(&a,2,&a);
-    if (mp_cmp_mag(&b,&a) != MP_EQ)
-    { printf("mp_sqrt() bad result!\n");
-      return 1;
-    }
-  }
-
-  printf("\nTesting: mp_is_square\n");
-  for (i=0;i<1000;++i) {
-    printf("%6d\r", i); fflush(stdout);
-
-    /* test mp_is_square false negatives */
-    n = (rand()&7)+1;
-    mp_rand(&a,n);
-    mp_sqr(&a,&a);
-    if (mp_is_square(&a,&n)!=MP_OKAY) { 
-      printf("fn:mp_is_square() error!\n");
-      return 1;
-    }
-    if (n==0) { 
-      printf("fn:mp_is_square() bad result!\n");
+   }
+   mp_set_int(&a, 0xffffffff);
+   if (mp_get_int(&a) != 0xffffffff) {
+      printf("mp_get_int() bad result!\n");
       return 1;
-    }
+   }
+   // test mp_sqrt
+   printf("Testing: mp_sqrt\n");
+   for (i = 0; i < 1000; ++i) {
+      printf("%6d\r", i);
+      fflush(stdout);
+      n = (rand() & 15) + 1;
+      mp_rand(&a, n);
+      if (mp_sqrt(&a, &b) != MP_OKAY) {
+	 printf("mp_sqrt() error!\n");
+	 return 1;
+      }
+      mp_n_root(&a, 2, &a);
+      if (mp_cmp_mag(&b, &a) != MP_EQ) {
+	 printf("mp_sqrt() bad result!\n");
+	 return 1;
+      }
+   }
 
-    /* test for false positives */
-    mp_add_d(&a, 1, &a);
-    if (mp_is_square(&a,&n)!=MP_OKAY) { 
-      printf("fp:mp_is_square() error!\n");
-      return 1;
-    }
-    if (n==1) { 
-      printf("fp:mp_is_square() bad result!\n");
-      return 1;
-    }
+   printf("\nTesting: mp_is_square\n");
+   for (i = 0; i < 1000; ++i) {
+      printf("%6d\r", i);
+      fflush(stdout);
+
+      /* test mp_is_square false negatives */
+      n = (rand() & 7) + 1;
+      mp_rand(&a, n);
+      mp_sqr(&a, &a);
+      if (mp_is_square(&a, &n) != MP_OKAY) {
+	 printf("fn:mp_is_square() error!\n");
+	 return 1;
+      }
+      if (n == 0) {
+	 printf("fn:mp_is_square() bad result!\n");
+	 return 1;
+      }
 
-  }
-  printf("\n\n");
+      /* test for false positives */
+      mp_add_d(&a, 1, &a);
+      if (mp_is_square(&a, &n) != MP_OKAY) {
+	 printf("fp:mp_is_square() error!\n");
+	 return 1;
+      }
+      if (n == 1) {
+	 printf("fp:mp_is_square() bad result!\n");
+	 return 1;
+      }
+
+   }
+   printf("\n\n");
 
    /* test for size */
-   for (ix = 10; ix < 256; ix++) {
-       printf("Testing (not safe-prime): %9d bits    \r", ix); fflush(stdout);
-       err = mp_prime_random_ex(&a, 8, ix, (rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON, myrng, NULL);
-       if (err != MP_OKAY) {
-          printf("failed with err code %d\n", err);
-          return EXIT_FAILURE;
-       }
-       if (mp_count_bits(&a) != ix) {
-          printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
-          return EXIT_FAILURE;
-       }
+   for (ix = 10; ix < 128; ix++) {
+      printf("Testing (not safe-prime): %9d bits    \r", ix);
+      fflush(stdout);
+      err =
+	 mp_prime_random_ex(&a, 8, ix,
+			    (rand() & 1) ? LTM_PRIME_2MSB_OFF :
+			    LTM_PRIME_2MSB_ON, myrng, NULL);
+      if (err != MP_OKAY) {
+	 printf("failed with err code %d\n", err);
+	 return EXIT_FAILURE;
+      }
+      if (mp_count_bits(&a) != ix) {
+	 printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
+	 return EXIT_FAILURE;
+      }
    }
 
-   for (ix = 16; ix < 256; ix++) {
-       printf("Testing (   safe-prime): %9d bits    \r", ix); fflush(stdout);
-       err = mp_prime_random_ex(&a, 8, ix, ((rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON)|LTM_PRIME_SAFE, myrng, NULL);
-       if (err != MP_OKAY) {
-          printf("failed with err code %d\n", err);
-          return EXIT_FAILURE;
-       }
-       if (mp_count_bits(&a) != ix) {
-          printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
-          return EXIT_FAILURE;
-       }
-       /* let's see if it's really a safe prime */
-       mp_sub_d(&a, 1, &a);
-       mp_div_2(&a, &a);
-       mp_prime_is_prime(&a, 8, &cnt);
-       if (cnt != MP_YES) {
-          printf("sub is not prime!\n");
-          return EXIT_FAILURE;
-       }
+   for (ix = 16; ix < 128; ix++) {
+      printf("Testing (   safe-prime): %9d bits    \r", ix);
+      fflush(stdout);
+      err =
+	 mp_prime_random_ex(&a, 8, ix,
+			    ((rand() & 1) ? LTM_PRIME_2MSB_OFF :
+			     LTM_PRIME_2MSB_ON) | LTM_PRIME_SAFE, myrng,
+			    NULL);
+      if (err != MP_OKAY) {
+	 printf("failed with err code %d\n", err);
+	 return EXIT_FAILURE;
+      }
+      if (mp_count_bits(&a) != ix) {
+	 printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
+	 return EXIT_FAILURE;
+      }
+      /* let's see if it's really a safe prime */
+      mp_sub_d(&a, 1, &a);
+      mp_div_2(&a, &a);
+      mp_prime_is_prime(&a, 8, &cnt);
+      if (cnt != MP_YES) {
+	 printf("sub is not prime!\n");
+	 return EXIT_FAILURE;
+      }
    }
 
    printf("\n\n");
@@ -194,51 +242,56 @@ int main(void)
    printf("testing mp_cnt_lsb...\n");
    mp_set(&a, 1);
    for (ix = 0; ix < 1024; ix++) {
-       if (mp_cnt_lsb(&a) != ix) {
-          printf("Failed at %d, %d\n", ix, mp_cnt_lsb(&a));
-          return 0;
-       }
-       mp_mul_2(&a, &a);
+      if (mp_cnt_lsb(&a) != ix) {
+	 printf("Failed at %d, %d\n", ix, mp_cnt_lsb(&a));
+	 return 0;
+      }
+      mp_mul_2(&a, &a);
    }
 
 /* test mp_reduce_2k */
    printf("Testing mp_reduce_2k...\n");
    for (cnt = 3; cnt <= 128; ++cnt) {
-       mp_digit tmp;
-       mp_2expt(&a, cnt);
-       mp_sub_d(&a, 2, &a);  /* a = 2**cnt - 2 */
-
-
-       printf("\nTesting %4d bits", cnt);
-       printf("(%d)", mp_reduce_is_2k(&a));
-       mp_reduce_2k_setup(&a, &tmp);
-       printf("(%d)", tmp);
-       for (ix = 0; ix < 1000; ix++) {
-           if (!(ix & 127)) {printf("."); fflush(stdout); }
-           mp_rand(&b, (cnt/DIGIT_BIT  + 1) * 2);
-           mp_copy(&c, &b);
-           mp_mod(&c, &a, &c);
-           mp_reduce_2k(&b, &a, 1);
-           if (mp_cmp(&c, &b)) {
-              printf("FAILED\n");
-              exit(0);
-           }
-        }
-    }
+      mp_digit tmp;
+
+      mp_2expt(&a, cnt);
+      mp_sub_d(&a, 2, &a);	/* a = 2**cnt - 2 */
+
+
+      printf("\nTesting %4d bits", cnt);
+      printf("(%d)", mp_reduce_is_2k(&a));
+      mp_reduce_2k_setup(&a, &tmp);
+      printf("(%d)", tmp);
+      for (ix = 0; ix < 1000; ix++) {
+	 if (!(ix & 127)) {
+	    printf(".");
+	    fflush(stdout);
+	 }
+	 mp_rand(&b, (cnt / DIGIT_BIT + 1) * 2);
+	 mp_copy(&c, &b);
+	 mp_mod(&c, &a, &c);
+	 mp_reduce_2k(&b, &a, 2);
+	 if (mp_cmp(&c, &b)) {
+	    printf("FAILED\n");
+	    exit(0);
+	 }
+      }
+   }
 
 /* test mp_div_3  */
    printf("Testing mp_div_3...\n");
    mp_set(&d, 3);
-   for (cnt = 0; cnt < 10000; ) {
+   for (cnt = 0; cnt < 10000;) {
       mp_digit r1, r2;
 
-      if (!(++cnt & 127)) printf("%9d\r", cnt);
+      if (!(++cnt & 127))
+	 printf("%9d\r", cnt);
       mp_rand(&a, abs(rand()) % 128 + 1);
       mp_div(&a, &d, &b, &e);
       mp_div_3(&a, &c, &r2);
 
       if (mp_cmp(&b, &c) || mp_cmp_d(&e, r2)) {
-         printf("\n\nmp_div_3 => Failure\n");
+	 printf("\n\nmp_div_3 => Failure\n");
       }
    }
    printf("\n\nPassed div_3 testing\n");
@@ -246,270 +299,438 @@ int main(void)
 /* test the DR reduction */
    printf("testing mp_dr_reduce...\n");
    for (cnt = 2; cnt < 32; cnt++) {
-       printf("%d digit modulus\n", cnt);
-       mp_grow(&a, cnt);
-       mp_zero(&a);
-       for (ix = 1; ix < cnt; ix++) {
-           a.dp[ix] = MP_MASK;
-       }
-       a.used = cnt;
-       a.dp[0] = 3;
-
-       mp_rand(&b, cnt - 1);
-       mp_copy(&b, &c);
+      printf("%d digit modulus\n", cnt);
+      mp_grow(&a, cnt);
+      mp_zero(&a);
+      for (ix = 1; ix < cnt; ix++) {
+	 a.dp[ix] = MP_MASK;
+      }
+      a.used = cnt;
+      a.dp[0] = 3;
+
+      mp_rand(&b, cnt - 1);
+      mp_copy(&b, &c);
 
       rr = 0;
       do {
-         if (!(rr & 127)) { printf("%9lu\r", rr); fflush(stdout); }
-         mp_sqr(&b, &b); mp_add_d(&b, 1, &b);
-         mp_copy(&b, &c);
-
-         mp_mod(&b, &a, &b);
-         mp_dr_reduce(&c, &a, (((mp_digit)1)<<DIGIT_BIT)-a.dp[0]);
-
-         if (mp_cmp(&b, &c) != MP_EQ) {
-            printf("Failed on trial %lu\n", rr); exit(-1);
-
-         }
+	 if (!(rr & 127)) {
+	    printf("%9lu\r", rr);
+	    fflush(stdout);
+	 }
+	 mp_sqr(&b, &b);
+	 mp_add_d(&b, 1, &b);
+	 mp_copy(&b, &c);
+
+	 mp_mod(&b, &a, &b);
+	 mp_dr_reduce(&c, &a, (((mp_digit) 1) << DIGIT_BIT) - a.dp[0]);
+
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("Failed on trial %lu\n", rr);
+	    exit(-1);
+
+	 }
       } while (++rr < 500);
       printf("Passed DR test for %d digits\n", cnt);
    }
 
 #endif
 
+/* test the mp_reduce_2k_l code */
+#if 0
+#if 0
+/* first load P with 2^1024 - 0x2A434 B9FDEC95 D8F9D550 FFFFFFFF FFFFFFFF */
+   mp_2expt(&a, 1024);
+   mp_read_radix(&b, "2A434B9FDEC95D8F9D550FFFFFFFFFFFFFFFF", 16);
+   mp_sub(&a, &b, &a);
+#elif 1
+/*  p = 2^2048 - 0x1 00000000 00000000 00000000 00000000 4945DDBF 8EA2A91D 5776399B B83E188F  */
+   mp_2expt(&a, 2048);
+   mp_read_radix(&b,
+		 "1000000000000000000000000000000004945DDBF8EA2A91D5776399BB83E188F",
+		 16);
+   mp_sub(&a, &b, &a);
+#endif
+
+   mp_todecimal(&a, buf);
+   printf("p==%s\n", buf);
+/* now mp_reduce_is_2k_l() should return */
+   if (mp_reduce_is_2k_l(&a) != 1) {
+      printf("mp_reduce_is_2k_l() return 0, should be 1\n");
+      return EXIT_FAILURE;
+   }
+   mp_reduce_2k_setup_l(&a, &d);
+   /* now do a million square+1 to see if it varies */
+   mp_rand(&b, 64);
+   mp_mod(&b, &a, &b);
+   mp_copy(&b, &c);
+   printf("testing mp_reduce_2k_l...");
+   fflush(stdout);
+   for (cnt = 0; cnt < (1UL << 20); cnt++) {
+      mp_sqr(&b, &b);
+      mp_add_d(&b, 1, &b);
+      mp_reduce_2k_l(&b, &a, &d);
+      mp_sqr(&c, &c);
+      mp_add_d(&c, 1, &c);
+      mp_mod(&c, &a, &c);
+      if (mp_cmp(&b, &c) != MP_EQ) {
+	 printf("mp_reduce_2k_l() failed at step %lu\n", cnt);
+	 mp_tohex(&b, buf);
+	 printf("b == %s\n", buf);
+	 mp_tohex(&c, buf);
+	 printf("c == %s\n", buf);
+	 return EXIT_FAILURE;
+      }
+   }
+   printf("...Passed\n");
+#endif
+
    div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
-   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= 0;
+      sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n =
+      sub_d_n = 0;
 
    /* force KARA and TOOM to enable despite cutoffs */
-   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
-   TOOM_SQR_CUTOFF      = TOOM_MUL_CUTOFF      = 150;
+   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 8;
+   TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 16;
 
    for (;;) {
-       /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
-       switch (abs(rand()) % 7) {
-           case 0:  mp_clear(&a); mp_init(&a); break;
-           case 1:  mp_clear(&b); mp_init(&b); break;
-           case 2:  mp_clear(&c); mp_init(&c); break;
-           case 3:  mp_clear(&d); mp_init(&d); break;
-           case 4:  mp_clear(&e); mp_init(&e); break;
-           case 5:  mp_clear(&f); mp_init(&f); break;
-           case 6:  break; /* don't clear any */
-       }
-
-
-       printf("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n);
-       fgets(cmd, 4095, stdin);
-       cmd[strlen(cmd)-1] = 0;
-       printf("%s  ]\r",cmd); fflush(stdout);
-       if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
-
-          mp_mul_2d(&a, rr, &a);
-          a.sign = b.sign;
-          if (mp_cmp(&a, &b) != MP_EQ) {
-             printf("mul2d failed, rr == %d\n",rr);
-             draw(&a);
-             draw(&b);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "div2d")) { ++div2d_n;
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
-
-          mp_div_2d(&a, rr, &a, &e);
-          a.sign = b.sign;
-          if (a.used == b.used && a.used == 0) { a.sign = b.sign = MP_ZPOS; }
-          if (mp_cmp(&a, &b) != MP_EQ) {
-             printf("div2d failed, rr == %d\n",rr);
-             draw(&a);
-             draw(&b);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "add")) { ++add_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-          mp_copy(&a, &d);
-          mp_add(&d, &b, &d);
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("add %lu failure!\n", add_n);
-draw(&a);draw(&b);draw(&c);draw(&d);
-             return 0;
-          }
-
-          /* test the sign/unsigned storage functions */
-
-          rr = mp_signed_bin_size(&c);
-          mp_to_signed_bin(&c, (unsigned char *)cmd);
-          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
-          mp_read_signed_bin(&d, (unsigned char *)cmd, rr);
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("mp_signed_bin failure!\n");
-             draw(&c);
-             draw(&d);
-             return 0;
-          }
-
-
-          rr = mp_unsigned_bin_size(&c);
-          mp_to_unsigned_bin(&c, (unsigned char *)cmd);
-          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
-          mp_read_unsigned_bin(&d, (unsigned char *)cmd, rr);
-          if (mp_cmp_mag(&c, &d) != MP_EQ) {
-             printf("mp_unsigned_bin failure!\n");
-             draw(&c);
-             draw(&d);
-             return 0;
-          }
-
-       } else if (!strcmp(cmd, "sub")) { ++sub_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-          mp_copy(&a, &d);
-          mp_sub(&d, &b, &d);
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("sub %lu failure!\n", sub_n);
-draw(&a);draw(&b);draw(&c);draw(&d);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "mul")) { ++mul_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-          mp_copy(&a, &d);
-          mp_mul(&d, &b, &d);
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("mul %lu failure!\n", mul_n);
-draw(&a);draw(&b);draw(&c);draw(&d);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "div")) { ++div_n;
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64);
-          fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 64);
-
-          mp_div(&a, &b, &e, &f);
-          if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) {
-             printf("div %lu %d, %d, failure!\n", div_n, mp_cmp(&c, &e), mp_cmp(&d, &f));
-draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
-             return 0;
-          }
-
-       } else if (!strcmp(cmd, "sqr")) { ++sqr_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          mp_copy(&a, &c);
-          mp_sqr(&c, &c);
-          if (mp_cmp(&b, &c) != MP_EQ) {
-             printf("sqr %lu failure!\n", sqr_n);
-draw(&a);draw(&b);draw(&c);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "gcd")) { ++gcd_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-          mp_copy(&a, &d);
-          mp_gcd(&d, &b, &d);
-          d.sign = c.sign;
-          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("gcd %lu failure!\n", gcd_n);
-draw(&a);draw(&b);draw(&c);draw(&d);
-             return 0;
-          }
-       } else if (!strcmp(cmd, "lcm")) { ++lcm_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-             mp_copy(&a, &d);
-             mp_lcm(&d, &b, &d);
-             d.sign = c.sign;
-             if (mp_cmp(&c, &d) != MP_EQ) {
-                printf("lcm %lu failure!\n", lcm_n);
-   draw(&a);draw(&b);draw(&c);draw(&d);
-                return 0;
-             }
-       } else if (!strcmp(cmd, "expt")) {  ++expt_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&d, buf, 64);
-             mp_copy(&a, &e);
-             mp_exptmod(&e, &b, &c, &e);
-             if (mp_cmp(&d, &e) != MP_EQ) {
-                printf("expt %lu failure!\n", expt_n);
-   draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);
-                return 0;
-             }
-       } else if (!strcmp(cmd, "invmod")) {  ++inv_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
-             mp_invmod(&a, &b, &d);
-             mp_mulmod(&d,&a,&b,&e);
-             if (mp_cmp_d(&e, 1) != MP_EQ) {
-                printf("inv [wrong value from MPI?!] failure\n");
-                draw(&a);draw(&b);draw(&c);draw(&d);
-                mp_gcd(&a, &b, &e);
-                draw(&e);
-                return 0;
-             }
-
-       } else if (!strcmp(cmd, "div2")) { ++div2_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             mp_div_2(&a, &c);
-             if (mp_cmp(&c, &b) != MP_EQ) {
-                 printf("div_2 %lu failure\n", div2_n);
-                 draw(&a);
-                 draw(&b);
-                 draw(&c);
-                 return 0;
-             }
-       } else if (!strcmp(cmd, "mul2")) { ++mul2_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
-             mp_mul_2(&a, &c);
-             if (mp_cmp(&c, &b) != MP_EQ) {
-                 printf("mul_2 %lu failure\n", mul2_n);
-                 draw(&a);
-                 draw(&b);
-                 draw(&c);
-                 return 0;
-             }
-       } else if (!strcmp(cmd, "add_d")) { ++add_d_n;
-              fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
-              fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
-              mp_add_d(&a, ix, &c);
-              if (mp_cmp(&b, &c) != MP_EQ) {
-                 printf("add_d %lu failure\n", add_d_n);
-                 draw(&a);
-                 draw(&b);
-                 draw(&c);
-                 printf("d == %d\n", ix);
-                 return 0;
-              }
-       } else if (!strcmp(cmd, "sub_d")) { ++sub_d_n;
-              fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
-              fgets(buf, 4095, stdin); sscanf(buf, "%d", &ix);
-              fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
-              mp_sub_d(&a, ix, &c);
-              if (mp_cmp(&b, &c) != MP_EQ) {
-                 printf("sub_d %lu failure\n", sub_d_n);
-                 draw(&a);
-                 draw(&b);
-                 draw(&c);
-                 printf("d == %d\n", ix);
-                 return 0;
-              }
-       }
+      /* randomly clear and re-init one variable; this has the effect of trimming the alloc space */
+      switch (abs(rand()) % 7) {
+      case 0:
+	 mp_clear(&a);
+	 mp_init(&a);
+	 break;
+      case 1:
+	 mp_clear(&b);
+	 mp_init(&b);
+	 break;
+      case 2:
+	 mp_clear(&c);
+	 mp_init(&c);
+	 break;
+      case 3:
+	 mp_clear(&d);
+	 mp_init(&d);
+	 break;
+      case 4:
+	 mp_clear(&e);
+	 mp_init(&e);
+	 break;
+      case 5:
+	 mp_clear(&f);
+	 mp_init(&f);
+	 break;
+      case 6:
+	 break;			/* don't clear any */
+      }
+
+
+      printf
+	 ("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ",
+	  add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n,
+	  expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n);
+      fgets(cmd, 4095, stdin);
+      cmd[strlen(cmd) - 1] = 0;
+      printf("%s  ]\r", cmd);
+      fflush(stdout);
+      if (!strcmp(cmd, "mul2d")) {
+	 ++mul2d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &rr);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+
+	 mp_mul_2d(&a, rr, &a);
+	 a.sign = b.sign;
+	 if (mp_cmp(&a, &b) != MP_EQ) {
+	    printf("mul2d failed, rr == %d\n", rr);
+	    draw(&a);
+	    draw(&b);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "div2d")) {
+	 ++div2d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &rr);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+
+	 mp_div_2d(&a, rr, &a, &e);
+	 a.sign = b.sign;
+	 if (a.used == b.used && a.used == 0) {
+	    a.sign = b.sign = MP_ZPOS;
+	 }
+	 if (mp_cmp(&a, &b) != MP_EQ) {
+	    printf("div2d failed, rr == %d\n", rr);
+	    draw(&a);
+	    draw(&b);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "add")) {
+	 ++add_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_add(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("add %lu failure!\n", add_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+
+	 /* test the sign/unsigned storage functions */
+
+	 rr = mp_signed_bin_size(&c);
+	 mp_to_signed_bin(&c, (unsigned char *) cmd);
+	 memset(cmd + rr, rand() & 255, sizeof(cmd) - rr);
+	 mp_read_signed_bin(&d, (unsigned char *) cmd, rr);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("mp_signed_bin failure!\n");
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+
+
+	 rr = mp_unsigned_bin_size(&c);
+	 mp_to_unsigned_bin(&c, (unsigned char *) cmd);
+	 memset(cmd + rr, rand() & 255, sizeof(cmd) - rr);
+	 mp_read_unsigned_bin(&d, (unsigned char *) cmd, rr);
+	 if (mp_cmp_mag(&c, &d) != MP_EQ) {
+	    printf("mp_unsigned_bin failure!\n");
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+
+      } else if (!strcmp(cmd, "sub")) {
+	 ++sub_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_sub(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("sub %lu failure!\n", sub_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "mul")) {
+	 ++mul_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_mul(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("mul %lu failure!\n", mul_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "div")) {
+	 ++div_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&d, buf, 64);
+
+	 mp_div(&a, &b, &e, &f);
+	 if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) {
+	    printf("div %lu %d, %d, failure!\n", div_n, mp_cmp(&c, &e),
+		   mp_cmp(&d, &f));
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    draw(&e);
+	    draw(&f);
+	    return 0;
+	 }
+
+      } else if (!strcmp(cmd, "sqr")) {
+	 ++sqr_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_copy(&a, &c);
+	 mp_sqr(&c, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("sqr %lu failure!\n", sqr_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "gcd")) {
+	 ++gcd_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_gcd(&d, &b, &d);
+	 d.sign = c.sign;
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("gcd %lu failure!\n", gcd_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "lcm")) {
+	 ++lcm_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_lcm(&d, &b, &d);
+	 d.sign = c.sign;
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("lcm %lu failure!\n", lcm_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "expt")) {
+	 ++expt_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&d, buf, 64);
+	 mp_copy(&a, &e);
+	 mp_exptmod(&e, &b, &c, &e);
+	 if (mp_cmp(&d, &e) != MP_EQ) {
+	    printf("expt %lu failure!\n", expt_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    draw(&e);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "invmod")) {
+	 ++inv_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_invmod(&a, &b, &d);
+	 mp_mulmod(&d, &a, &b, &e);
+	 if (mp_cmp_d(&e, 1) != MP_EQ) {
+	    printf("inv [wrong value from MPI?!] failure\n");
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    mp_gcd(&a, &b, &e);
+	    draw(&e);
+	    return 0;
+	 }
+
+      } else if (!strcmp(cmd, "div2")) {
+	 ++div2_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_div_2(&a, &c);
+	 if (mp_cmp(&c, &b) != MP_EQ) {
+	    printf("div_2 %lu failure\n", div2_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "mul2")) {
+	 ++mul2_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_mul_2(&a, &c);
+	 if (mp_cmp(&c, &b) != MP_EQ) {
+	    printf("mul_2 %lu failure\n", mul2_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "add_d")) {
+	 ++add_d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &ix);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_add_d(&a, ix, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("add_d %lu failure\n", add_d_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    printf("d == %d\n", ix);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "sub_d")) {
+	 ++sub_d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &ix);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_sub_d(&a, ix, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("sub_d %lu failure\n", sub_d_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    printf("d == %d\n", ix);
+	    return 0;
+	 }
+      }
    }
    return 0;
 }
-
diff --git a/libtommath/demo/timing.c b/libtommath/demo/timing.c
index 7b27d53..bb3be52 100644
--- a/libtommath/demo/timing.c
+++ b/libtommath/demo/timing.c
@@ -11,15 +11,16 @@ ulong64 _tt;
 #endif
 
 
-void ndraw(mp_int *a, char *name)
+void ndraw(mp_int * a, char *name)
 {
    char buf[4096];
+   /* print "name: " then a via mp_toradix(.., 64); no length is passed, so the text must fit in buf */
    printf("%s: ", name);
    mp_toradix(a, buf, 64);
    printf("%s\n", buf);
 }
 
-static void draw(mp_int *a)
+static void draw(mp_int * a)	/* ndraw() with an empty label */
 {
    ndraw(a, "");
 }
@@ -39,35 +40,38 @@ int lbit(void)
 }
 
 /* RDTSC from Scott Duplichan */
-static ulong64 TIMFUNC (void)
-   {
-   #if defined __GNUC__
-      #if defined(__i386__) || defined(__x86_64__)
-         unsigned long long a;
-         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
-         return a;
-      #else /* gcc-IA64 version */
-         unsigned long result;
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         while (__builtin_expect ((int) result == -1, 0))
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         return result;
-      #endif
+static ulong64 TIMFUNC(void)
+{
+#if defined __GNUC__
+#if defined(__i386__) || defined(__x86_64__)
+   unsigned int lo, hi;	/* RDTSC returns the counter in EDX:EAX */
+
+   __asm__ __volatile__("rdtsc":"=a"(lo), "=d"(hi));
+   return ((ulong64) hi << 32) | (ulong64) lo;
+#else /* gcc-IA64 version */
+   unsigned long result;
+   __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory");
+   /* re-read for as long as the value reads back as -1 */
+   while (__builtin_expect((int) result == -1, 0))
+      __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory");
+
+   return result;
+#endif
 
    // Microsoft and Intel Windows compilers
-   #elif defined _M_IX86
-     __asm rdtsc
-   #elif defined _M_AMD64
-     return __rdtsc ();
-   #elif defined _M_IA64
-     #if defined __INTEL_COMPILER
-       #include <ia64intrin.h>
-     #endif
-      return __getReg (3116);
-   #else
-     #error need rdtsc function for this build
-   #endif
-   }
+#elif defined _M_IX86
+   __asm rdtsc
+#elif defined _M_AMD64
+   return __rdtsc();
+#elif defined _M_IA64
+#if defined __INTEL_COMPILER
+#include <ia64intrin.h>
+#endif
+   return __getReg(3116);
+#else
+#error need rdtsc function for this build
+#endif
+}
 
 #define DO(x) x; x;
 //#define DO4(x) DO2(x); DO2(x);
@@ -77,7 +81,7 @@ static ulong64 TIMFUNC (void)
 int main(void)
 {
    ulong64 tt, gg, CLK_PER_SEC;
-   FILE *log, *logb, *logc;
+   FILE *log, *logb, *logc, *logd;
    mp_int a, b, c, d, e, f;
    int n, cnt, ix, old_kara_m, old_kara_s;
    unsigned rr;
@@ -90,168 +94,191 @@ int main(void)
    mp_init(&f);
 
    srand(time(NULL));
- 
-
-      /* temp. turn off TOOM */
-      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
-
-      CLK_PER_SEC = TIMFUNC();
-      sleep(1);
-      CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;
-
-      printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
-      
-      log = fopen("logs/add.log", "w");
-      for (cnt = 8; cnt <= 128; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_add(&a,&b,&c));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 100000);
-         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt); fflush(log);
-      }
-      fclose(log);
 
-      log = fopen("logs/sub.log", "w");
-      for (cnt = 8; cnt <= 128; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_sub(&a,&b,&c));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 100000);
-
-         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);  fflush(log);
-      }
-      fclose(log);
+
+   /* temp. turn off TOOM */
+   TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
+
+   CLK_PER_SEC = TIMFUNC();
+   sleep(1);
+   CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;
+
+   printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
+   goto exptmod;
+   log = fopen("logs/add.log", "w");
+   for (cnt = 8; cnt <= 128; cnt += 8) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+      rr = 0;
+      tt = -1;
+      do {
+	 gg = TIMFUNC();
+	 DO(mp_add(&a, &b, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
+      } while (++rr < 100000);
+      printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
+      fflush(log);
+   }
+   fclose(log);
+
+   log = fopen("logs/sub.log", "w");
+   for (cnt = 8; cnt <= 128; cnt += 8) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+      rr = 0;
+      tt = -1;
+      do {
+	 gg = TIMFUNC();
+	 DO(mp_sub(&a, &b, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
+      } while (++rr < 100000);
+
+      printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
+      fflush(log);
+   }
+   fclose(log);
 
    /* do mult/square twice, first without karatsuba and second with */
+ multtest:
    old_kara_m = KARATSUBA_MUL_CUTOFF;
    old_kara_s = KARATSUBA_SQR_CUTOFF;
-   for (ix = 0; ix < 1; ix++) {
-      printf("With%s Karatsuba\n", (ix==0)?"out":"");
-
-      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
-      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
-
-      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
-      for (cnt = 4; cnt <= 288; cnt += 2) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_mul(&a, &b, &c));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 100);
-         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
+   for (ix = 0; ix < 2; ix++) {
+      printf("With%s Karatsuba\n", (ix == 0) ? "out" : "");
+
+      KARATSUBA_MUL_CUTOFF = (ix == 0) ? 9999 : old_kara_m;
+      KARATSUBA_SQR_CUTOFF = (ix == 0) ? 9999 : old_kara_s;
+
+      log = fopen((ix == 0) ? "logs/mult.log" : "logs/mult_kara.log", "w");
+      for (cnt = 4; cnt <= 10240 / DIGIT_BIT; cnt += 2) {
+	 SLEEP;
+	 mp_rand(&a, cnt);
+	 mp_rand(&b, cnt);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_mul(&a, &b, &c));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 100);
+	 printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);
+	 fflush(log);
       }
       fclose(log);
 
-      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
-      for (cnt = 4; cnt <= 288; cnt += 2) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_sqr(&a, &b));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 100);
-         printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
+      log = fopen((ix == 0) ? "logs/sqr.log" : "logs/sqr_kara.log", "w");
+      for (cnt = 4; cnt <= 10240 / DIGIT_BIT; cnt += 2) {
+	 SLEEP;
+	 mp_rand(&a, cnt);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_sqr(&a, &b));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 100);
+	 printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);
+	 fflush(log);
       }
       fclose(log);
 
    }
+ exptmod:
 
-  {
+   {
       char *primes[] = {
-         /* 2K moduli mersenne primes */
-         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
-         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
-         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
-         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
-         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
-         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
-
-         /* DR moduli */
-         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
-         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
-         "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
-         "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
-         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
-         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
-         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
-
-         /* generic unrestricted moduli */
-         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
-         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
-         "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
-         "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
-         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
-         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
-         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
-         NULL
+	 /* 2K large moduli */
+	 "179769313486231590772930519078902473361797697894230657273430081157732675805500963132708477322407536021120113879871393357658789768814416622492847430639474124377767893424865485276302219601246094119453082952085005768838150682342462881473913110540827237163350510684586239334100047359817950870678242457666208137217",
+	 "32317006071311007300714876688669951960444102669715484032130345427524655138867890893197201411522913463688717960921898019494119559150490921095088152386448283120630877367300996091750197750389652106796057638384067568276792218642619756161838094338476170470581645852036305042887575891541065808607552399123930385521914333389668342420684974786564569494856176035326322058077805659331026192708460314150258592864177116725943603718461857357598351152301645904403697613233287231227125684710820209725157101726931323469678542580656697935045997268352998638099733077152121140120031150424541696791951097529546801429027668869927491725169",
+	 "1044388881413152506691752710716624382579964249047383780384233483283953907971557456848826811934997558340890106714439262837987573438185793607263236087851365277945956976543709998340361590134383718314428070011855946226376318839397712745672334684344586617496807908705803704071284048740118609114467977783598029006686938976881787785946905630190260940599579453432823469303026696443059025015972399867714215541693835559885291486318237914434496734087811872639496475100189041349008417061675093668333850551032972088269550769983616369411933015213796825837188091833656751221318492846368125550225998300412344784862595674492194617023806505913245610825731835380087608622102834270197698202313169017678006675195485079921636419370285375124784014907159135459982790513399611551794271106831134090584272884279791554849782954323534517065223269061394905987693002122963395687782878948440616007412945674919823050571642377154816321380631045902916136926708342856440730447899971901781465763473223850267253059899795996090799469201774624817718449867455659250178329070473119433165550807568221846571746373296884912819520317457002440926616910874148385078411929804522981857338977648103126085902995208257421855249796721729039744118165938433694823325696642096892124547425283",
+	 /* 2K moduli mersenne primes */
+	 "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
+	 "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
+	 "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
+	 "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
+	 "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
+	 "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
+
+	 /* DR moduli */
+	 "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
+	 "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
+	 "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
+	 "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
+	 "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
+	 "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
+	 "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
+
+	 /* generic unrestricted moduli */
+	 "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
+	 "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
+	 "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
+	 "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
+	 "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
+	 "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
+	 "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
+	 NULL
       };
-   log = fopen("logs/expt.log", "w");
-   logb = fopen("logs/expt_dr.log", "w");
-   logc = fopen("logs/expt_2k.log", "w");
-   for (n = 0; primes[n]; n++) {
-      SLEEP;
-      mp_read_radix(&a, primes[n], 10);
-      mp_zero(&b);
-      for (rr = 0; rr < (unsigned)mp_count_bits(&a); rr++) {
-         mp_mul_2(&b, &b);
-         b.dp[0] |= lbit();
-         b.used  += 1;
-      }
-      mp_sub_d(&a, 1, &c);
-      mp_mod(&b, &c, &b);
-      mp_set(&c, 3);
-         rr = 0;
-         tt = -1;
-         do {
-            gg = TIMFUNC();
-            DO(mp_exptmod(&c, &b, &a, &d));
-            gg = (TIMFUNC() - gg)>>1;
-            if (tt > gg) tt = gg;
-         } while (++rr < 10);
-      mp_sub_d(&a, 1, &e);
-      mp_sub(&e, &b, &b);
-      mp_exptmod(&c, &b, &a, &e);  /* c^(p-1-b) mod a */
-      mp_mulmod(&e, &d, &a, &d);   /* c^b * c^(p-1-b) == c^p-1 == 1 */
-      if (mp_cmp_d(&d, 1)) {
-         printf("Different (%d)!!!\n", mp_count_bits(&a));
-         draw(&d);
-         exit(0);
+      log = fopen("logs/expt.log", "w");
+      logb = fopen("logs/expt_dr.log", "w");
+      logc = fopen("logs/expt_2k.log", "w");
+      logd = fopen("logs/expt_2kl.log", "w");
+      for (n = 0; primes[n]; n++) {
+	 SLEEP;
+	 mp_read_radix(&a, primes[n], 10);
+	 mp_zero(&b);
+	 for (rr = 0; rr < (unsigned) mp_count_bits(&a); rr++) {
+	    mp_mul_2(&b, &b);
+	    b.dp[0] |= lbit();
+	    b.used += 1;
+	 }
+	 mp_sub_d(&a, 1, &c);
+	 mp_mod(&b, &c, &b);
+	 mp_set(&c, 3);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_exptmod(&c, &b, &a, &d));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 10);
+	 mp_sub_d(&a, 1, &e);
+	 mp_sub(&e, &b, &b);
+	 mp_exptmod(&c, &b, &a, &e);	/* c^(p-1-b) mod a */
+	 mp_mulmod(&e, &d, &a, &d);	/* c^b * c^(p-1-b) == c^p-1 == 1 */
+	 if (mp_cmp_d(&d, 1)) {
+	    printf("Different (%d)!!!\n", mp_count_bits(&a));
+	    draw(&d);
+	    exit(0);
+	 }
+	 printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(n < 4 ? logd : (n < 9) ? logc : (n < 16) ? logb : log,
+		 "%d %9llu\n", mp_count_bits(&a), tt);
       }
-      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), tt);
-   }
    }
    fclose(log);
    fclose(logb);
    fclose(logc);
+   fclose(logd);
 
    log = fopen("logs/invmod.log", "w");
    for (cnt = 4; cnt <= 128; cnt += 4) {
@@ -260,28 +287,29 @@ int main(void)
       mp_rand(&b, cnt);
 
       do {
-         mp_add_d(&b, 1, &b);
-         mp_gcd(&a, &b, &c);
+	 mp_add_d(&b, 1, &b);
+	 mp_gcd(&a, &b, &c);
       } while (mp_cmp_d(&c, 1) != MP_EQ);
 
-         rr = 0;
-         tt = -1;
+      rr = 0;
+      tt = -1;
       do {
-         gg = TIMFUNC();
-         DO(mp_invmod(&b, &a, &c));
-         gg = (TIMFUNC() - gg)>>1;
-         if (tt > gg) tt = gg;
+	 gg = TIMFUNC();
+	 DO(mp_invmod(&b, &a, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
       } while (++rr < 1000);
       mp_mulmod(&b, &c, &a, &d);
       if (mp_cmp_d(&d, 1) != MP_EQ) {
-         printf("Failed to invert\n");
-         return 0;
+	 printf("Failed to invert\n");
+	 return 0;
       }
-      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
-      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);
+      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
    }
    fclose(log);
 
    return 0;
 }
-
diff --git a/libtommath/dep.pl b/libtommath/dep.pl
index 22266e3..c39e27e 100644
--- a/libtommath/dep.pl
+++ b/libtommath/dep.pl
@@ -13,6 +13,8 @@ print CLASS "#if !(defined(LTM1) && defined(LTM2) && defined(LTM3))\n#if defined
 foreach my $filename (glob "bn*.c") {
    my $define = $filename;
 
+print "Processing $filename\n";
+
    # convert filename to upper case so we can use it as a define 
    $define =~ tr/[a-z]/[A-Z]/;
    $define =~ tr/\./_/;
diff --git a/libtommath/etc/2kprime.c b/libtommath/etc/2kprime.c
index d48b83e..67a2777 100644
--- a/libtommath/etc/2kprime.c
+++ b/libtommath/etc/2kprime.c
@@ -73,8 +73,3 @@ int main(void)
    
    return 0;
 }   
-       
-         
-            
-            
-          
diff --git a/libtommath/etc/drprime.c b/libtommath/etc/drprime.c
index 0ab8ea6..0d0fdb9 100644
--- a/libtommath/etc/drprime.c
+++ b/libtommath/etc/drprime.c
@@ -57,4 +57,3 @@ int main(void)
    
    return 0;
 }
-
diff --git a/libtommath/etc/drprimes.txt b/libtommath/etc/drprimes.txt
index 2c887ea..7c97f67 100644
--- a/libtommath/etc/drprimes.txt
+++ b/libtommath/etc/drprimes.txt
@@ -1,6 +1,9 @@
-280-bit prime:
-p == 1942668892225729070919461906823518906642406839052139521251812409738904285204940164839
+300-bit prime:
+p == 2037035976334486086268445688409378161051468393665936250636140449354381298610415201576637819
 
-532-bit prime:
-p == 14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368691747
+540-bit prime:
+p == 3599131035634557106248430806148785487095757694641533306480604458089470064537190296255232548883112685719936728506816716098566612844395439751206810991770626477344739
+
+780-bit prime:
+p == 6359114106063703798370219984742410466332205126109989319225557147754704702203399726411277962562135973685197744935448875852478791860694279747355800678568677946181447581781401213133886609947027230004277244697462656003655947791725966271167
 
diff --git a/libtommath/etc/makefile.icc b/libtommath/etc/makefile.icc
index 0a50728..8a1ffff 100644
--- a/libtommath/etc/makefile.icc
+++ b/libtommath/etc/makefile.icc
@@ -16,7 +16,7 @@ CFLAGS += -I../
 #   B - Blend of P4 and PM [mobile]
 #
 # Default to just generic max opts
-CFLAGS += -O3 -xN -ip
+CFLAGS += -O3 -xP -ip
 
 # default lib name (requires install with root)
 # LIBNAME=-ltommath
diff --git a/libtommath/etc/mersenne.c b/libtommath/etc/mersenne.c
index 1cd5b50..28ac834 100644
--- a/libtommath/etc/mersenne.c
+++ b/libtommath/etc/mersenne.c
@@ -1,6 +1,6 @@
 /* Finds Mersenne primes using the Lucas-Lehmer test 
  *
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
  */
 #include <time.h>
 #include <tommath.h>
diff --git a/libtommath/etc/mont.c b/libtommath/etc/mont.c
index dbf1735..7839675 100644
--- a/libtommath/etc/mont.c
+++ b/libtommath/etc/mont.c
@@ -39,8 +39,3 @@ int main(void)
     
     return 0;
 }
-
-
-
-
-
diff --git a/libtommath/etc/pprime.c b/libtommath/etc/pprime.c
index 26e0d84..955f19e 100644
--- a/libtommath/etc/pprime.c
+++ b/libtommath/etc/pprime.c
@@ -1,8 +1,8 @@
 /* Generates provable primes
  *
- * See http://iahu.ca:8080/papers/pp.pdf for more info.
+ * See http://gmail.com:8080/papers/pp.pdf for more info.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://tom.iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com, http://tom.gmail.com
  */
 #include <time.h>
 #include "tommath.h"
diff --git a/libtommath/etc/timer.asm b/libtommath/etc/timer.asm
index 35890d9..326a947 100644
--- a/libtommath/etc/timer.asm
+++ b/libtommath/etc/timer.asm
@@ -1,37 +1,37 @@
-; x86 timer in NASM
-;
-; Tom St Denis, tomstdenis@iahu.ca
-[bits 32]
-[section .data]
-time dd 0, 0
-
-[section .text]
-
-%ifdef USE_ELF
-[global t_start]
-t_start:
-%else
-[global _t_start]
-_t_start:
-%endif
-   push edx
-   push eax
-   rdtsc
-   mov [time+0],edx
-   mov [time+4],eax
-   pop eax
-   pop edx
-   ret
-   
-%ifdef USE_ELF
-[global t_read]
-t_read:
-%else
-[global _t_read]
-_t_read:
-%endif
-   rdtsc
-   sub eax,[time+4]
-   sbb edx,[time+0]
-   ret
+; x86 timer in NASM
+;
+; Tom St Denis, tomstdenis@iahu.ca
+[bits 32]
+[section .data]
+time dd 0, 0
+
+[section .text]
+
+%ifdef USE_ELF
+[global t_start]
+t_start:
+%else
+[global _t_start]
+_t_start:
+%endif
+   push edx
+   push eax
+   rdtsc
+   mov [time+0],edx
+   mov [time+4],eax
+   pop eax
+   pop edx
+   ret
+   
+%ifdef USE_ELF
+[global t_read]
+t_read:
+%else
+[global _t_read]
+_t_read:
+%endif
+   rdtsc
+   sub eax,[time+4]
+   sbb edx,[time+0]
+   ret
    
 \ No newline at end of file
diff --git a/libtommath/etc/tune.c b/libtommath/etc/tune.c
index 14aace2..acb146f 100644
--- a/libtommath/etc/tune.c
+++ b/libtommath/etc/tune.c
@@ -1,6 +1,6 @@
 /* Tune the Karatsuba parameters
  *
- * Tom St Denis, tomstdenis@iahu.ca
+ * Tom St Denis, tomstdenis@gmail.com
  */
 #include <tommath.h>
 #include <time.h>
@@ -10,13 +10,44 @@
  */
 #define TIMES (1UL<<14UL)
 
+/* RDTSC from Scott Duplichan */
+static ulong64 TIMFUNC (void)
+   {
+   #if defined __GNUC__
+      #if defined(__i386__) || defined(__x86_64__)
+         unsigned long long a;
+         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
+         return a;
+      #else /* gcc-IA64 version */
+         unsigned long result;
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         while (__builtin_expect ((int) result == -1, 0))
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         return result;
+      #endif
+
+   // Microsoft and Intel Windows compilers
+   #elif defined _M_IX86
+     __asm rdtsc
+   #elif defined _M_AMD64
+     return __rdtsc ();
+   #elif defined _M_IA64
+     #if defined __INTEL_COMPILER
+       #include <ia64intrin.h>
+     #endif
+      return __getReg (3116);
+   #else
+     #error need rdtsc function for this build
+   #endif
+   }
+
 
 #ifndef X86_TIMER
 
 /* generic ISO C timer */
 ulong64 LBL_T;
-void t_start(void) { LBL_T = clock(); }
-ulong64 t_read(void) { return clock() - LBL_T; }
+void t_start(void) { LBL_T = TIMFUNC(); }
+ulong64 t_read(void) { return TIMFUNC() - LBL_T; }
 
 #else
 extern void t_start(void);
diff --git a/libtommath/logs/README b/libtommath/logs/README
index ea20c81..965e7c8 100644
--- a/libtommath/logs/README
+++ b/libtommath/logs/README
@@ -1,13 +1,13 @@
-To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
-Todo this type 
-
-make timing ; ltmtest
-
-in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
-
-After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
-them all :-)
-
-Have fun
-
+To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
+Todo this type 
+
+make timing ; ltmtest
+
+in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
+
+After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
+them all :-)
+
+Have fun
+
 Tom
 \ No newline at end of file
diff --git a/libtommath/logs/add.log b/libtommath/logs/add.log
index fa11039..43503ac 100644
--- a/libtommath/logs/add.log
+++ b/libtommath/logs/add.log
@@ -1,10 +1,10 @@
-480        88
-960       113
-1440       138
-1920       163
-2400       202
-2880       226
-3360       251
+480        87
+960       111
+1440       135
+1920       159
+2400       200
+2880       224
+3360       248
 3840       272
 4320       296
 4800       320
diff --git a/libtommath/logs/addsub.png b/libtommath/logs/addsub.png
index a5679ac..441c7b2 100644
--- a/libtommath/logs/addsub.png
+++ b/libtommath/logs/addsub.png
diff --git a/libtommath/logs/expt.log b/libtommath/logs/expt.log
index e65e927..70932ab 100644
--- a/libtommath/logs/expt.log
+++ b/libtommath/logs/expt.log
@@ -1,7 +1,7 @@
-513   1499509
-769   3682671
-1025   8098887
-2049  49332743
-2561  89647783
-3073 149440713
-4097 326135364
+513   1435869
+769   3544970
+1025   7791638
+2049  46902238
+2561  85334899
+3073 141451412
+4097 308770310
diff --git a/libtommath/logs/expt.png b/libtommath/logs/expt.png
index 9ee8bb7..d779cc5 100644
--- a/libtommath/logs/expt.png
+++ b/libtommath/logs/expt.png
diff --git a/libtommath/logs/expt_2k.log b/libtommath/logs/expt_2k.log
index d106280..97d325f 100644
--- a/libtommath/logs/expt_2k.log
+++ b/libtommath/logs/expt_2k.log
@@ -1,6 +1,5 @@
-521   1423346
-607   1841305
-1279   8375656
-2203  34104708
-3217  83830729
-4253 167916804
+607   2109225
+1279  10148314
+2203  34126877
+3217  82716424
+4253 161569606
diff --git a/libtommath/logs/expt_2kl.log b/libtommath/logs/expt_2kl.log
new file mode 100644
index 0000000..d9ad4be
--- /dev/null
+++ b/libtommath/logs/expt_2kl.log
@@ -0,0 +1,4 @@
+1024   7705271
+2048  34286851
+4096 165207491
+521   1618631
diff --git a/libtommath/logs/expt_dr.log b/libtommath/logs/expt_dr.log
index 6cfc874..c6bbe07 100644
--- a/libtommath/logs/expt_dr.log
+++ b/libtommath/logs/expt_dr.log
@@ -1,7 +1,7 @@
-532   1803110
-784   3607375
-1036   6089790
-1540  14739797
-2072  33251589
-3080  82794331
-4116 165212734
+532   1928550
+784   3763908
+1036   7564221
+1540  16566059
+2072  32283784
+3080  79851565
+4116 157843530
diff --git a/libtommath/logs/index.html b/libtommath/logs/index.html
index 19fe403..8c1ed9d 100644
--- a/libtommath/logs/index.html
+++ b/libtommath/logs/index.html
@@ -21,4 +21,4 @@
 <hr>
 
 </body>
-</html>
-\ No newline at end of file
+</html>
diff --git a/libtommath/logs/invmod.png b/libtommath/logs/invmod.png
index 0a8a4ad..9dcd7d8 100644
--- a/libtommath/logs/invmod.png
+++ b/libtommath/logs/invmod.png
diff --git a/libtommath/logs/mult.log b/libtommath/logs/mult.log
index 864de46..33563fc 100644
--- a/libtommath/logs/mult.log
+++ b/libtommath/logs/mult.log
@@ -1,143 +1,84 @@
-271       580
-390       861
-511      1177
-630      1598
-749      2115
-871      2670
-991      3276
-1111      3987
-1231      4722
-1351      5474
-1471      6281
-1589      7126
-1710      8114
-1831      8988
-1946     10038
-2071     10995
-2188     12286
-2310     13152
-2430     14480
-2549     15521
-2671     17171
-2790     18081
-2911     19754
-3031     20809
-3150     22849
-3269     23757
-3391     25772
-3508     26832
-3631     29304
-3750     30149
-3865     32581
-3988     33644
-4111     36565
-4231     37309
-4351     40152
-4471     41188
-4590     44658
-4710     45256
-4827     48538
-4951     49490
-5070     53472
-5190     53902
-5308     57619
-5431     58509
-5550     63044
-5664     63333
-5791     67542
-5911     68279
-6028     73477
-6150     73475
-6271     78189
-6390     78842
-6510     84691
-6631     84444
-6751     89721
-6871     90186
-6991     96665
-7111     96119
-7231    101937
-7350    102212
-7471    109439
-7591    108491
-7709    114965
-7829    115025
-7951    123002
-8071    121630
-8190    128725
-8311    128536
-8430    137298
-8550    135568
-8671    143265
-8791    142793
-8911    152432
-9030    150202
-9151    158616
-9271    157848
-9391    168374
-9511    165651
-9627    174775
-9750    173375
-9871    185067
-9985    181845
-10111    191708
-10229    190239
-10351    202585
-10467    198704
-10591    209193
-10711    207322
-10831    220842
-10950    215882
-11071    227761
-11191    225501
-11311    239669
-11430    234809
-11550    243511
-11671    255947
-11791    255243
-11906    267828
-12029    263437
-12149    276571
-12270    275579
-12390    288963
-12510    284001
-12631    298196
-12751    297018
-12869    310848
-12990    305369
-13111    319086
-13230    318940
-13349    333685
-13471    327495
-13588    343678
-13711    341817
-13831    357181
-13948    350440
-14071    367526
-14189    365330
-14311    381551
-14429    374149
-14549    392203
-14670    389764
-14791    406761
-14910    398652
-15026    417718
-15150    414733
-15269    432759
-15390   1037071
-15511   1053454
-15631   1069198
-15748   1086164
-15871   1112820
-15991   1129676
-16111   1145924
-16230   1163016
-16345   1179911
-16471   1197048
-16586   1214352
-16711   1232095
-16829   1249338
-16947   1266987
-17071   1284181
-17188   1302521
-17311   1320539
+271       555
+390       855
+508      1161
+631      1605
+749      2117
+871      2687
+991      3329
+1108      4084
+1231      4786
+1351      5624
+1470      6392
+1586      7364
+1710      8218
+1830      9255
+1951     10217
+2067     11461
+2191     12463
+2308     13677
+2430     14800
+2551     16232
+2671     17460
+2791     18899
+2902     20247
+3028     21902
+3151     23240
+3267     24927
+3391     26441
+3511     28277
+3631     29838
+3749     31751
+3869     33673
+3989     35431
+4111     37518
+4231     39426
+4349     41504
+4471     43567
+4591     45786
+4711     47876
+4831     50299
+4951     52427
+5071     54785
+5189     57241
+5307     59730
+5431     62194
+5551     64761
+5670     67322
+5789     70073
+5907     72663
+6030     75437
+6151     78242
+6268     81202
+6389     83948
+6509     86985
+6631     89903
+6747     93184
+6869     96044
+6991     99286
+7109    102395
+7229    105917
+7351    108940
+7470    112490
+7589    115702
+7711    119508
+7831    122632
+7951    126410
+8071    129808
+8190    133895
+8311    137146
+8431    141218
+8549    144732
+8667    149131
+8790    152462
+8911    156754
+9030    160479
+9149    165138
+9271    168601
+9391    173185
+9511    176988
+9627    181976
+9751    185539
+9870    190388
+9991    194335
+10110    199605
+10228    203298
diff --git a/libtommath/logs/mult.png b/libtommath/logs/mult.png
index 4f7a4ee..d22e8c8 100644
--- a/libtommath/logs/mult.png
+++ b/libtommath/logs/mult.png
diff --git a/libtommath/logs/mult_kara.log b/libtommath/logs/mult_kara.log
index 086feaf..7136c79 100644
--- a/libtommath/logs/mult_kara.log
+++ b/libtommath/logs/mult_kara.log
@@ -1,33 +1,84 @@
-924     16686
-1146     25334
-1371     35304
-1591     47122
-1820     61500
-2044     75254
-2266     91732
-2492    111656
-2716    129428
-2937    147508
-3164    167758
-3388    188248
-3612    210826
-3836    233814
-4059    256898
-4284    280210
-4508    310372
-4731    333902
-4955    376502
-5179    402854
-5404    432004
-5626    459010
-5849    491868
-6076    520550
-6300    547400
-6524    575968
-6747    608482
-6971    642850
-7196    673670
-7419    710680
-7644    743942
-7868    780394
-8092    817342
+271       560
+391       870
+511      1159
+631      1605
+750      2111
+871      2737
+991      3361
+1111      4054
+1231      4778
+1351      5600
+1471      6404
+1591      7323
+1710      8255
+1831      9239
+1948     10257
+2070     11397
+2190     12531
+2308     13665
+2429     14870
+2550     16175
+2671     17539
+2787     18879
+2911     20350
+3031     21807
+3150     23415
+3270     24897
+3388     26567
+3511     28205
+3627     30076
+3751     31744
+3869     33657
+3991     35425
+4111     37522
+4229     39363
+4351     41503
+4470     43491
+4590     45827
+4711     47795
+4828     50166
+4951     52318
+5070     54911
+5191     57036
+5308     58237
+5431     60248
+5551     62678
+5671     64786
+5791     67294
+5908     69343
+6031     71607
+6151     74166
+6271     76590
+6391     78734
+6511     81175
+6631     83742
+6750     86403
+6868     88873
+6990     91150
+7110     94211
+7228     96922
+7351     99445
+7469    102216
+7589    104968
+7711    108113
+7827    110758
+7950    113714
+8071    116511
+8186    119643
+8310    122679
+8425    125581
+8551    128715
+8669    131778
+8788    135116
+8910    138138
+9031    141628
+9148    144754
+9268    148367
+9391    151551
+9511    155033
+9631    158652
+9751    162125
+9871    165248
+9988    168627
+10111    172427
+10231    176412
diff --git a/libtommath/logs/sqr.log b/libtommath/logs/sqr.log
index 0898342..cd29fc5 100644
--- a/libtommath/logs/sqr.log
+++ b/libtommath/logs/sqr.log
@@ -1,143 +1,84 @@
-271       552
-389       883
-510      1191
-629      1572
-750      1996
-863      2428
-991      2891
-1108      3539
-1231      4182
-1351      4980
-1471      5771
-1590      6551
-1711      7313
-1830      8240
-1951      9184
-2070     10087
-2191     11140
-2311     12111
-2431     13219
-2550     14247
-2669     15353
-2791     16446
-2911     17692
-3029     18848
-3151     20028
-3268     21282
-3391     22696
-3511     23971
-3631     25303
-3751     26675
-3871     28245
-3990     29736
-4111     31124
-4229     32714
-4347     34397
-4471     35877
-4587     37269
-4710     39011
-4831     40884
-4950     42501
-5070     44005
-5191     46026
-5310     48168
-5431     49801
-5551     51385
-5671     53604
-5787     55942
-5910     57757
-6031     59391
-6151     61754
-6271     64234
-6390     66110
-6511     67845
-6627     70474
-6751     73113
-6871     75064
-6990     76940
-7111     79681
-7230     82548
-7351     84597
-7471     86507
-7591     89497
-7711    225216
-7831    232192
-7951    239583
-8071    247302
-8191    255497
-8308    261587
-8431    271490
-8550    279492
-8671    286927
-8790    294680
-8910    302974
-9030    311300
-9150    318635
-9271    326740
-9390    335304
-9511    344297
-9630    352056
-9748    358652
-9870    369723
-9991    379119
-10111    386982
-10231    396075
-10349    404396
-10470    415375
-10590    424146
-10711    433390
-10829    442662
-10950    453238
-11071    462178
-11186    469811
-11311    482529
-11431    493214
-11550    503210
-11671    513486
-11791    524244
-11911    535277
-12031    544872
-12151    555695
-12271    566893
-12391    578385
-12510    588658
-12628    596914
-12751    611324
-12871    623437
-12991    633907
-13110    645605
-13231    657684
-13351    670037
-13471    680939
-13591    693047
-13710    705363
-13829    718178
-13949    727930
-14069    739641
-14190    754817
-14310    768192
-14431    779875
-14551    792655
-14667    802847
-14791    819806
-14911    831684
-15031    844936
-15151    858813
-15270    873037
-15387    882123
-15510    899117
-15631    913465
-15750    927989
-15870    940790
-15991    954948
-16110    969483
-16231    984544
-16350    997837
-16470   1012445
-16590   1027834
-16710   1043032
-16831   1056394
-16951   1071408
-17069   1097263
-17191   1113364
-17306   1123650
+265       562
+389       882
+509      1207
+631      1572
+750      1990
+859      2433
+991      2894
+1109      3555
+1230      4228
+1350      5018
+1471      5805
+1591      6579
+1709      7415
+1829      8329
+1949      9225
+2071     10139
+2188     11239
+2309     12178
+2431     13212
+2551     14294
+2671     15551
+2791     16512
+2911     17718
+3030     18876
+3150     20259
+3270     21374
+3391     22650
+3511     23948
+3631     25493
+3750     26756
+3870     28225
+3989     29705
+4110     31409
+4230     32834
+4351     34327
+4471     35818
+4591     37636
+4711     39228
+4830     40868
+4949     42393
+5070     44541
+5191     46269
+5310     48162
+5429     49728
+5548     51985
+5671     53948
+5791     55885
+5910     57584
+6031     60082
+6150     62239
+6270     64309
+6390     66014
+6511     68766
+6631     71012
+6750     73172
+6871     74952
+6991     77909
+7111     80371
+7231     82666
+7351     84531
+7469     87698
+7589     90318
+7711    225384
+7830    232428
+7950    240009
+8070    246522
+8190    253662
+8310    260961
+8431    269253
+8549    275743
+8671    283769
+8789    290811
+8911    300034
+9030    306873
+9149    315085
+9270    323944
+9390    332390
+9508    337519
+9631    348986
+9749    356904
+9871    367013
+9989    373831
+10108    381033
+10230    393475
diff --git a/libtommath/logs/sqr_kara.log b/libtommath/logs/sqr_kara.log
index cafe458..06355a7 100644
--- a/libtommath/logs/sqr_kara.log
+++ b/libtommath/logs/sqr_kara.log
@@ -1,33 +1,84 @@
-922     11272
-1148     16004
-1370     21958
-1596     28684
-1817     37832
-2044     46386
-2262     56218
-2492     66388
-2716     77478
-2940     89380
-3163    103680
-3385    116274
-3612    135334
-3836    151332
-4057    164938
-4284    183178
-4508    198864
-4731    215222
-4954    231986
-5180    251660
-5404    269414
-5626    288454
-5850    307806
-6076    329458
-6299    347726
-6523    369864
-6748    387832
-6971    413010
-7194    453310
-7415    476936
-7643    497118
-7867    521394
-8091    540224
+271       560
+388       878
+511      1179
+629      1625
+751      1988
+871      2423
+989      2896
+1111      3561
+1231      4209
+1350      5015
+1470      5804
+1591      6556
+1709      7420
+1831      8263
+1951      9173
+2070     10153
+2191     11229
+2310     12167
+2431     13211
+2550     14309
+2671     15524
+2788     16525
+2910     17712
+3028     18822
+3148     20220
+3271     21343
+3391     22652
+3511     23944
+3630     25485
+3750     26778
+3868     28201
+3990     29653
+4111     31393
+4225     32841
+4350     34328
+4471     35786
+4590     37652
+4711     39245
+4830     40876
+4951     42433
+5068     44547
+5191     46321
+5311     48140
+5430     49727
+5550     52034
+5671     53954
+5791     55921
+5908     57597
+6031     60084
+6148     62226
+6270     64295
+6390     66045
+6511     68779
+6629     71003
+6751     73169
+6871     74992
+6991     77895
+7110     80376
+7231     82628
+7351     84468
+7470     87664
+7591     90284
+7711     91352
+7828     93995
+7950     96276
+8071     98691
+8190    101256
+8308    103631
+8431    105222
+8550    108343
+8671    110281
+8787    112764
+8911    115397
+9031    117690
+9151    120266
+9271    122715
+9391    124624
+9510    127937
+9630    130313
+9750    132914
+9871    136129
+9991    138517
+10108    141525
+10231    144225
diff --git a/libtommath/logs/sub.log b/libtommath/logs/sub.log
index a42d91e..9f84fa2 100644
--- a/libtommath/logs/sub.log
+++ b/libtommath/logs/sub.log
@@ -1,16 +1,16 @@
-480        87
-960       114
-1440       139
-1920       159
-2400       204
-2880       228
-3360       250
-3840       273
-4320       300
+480        94
+960       116
+1440       140
+1920       164
+2400       205
+2880       229
+3360       253
+3840       277
+4320       299
 4800       321
-5280       348
-5760       370
-6240       393
-6720       420
-7200       444
-7680       466
+5280       345
+5760       371
+6240       395
+6720       419
+7200       441
+7680       465
diff --git a/libtommath/makefile b/libtommath/makefile
index 164a0ab..70de306 100644
--- a/libtommath/makefile
+++ b/libtommath/makefile
@@ -3,12 +3,18 @@
 #Tom St Denis
 
 #version of library 
-VERSION=0.33
+VERSION=0.42.0
 
 CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
 
+ifndef MAKE
+   MAKE=make
+endif
+
+ifndef IGNORE_SPEED
+
 #for speed 
-CFLAGS += -O3 -funroll-all-loops
+CFLAGS += -O3 -funroll-loops
 
 #for size 
 #CFLAGS += -Os
@@ -19,14 +25,28 @@ CFLAGS  += -fomit-frame-pointer
 #debug
 #CFLAGS += -g3
 
-#install as this user
-USER=root
-GROUP=root
+endif
 
-default: libtommath.a
+#install as this user
+ifndef INSTALL_GROUP
+   GROUP=wheel
+else
+   GROUP=$(INSTALL_GROUP)
+endif
+
+ifndef INSTALL_USER
+   USER=root
+else
+   USER=$(INSTALL_USER)
+endif
 
 #default files to install
-LIBNAME=libtommath.a
+ifndef LIBNAME
+   LIBNAME=libtommath.a
+endif
+
+default: ${LIBNAME}
+
 HEADERS=tommath.h tommath_class.h tommath_superclass.h
 
 #LIBPATH-The directory for libtommath to be installed to.
@@ -57,15 +77,17 @@ bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
-bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
 
-libtommath.a:  $(OBJECTS)
-	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
-	ranlib libtommath.a
+$(LIBNAME):  $(OBJECTS)
+	$(AR) $(ARFLAGS) $@ $(OBJECTS)
+	ranlib $@
 
 #make a profiled library (takes a while!!!)
 #
@@ -87,27 +109,27 @@ profiled_single:
 	./ltmtest
 	rm -f *.o ltmtest
 	$(CC) $(CFLAGS) -fbranch-probabilities -DTESTING -c mpi.c -o mpi.o
-	$(AR) $(ARFLAGS) libtommath.a mpi.o
-	ranlib libtommath.a	
+	$(AR) $(ARFLAGS) $(LIBNAME) mpi.o
+	ranlib $(LIBNAME)	
 
-install: libtommath.a
+install: $(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
 	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
 	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
 
-test: libtommath.a demo/demo.o
-	$(CC) $(CFLAGS) demo/demo.o libtommath.a -o test
+test: $(LIBNAME) demo/demo.o
+	$(CC) $(CFLAGS) demo/demo.o $(LIBNAME) -o test
 	
 mtest: test	
 	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest
         
-timing: libtommath.a
-	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest
+timing: $(LIBNAME)
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c $(LIBNAME) -o ltmtest
 
 # makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]
 docdvi: tommath.src
-	cd pics ; make 
+	cd pics ; MAKE=${MAKE} ${MAKE} 
 	echo "hello" > tommath.ind
 	perl booker.pl
 	latex tommath > /dev/null
@@ -124,7 +146,7 @@ poster: poster.tex
 docs:   docdvi
 	dvipdf tommath
 	rm -f tommath.log tommath.aux tommath.dvi tommath.idx tommath.toc tommath.lof tommath.ind tommath.ilg
-	cd pics ; make clean
+	cd pics ; MAKE=${MAKE} ${MAKE} clean
 	
 #LTM user manual
 mandvi: bn.tex
@@ -144,14 +166,21 @@ pretty:
 
 clean:
 	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
-        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.da *.dyn *.dpi tommath.tex `find -type f | grep [~] | xargs` *.lo *.la
+        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.da *.dyn *.dpi tommath.tex `find . -type f | grep [~] | xargs` *.lo *.la
 	rm -rf .libs
-	cd etc ; make clean
-	cd pics ; make clean
+	cd etc ; MAKE=${MAKE} ${MAKE} clean
+	cd pics ; MAKE=${MAKE} ${MAKE} clean
+
+#commit to CVS, then report scratch/dirty files that are not under version control
+no_oops: clean
+	cd .. ; cvs commit 
+	echo Scanning for scratch/dirty files
+	find . -type f | grep -v CVS | xargs -n 1 bash mess.sh
 
 zipup: clean manual poster docs
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; \
 	tar -c libtommath-$(VERSION)/* | bzip2 -9vvc > ltm-$(VERSION).tar.bz2 ; \
-	zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
+	zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/* ; \
+	mv -f ltm* ~ ; rm -rf libtommath-$(VERSION)
diff --git a/libtommath/makefile.bcc b/libtommath/makefile.bcc
index 775e9ff..67743d9 100644
--- a/libtommath/makefile.bcc
+++ b/libtommath/makefile.bcc
@@ -27,16 +27,18 @@ bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj \
 bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
 bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
+bn_mp_reduce_2k_l.obj bn_mp_reduce_is_2k_l.obj bn_mp_reduce_2k_setup_l.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
 bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
 bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
-bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj
+bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj \
+bn_mp_to_signed_bin_n.obj bn_mp_to_unsigned_bin_n.obj
 
 TARGET = libtommath.lib
 
 $(TARGET): $(OBJECTS)
 
-.c.objbjbjbj:
+.c.obj:
 	$(CC) $(CFLAGS) $<
 	$(LIB) $(TARGET) -+$@
diff --git a/libtommath/makefile.cygwin_dll b/libtommath/makefile.cygwin_dll
index c90e5d9..85b10c7 100644
--- a/libtommath/makefile.cygwin_dll
+++ b/libtommath/makefile.cygwin_dll
@@ -32,11 +32,13 @@ bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
-bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
 
 # make a Windows DLL via Cygwin
 windll:  $(OBJECTS)
diff --git a/libtommath/makefile.icc b/libtommath/makefile.icc
index 3775b20..cf70ab0 100644
--- a/libtommath/makefile.icc
+++ b/libtommath/makefile.icc
@@ -19,7 +19,7 @@ CFLAGS  +=  -I./
 #   B - Blend of P4 and PM [mobile]
 #
 # Default to just generic max opts
-CFLAGS += -O3 -xN
+CFLAGS += -O3 -xP -ip
 
 #install as this user
 USER=root
@@ -59,11 +59,13 @@ bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
-bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
 
 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
diff --git a/libtommath/makefile.msvc b/libtommath/makefile.msvc
index cf59943..5edebec 100644
--- a/libtommath/makefile.msvc
+++ b/libtommath/makefile.msvc
@@ -2,7 +2,7 @@
 #
 #Tom St Denis
 
-CFLAGS = /I. /Ox /DWIN32 /W4
+CFLAGS = /I. /Ox /DWIN32 /W3 /Fo$@
 
 default: library
 
@@ -26,11 +26,15 @@ bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj \
 bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
 bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
+bn_mp_reduce_2k_l.obj bn_mp_reduce_is_2k_l.obj bn_mp_reduce_2k_setup_l.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
 bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
 bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
-bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj
+bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj \
+bn_mp_to_signed_bin_n.obj bn_mp_to_unsigned_bin_n.obj
+
+HEADERS=tommath.h tommath_class.h tommath_superclass.h
 
 library: $(OBJECTS)
 	lib /out:tommath.lib $(OBJECTS)
diff --git a/libtommath/makefile.shared b/libtommath/makefile.shared
index 86a3786..f17bbbd 100644
--- a/libtommath/makefile.shared
+++ b/libtommath/makefile.shared
@@ -1,11 +1,14 @@
 #Makefile for GCC
 #
 #Tom St Denis
-VERSION=0:33
+VERSION=0:41
+
+CC = libtool --mode=compile --tag=CC gcc
 
-CC = libtool --mode=compile gcc
 CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
 
+ifndef IGNORE_SPEED
+
 #for speed 
 CFLAGS += -O3 -funroll-loops
 
@@ -15,14 +18,30 @@ CFLAGS += -O3 -funroll-loops
 #x86 optimizations [should be valid for any GCC install though]
 CFLAGS  += -fomit-frame-pointer
 
+endif
+
 #install as this user
-USER=root
-GROUP=root
+ifndef INSTALL_GROUP
+   GROUP=wheel
+else
+   GROUP=$(INSTALL_GROUP)
+endif
+
+ifndef INSTALL_USER
+   USER=root
+else
+   USER=$(INSTALL_USER)
+endif
 
 default: libtommath.la
 
 #default files to install
-LIBNAME=libtommath.la
+ifndef LIBNAME
+   LIBNAME=libtommath.la
+endif
+ifndef LIBNAME_S
+   LIBNAME_S=libtommath.a
+endif
 HEADERS=tommath.h tommath_class.h tommath_superclass.h
 
 #LIBPATH-The directory for libtommath to be installed to.
@@ -53,25 +72,31 @@ bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
-bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
+
+objs: $(OBJECTS)
+
+$(LIBNAME):  $(OBJECTS)
+	libtool --mode=link gcc *.lo -o $(LIBNAME) -rpath $(LIBPATH) -version-info $(VERSION)
 
-libtommath.la:  $(OBJECTS)
-	libtool --mode=link gcc *.lo -o libtommath.la -rpath $(LIBPATH) -version-info $(VERSION)
-	libtool --mode=link gcc *.o -o libtommath.a 
-	libtool --mode=install install -c libtommath.la $(LIBPATH)/libtommath.la
+install: $(LIBNAME)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
+	libtool --mode=install install -c $(LIBNAME) $(DESTDIR)$(LIBPATH)/$(LIBNAME)
 	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
 	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
 
-test: libtommath.a demo/demo.o
+test: $(LIBNAME) demo/demo.o
 	gcc $(CFLAGS) -c demo/demo.c -o demo/demo.o
-	libtool --mode=link gcc -o test demo/demo.o libtommath.la
+	libtool --mode=link gcc -o test demo/demo.o $(LIBNAME_S)
 	
 mtest: test	
-	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
+	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest
         
-timing: libtommath.la
-	gcc $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest -s
+timing: $(LIBNAME)
+	gcc $(CFLAGS) -DTIMER demo/timing.c $(LIBNAME_S) -o ltmtest
diff --git a/libtommath/mess.sh b/libtommath/mess.sh
new file mode 100644
index 0000000..bf639ce
--- /dev/null
+++ b/libtommath/mess.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+if cvs log $1 >/dev/null 2>/dev/null; then exit 0; else echo "$1 shouldn't be here" ; exit 1; fi
+
+
diff --git a/libtommath/mtest/logtab.h b/libtommath/mtest/logtab.h
index 68462bd..addd3ab 100644
--- a/libtommath/mtest/logtab.h
+++ b/libtommath/mtest/logtab.h
@@ -17,4 +17,3 @@ const float s_logv_2[] = {
    0.169293808, 0.168613099, 0.167948779, 0.167300179, 	/* 60 61 62 63 */
    0.166666667
 };
-
diff --git a/libtommath/mtest/mpi-config.h b/libtommath/mtest/mpi-config.h
index dcdbd35..a347263 100644
--- a/libtommath/mtest/mpi-config.h
+++ b/libtommath/mtest/mpi-config.h
@@ -1,5 +1,4 @@
 /* Default configuration for MPI library */
-/* $Id: mpi-config.h,v 1.1.1.1 2005/01/19 22:41:29 kennykb Exp $ */
 
 #ifndef MPI_CONFIG_H_
 #define MPI_CONFIG_H_
diff --git a/libtommath/mtest/mpi-types.h b/libtommath/mtest/mpi-types.h
index e097188..42ccfc3 100644
--- a/libtommath/mtest/mpi-types.h
+++ b/libtommath/mtest/mpi-types.h
@@ -13,4 +13,3 @@ typedef int                mp_err;
 #define MP_DIGIT_SIZE      2
 #define DIGIT_FMT          "%04X"
 #define RADIX              (MP_DIGIT_MAX+1)
-
diff --git a/libtommath/mtest/mpi.c b/libtommath/mtest/mpi.c
index 0517602..4566e89 100644
--- a/libtommath/mtest/mpi.c
+++ b/libtommath/mtest/mpi.c
@@ -5,8 +5,6 @@
     Copyright (C) 1998 Michael J. Fromberger, All Rights Reserved
 
     Arbitrary precision integer arithmetic library
-
-    $Id: mpi.c,v 1.1.1.1 2005/01/19 22:41:29 kennykb Exp $
  */
 
 #include "mpi.h"
@@ -91,7 +89,7 @@ static unsigned int s_mp_defprec = MP_DEFPREC;
 /* {{{ Constant strings */
 
 /* Constant strings returned by mp_strerror() */
-static const char *mp_err_string[] = {
+static const char *const mp_err_string[] = {
   "unknown result code",     /* say what?            */
   "boolean true",            /* MP_OKAY, MP_YES      */
   "boolean false",           /* MP_NO                */
diff --git a/libtommath/mtest/mpi.h b/libtommath/mtest/mpi.h
index b7a8cb5..211421f 100644
--- a/libtommath/mtest/mpi.h
+++ b/libtommath/mtest/mpi.h
@@ -5,8 +5,6 @@
     Copyright (C) 1998 Michael J. Fromberger, All Rights Reserved
 
     Arbitrary precision integer arithmetic library
-
-    $Id: mpi.h,v 1.1.1.1 2005/01/19 22:41:29 kennykb Exp $
  */
 
 #ifndef _H_MPI_
diff --git a/libtommath/pics/expt_state.tif b/libtommath/pics/expt_state.tif
index cb06e8e..0aaee39 100644
--- a/libtommath/pics/expt_state.tif
+++ b/libtommath/pics/expt_state.tif
diff --git a/libtommath/pics/primality.tif b/libtommath/pics/primality.tif
index 76d6be3..83aafe0 100644
--- a/libtommath/pics/primality.tif
+++ b/libtommath/pics/primality.tif
diff --git a/libtommath/poster.pdf b/libtommath/poster.pdf
index e0b4f84..1f705cf 100644
--- a/libtommath/poster.pdf
+++ b/libtommath/poster.pdf
diff --git a/libtommath/pre_gen/mpi.c b/libtommath/pre_gen/mpi.c
index 7d832e7..d2224c0 100644
--- a/libtommath/pre_gen/mpi.c
+++ b/libtommath/pre_gen/mpi.c
@@ -13,7 +13,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 static const struct {
@@ -60,7 +60,7 @@ char *mp_error_to_string(int code)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes the modular inverse via binary extended euclidean algorithm, 
@@ -69,8 +69,7 @@ char *mp_error_to_string(int code)
  * Based on slow invmod except this is optimized for the case where b is 
  * odd as per HAC Note 14.64 on pp. 610
  */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
   mp_int  x, y, u, v, B, D;
   int     res, neg;
@@ -91,7 +90,7 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
     goto LBL_ERR;
   }
 
@@ -209,7 +208,7 @@ LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes xR**-1 == x (mod N) via Montgomery Reduction
@@ -220,8 +219,7 @@ LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
  *
  * Based on Algorithm 14.32 on pp.601 of HAC.
 */
-int
-fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
   int     ix, res, olduse;
   mp_word W[MP_WARRAY];
@@ -382,7 +380,7 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Fast (comba) multiplier
@@ -401,8 +399,7 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
  * Based on Algorithm 14.12 on pp.595 of HAC.
  *
  */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -433,7 +430,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       tmpx = a->dp + tx;
       tmpy = b->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -441,6 +438,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       /* execute loop */
       for (iz = 0; iz < iy; ++iz) {
          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+
       }
 
       /* store term */
@@ -448,19 +446,16 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
       /* make next carry */
       _W = _W >> ((mp_word)DIGIT_BIT);
-  }
-
-  /* store final carry */
-  W[ix] = _W;
+ }
 
   /* setup dest */
   olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
 
   {
     register mp_digit *tmpc;
     tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
@@ -492,7 +487,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* this is a modified version of fast_s_mul_digs that only produces
@@ -504,8 +499,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  *
  * Based on Algorithm 14.12 on pp.595 of HAC.
  */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix, iz;
   mp_digit W[MP_WARRAY];
@@ -551,9 +545,6 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       _W = _W >> ((mp_word)DIGIT_BIT);
   }
   
-  /* store final carry */
-  W[ix] = _W;
-
   /* setup dest */
   olduse  = c->used;
   c->used = pa;
@@ -562,7 +553,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     register mp_digit *tmpc;
 
     tmpc = c->dp + digs;
-    for (ix = digs; ix <= pa; ix++) {
+    for (ix = digs; ix < pa; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
@@ -594,36 +585,17 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
 
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
 
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -658,7 +630,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -683,7 +655,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
       }
 
       /* store it */
-      W[ix] = _W;
+      W[ix] = (mp_digit)(_W & MP_MASK);
 
       /* make next carry */
       W1 = _W >> ((mp_word)DIGIT_BIT);
@@ -727,7 +699,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes a = 2**b 
@@ -775,7 +747,7 @@ mp_2expt (mp_int * a, int b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* b = |a| 
@@ -818,7 +790,7 @@ mp_abs (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* high level addition (handles signs) */
@@ -871,7 +843,7 @@ int mp_add (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* single digit addition */
@@ -899,6 +871,9 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
      /* fix sign  */
      a->sign = c->sign = MP_NEG;
 
+     /* clamp */
+     mp_clamp(c);
+
      return res;
   }
 
@@ -980,7 +955,7 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* d = a + b (mod c) */
@@ -1021,7 +996,7 @@ mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* AND two ints together */
@@ -1078,7 +1053,7 @@ mp_and (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* trim unused digits 
@@ -1122,7 +1097,7 @@ mp_clamp (mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* clear one (frees)  */
@@ -1166,7 +1141,7 @@ mp_clear (mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 #include <stdarg.h>
 
@@ -1200,7 +1175,7 @@ void mp_clear_multi(mp_int *mp, ...)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* compare two ints (signed)*/
@@ -1243,7 +1218,7 @@ mp_cmp (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* compare a digit */
@@ -1287,7 +1262,7 @@ int mp_cmp_d(mp_int * a, mp_digit b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* compare maginitude of two ints (unsigned) */
@@ -1342,7 +1317,7 @@ int mp_cmp_mag (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 static const int lnz[16] = { 
@@ -1395,7 +1370,7 @@ int mp_cnt_lsb(mp_int *a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* copy, b = a */
@@ -1463,7 +1438,7 @@ mp_copy (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* returns the number of bits in an int */
@@ -1508,7 +1483,7 @@ mp_count_bits (mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 #ifdef BN_MP_DIV_SMALL
@@ -1800,7 +1775,7 @@ LBL_Q:mp_clear (&q);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* b = a/2 */
@@ -1868,7 +1843,7 @@ int mp_div_2(mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
@@ -1965,7 +1940,7 @@ int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* divide by three (based on routine from MPI and the GMP manual) */
@@ -2044,14 +2019,19 @@ mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 static int s_is_power_of_two(mp_digit b, int *p)
 {
    int x;
 
-   for (x = 1; x < DIGIT_BIT; x++) {
+   /* fast return if not a power of two */
+   if ((b==0) || (b & (b-1))) {
+      return 0;
+   }
+
+   for (x = 0; x < DIGIT_BIT; x++) {
       if (b == (((mp_digit)1)<<x)) {
          *p = x;
          return 1;
@@ -2154,7 +2134,7 @@ int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines if a number is a valid DR modulus */
@@ -2197,7 +2177,7 @@ int mp_dr_is_modulus(mp_int *a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
@@ -2291,7 +2271,7 @@ top:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines the setup value */
@@ -2323,7 +2303,7 @@ void mp_dr_setup(mp_int *a, mp_digit *d)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* swap the elements of two integers, for cases where you can't simply swap the 
@@ -2357,7 +2337,7 @@ mp_exch (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* calculate c = a**b  using a square-multiply algorithm */
@@ -2414,7 +2394,7 @@ int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 
@@ -2467,21 +2447,29 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 #endif
   }
 
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
+
 #ifdef BN_MP_DR_IS_MODULUS_C
   /* is it a DR modulus? */
   dr = mp_dr_is_modulus(P);
 #else
+  /* default to no */
   dr = 0;
 #endif
 
 #ifdef BN_MP_REDUCE_IS_2K_C
-  /* if not, is it a uDR modulus? */
+  /* if not, is it an unrestricted DR modulus? */
   if (dr == 0) {
      dr = mp_reduce_is_2k(P) << 1;
   }
 #endif
     
-  /* if the modulus is odd or dr != 0 use the fast method */
+  /* if the modulus is odd or dr != 0 use the montgomery method */
 #ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == 1 || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
@@ -2489,7 +2477,7 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 #endif
 #ifdef BN_S_MP_EXPTMOD_C
     /* otherwise use the generic Barrett reduction technique */
-    return s_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y, 0);
 #else
     /* no exptmod for evens */
     return MP_VAL;
@@ -2518,7 +2506,7 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
@@ -2535,8 +2523,7 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
    #define TAB_SIZE 256
 #endif
 
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
@@ -2822,7 +2809,6 @@ LBL_M:
 }
 #endif
 
-
 /* End: bn_mp_exptmod_fast.c */
 
 /* Start: bn_mp_exteuclid.c */
@@ -2840,7 +2826,7 @@ LBL_M:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Extended euclidean algorithm of (a, b) produces 
@@ -2887,6 +2873,13 @@ int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
        if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
    }
 
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
    /* copy result out */
    if (U1 != NULL) { mp_exch(U1, &u1); }
    if (U2 != NULL) { mp_exch(U2, &u2); }
@@ -2915,7 +2908,7 @@ _ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* read a bigint from a file stream in ASCII */
@@ -2982,7 +2975,7 @@ int mp_fread(mp_int *a, int radix, FILE *stream)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 int mp_fwrite(mp_int *a, int radix, FILE *stream)
@@ -3034,7 +3027,7 @@ int mp_fwrite(mp_int *a, int radix, FILE *stream)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Greatest Common Divisor using the binary method */
@@ -3044,21 +3037,13 @@ int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   int     k, u_lsb, v_lsb, res;
 
   /* either zero than gcd is the largest */
-  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
+  if (mp_iszero (a) == MP_YES) {
     return mp_abs (b, c);
   }
-  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
+  if (mp_iszero (b) == MP_YES) {
     return mp_abs (a, c);
   }
 
-  /* optimized.  At this point if a == 0 then
-   * b must equal zero too
-   */
-  if (mp_iszero (a) == 1) {
-    mp_zero(c);
-    return MP_OKAY;
-  }
-
   /* get copies of a and b we can modify */
   if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
     return res;
@@ -3147,7 +3132,7 @@ LBL_U:mp_clear (&v);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* get the lower 32-bits of an mp_int */
@@ -3192,7 +3177,7 @@ unsigned long mp_get_int(mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* grow as required */
@@ -3249,7 +3234,7 @@ int mp_grow (mp_int * a, int size)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* init a new mp_int */
@@ -3295,7 +3280,7 @@ int mp_init (mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* creates "a" then copies b into it */
@@ -3327,7 +3312,7 @@ int mp_init_copy (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 #include <stdarg.h>
 
@@ -3386,7 +3371,7 @@ int mp_init_multi(mp_int *mp, ...)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* initialize and set a digit */
@@ -3418,7 +3403,7 @@ int mp_init_set (mp_int * a, mp_digit b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* initialize and set a digit */
@@ -3449,7 +3434,7 @@ int mp_init_set_int (mp_int * a, unsigned long b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* init an mp_init for a given size */
@@ -3497,7 +3482,7 @@ int mp_init_size (mp_int * a, int size)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* hac 14.61, pp608 */
@@ -3540,7 +3525,7 @@ int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* hac 14.61, pp608 */
@@ -3561,8 +3546,8 @@ int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto LBL_ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
     goto LBL_ERR;
@@ -3715,7 +3700,7 @@ LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Check if remainders are possible squares - fast exclude non-squares */
@@ -3824,7 +3809,7 @@ ERR:mp_clear(&t);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes the jacobi c = (a | n) (or Legendre if n is prime)
@@ -3929,7 +3914,7 @@ LBL_A1:mp_clear (&a1);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* c = |a| * |b| using Karatsuba Multiplication using 
@@ -3943,12 +3928,12 @@ LBL_A1:mp_clear (&a1);
  * b = b1 * B**n + b0
  *
  * Then, a * b => 
-   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+   a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B + a0b0
  *
  * Note that a1b1 and a0b0 are used twice and only need to be 
  * computed once.  So in total three half size (half # of 
  * digit) multiplications are performed, a0b0, a1b1 and 
- * (a1-b1)(a0-b0)
+ * (a1+b1)(a0+b0)
  *
  * Note that a multiplication of half the digits requires
  * 1/4th the number of single precision multiplications so in 
@@ -4039,19 +4024,19 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
     goto X1Y1;          /* x1y1 = x1*y1 */
 
-  /* now calc x1-x0 and y1-y0 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+  /* now calc x1+x0 and y1+y0 */
+  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
     goto X1Y1;          /* t1 = x1 - x0 */
-  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+  if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
     goto X1Y1;          /* t2 = y1 - y0 */
   if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+    goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
 
   /* add x0y0 */
   if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
     goto X1Y1;          /* t2 = x0y0 + x1y1 */
-  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+  if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
@@ -4096,7 +4081,7 @@ ERR:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Karatsuba squaring, computes b = a*a using three 
@@ -4164,8 +4149,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b)
   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
     goto X1X1;           /* x1x1 = x1*x1 */
 
-  /* now calc (x1-x0)**2 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+  /* now calc (x1+x0)**2 */
+  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
     goto X1X1;           /* t1 = x1 - x0 */
   if (mp_sqr (&t1, &t1) != MP_OKAY)
     goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
@@ -4173,8 +4158,8 @@ int mp_karatsuba_sqr (mp_int * a, mp_int * b)
   /* add x0y0 */
   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
     goto X1X1;           /* t2 = x0x0 + x1x1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+  if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
@@ -4217,7 +4202,7 @@ ERR:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes least common multiple as |a*b|/(a, b) */
@@ -4277,7 +4262,7 @@ LBL_T:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shift left a certain amount of digits */
@@ -4344,7 +4329,7 @@ int mp_lshd (mp_int * a, int b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* c = a mod b, 0 <= c < b */
@@ -4392,7 +4377,7 @@ mp_mod (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* calc a value mod 2**b */
@@ -4447,7 +4432,7 @@ mp_mod_2d (mp_int * a, int b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 int
@@ -4474,7 +4459,7 @@ mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /*
@@ -4490,7 +4475,6 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-
   if (b->used > 1) {
      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
         return res;
@@ -4534,7 +4518,7 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
@@ -4652,7 +4636,7 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* setups the montgomery reduction stuff */
@@ -4688,7 +4672,7 @@ mp_montgomery_setup (mp_int * n, mp_digit * rho)
 #endif
 
   /* rho = -1/m mod b */
-  *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
+  *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
 
   return MP_OKAY;
 }
@@ -4711,7 +4695,7 @@ mp_montgomery_setup (mp_int * n, mp_digit * rho)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* high level multiplication (handles sign) */
@@ -4777,7 +4761,7 @@ int mp_mul (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* b = a*2 */
@@ -4859,7 +4843,7 @@ int mp_mul_2(mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shift left by a certain bit count */
@@ -4944,7 +4928,7 @@ int mp_mul_2d (mp_int * a, int b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* multiply by a digit */
@@ -4989,8 +4973,9 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
   }
 
-  /* store final carry [if any] */
+  /* store final carry [if any] and increment ix offset  */
   *tmpc++ = u;
+  ++ix;
 
   /* now zero digits above the top */
   while (ix++ < olduse) {
@@ -5022,12 +5007,11 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* d = a * b (mod c) */
-int
-mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
   int     res;
   mp_int  t;
@@ -5063,7 +5047,7 @@ mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* find the n'th root of an integer 
@@ -5195,19 +5179,25 @@ LBL_T1:mp_clear (&t1);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* b = -a */
 int mp_neg (mp_int * a, mp_int * b)
 {
   int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
   }
+  /* a non-zero b gets the opposite sign of a; a zero b is forced to MP_ZPOS */
   if (mp_iszero(b) != MP_YES) {
      b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  } else {
+     b->sign = MP_ZPOS;
   }
+
   return MP_OKAY;
 }
 #endif
@@ -5229,7 +5219,7 @@ int mp_neg (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* OR two ints together */
@@ -5279,7 +5269,7 @@ int mp_or (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* performs one Fermat test.
@@ -5341,7 +5331,7 @@ LBL_T:mp_clear (&t);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines if an integers is divisible by one 
@@ -5391,7 +5381,7 @@ int mp_prime_is_divisible (mp_int * a, int *result)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* performs a variable number of rounds of Miller-Rabin
@@ -5474,7 +5464,7 @@ LBL_B:mp_clear (&b);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Miller-Rabin test of "a" to the base of "b" as described in 
@@ -5577,7 +5567,7 @@ LBL_N1:mp_clear (&n1);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* finds the next prime after the number "a" using "t" trials
@@ -5708,7 +5698,7 @@ int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
 
       /* is this prime? */
       for (x = 0; x < t; x++) {
-          mp_set(&b, ltm_prime_tab[t]);
+          mp_set(&b, ltm_prime_tab[x]);
           if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
              goto LBL_ERR;
           }
@@ -5747,7 +5737,7 @@ LBL_ERR:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 
@@ -5799,7 +5789,7 @@ int mp_prime_rabin_miller_trials(int size)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* makes a truly random prime of a given size (bits),
@@ -5847,15 +5837,13 @@ int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback
 
    /* calc the maskOR_msb */
    maskOR_msb        = 0;
-   maskOR_msb_offset = (size - 2) >> 3;
+   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
    if (flags & LTM_PRIME_2MSB_ON) {
-      maskOR_msb     |= 1 << ((size - 2) & 7);
-   } else if (flags & LTM_PRIME_2MSB_OFF) {
-      maskAND        &= ~(1 << ((size - 2) & 7));
-   } 
+      maskOR_msb       |= 0x80 >> ((9 - size) & 7);
+   }  
 
    /* get the maskOR_lsb */
-   maskOR_lsb         = 0;
+   maskOR_lsb         = 1;
    if (flags & LTM_PRIME_BBS) {
       maskOR_lsb     |= 3;
    }
@@ -5926,7 +5914,7 @@ error:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* returns size of ASCII reprensentation */
@@ -5949,22 +5937,29 @@ int mp_radix_size (mp_int * a, int radix, int *size)
     return MP_VAL;
   }
 
-  /* init a copy of the input */
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
+  if (mp_iszero(a) == MP_YES) {
+    *size = 2;
+    return MP_OKAY;
   }
 
   /* digs is the digit count */
   digs = 0;
 
   /* if it's negative add one for the sign */
-  if (t.sign == MP_NEG) {
+  if (a->sign == MP_NEG) {
     ++digs;
-    t.sign = MP_ZPOS;
   }
 
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
+
   /* fetch out all of the digits */
-  while (mp_iszero (&t) == 0) {
+  while (mp_iszero (&t) == MP_NO) {
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
@@ -5997,7 +5992,7 @@ int mp_radix_size (mp_int * a, int radix, int *size)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* chars used in radix conversions */
@@ -6021,7 +6016,7 @@ const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrs
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* makes a pseudo-random int of a given size */
@@ -6038,14 +6033,14 @@ mp_rand (mp_int * a, int digits)
 
   /* first place a random non-zero digit */
   do {
-    d = ((mp_digit) abs (rand ()));
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
   } while (d == 0);
 
   if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
     return res;
   }
 
-  while (digits-- > 0) {
+  while (--digits > 0) {
     if ((res = mp_lshd (a, 1)) != MP_OKAY) {
       return res;
     }
@@ -6076,15 +6071,18 @@ mp_rand (mp_int * a, int digits)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* read a string [ASCII] in a given radix */
-int mp_read_radix (mp_int * a, char *str, int radix)
+int mp_read_radix (mp_int * a, const char *str, int radix)
 {
   int     y, res, neg;
   char    ch;
 
+  /* zero the digit bignum */
+  mp_zero(a);
+
   /* make sure the radix is ok */
   if (radix < 2 || radix > 64) {
     return MP_VAL;
@@ -6158,12 +6156,11 @@ int mp_read_radix (mp_int * a, char *str, int radix)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* read signed bin, big endian, first byte is 0==positive or 1==negative */
-int
-mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+int mp_read_signed_bin (mp_int * a, const unsigned char *b, int c)
 {
   int     res;
 
@@ -6200,12 +6197,11 @@ mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reads a unsigned char array, assumes the msb is stored first [big endian] */
-int
-mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
 {
   int     res;
 
@@ -6256,15 +6252,14 @@ mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reduces x mod m, assumes 0 < x < m**2, mu is 
  * precomputed via mp_reduce_setup.
  * From HAC pp.604 Algorithm 14.42
  */
-int
-mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
 {
   mp_int  q;
   int     res, um = m->used;
@@ -6284,11 +6279,11 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
     }
   } else {
 #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #else 
@@ -6357,12 +6352,11 @@ CLEANUP:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reduces a modulo n where n is of the form 2**p - d */
-int
-mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
 {
    mp_int q;
    int    p, res;
@@ -6404,6 +6398,68 @@ ERR:
 
 /* End: bn_mp_reduce_2k.c */
 
+/* Start: bn_mp_reduce_2k_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* reduces a modulo n where n is of the form 2**p - d.
+   This differs from reduce_2k since "d" can be larger
+   than a single digit.  Returns MP_OKAY or the first error hit.
+*/
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
+{
+   mp_int q;
+   int    p, res;
+
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+
+   p = mp_count_bits(n);
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   /* q = q * d */
+   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   /* |a| >= |n|: subtract n (propagating any failure) and fold again */
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      if ((res = s_mp_sub(a, n, a)) != MP_OKAY) { goto ERR; }
+      goto top;
+   }
+
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
+#endif
+
+/* End: bn_mp_reduce_2k_l.c */
+
 /* Start: bn_mp_reduce_2k_setup.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_2K_SETUP_C
@@ -6419,12 +6475,11 @@ ERR:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines the setup value */
-int 
-mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 {
    int res, p;
    mp_int tmp;
@@ -6452,6 +6507,50 @@ mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 
 /* End: bn_mp_reduce_2k_setup.c */
 
+/* Start: bn_mp_reduce_2k_setup_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* determines the setup value "d" that mp_reduce_2k_l expects for modulus "a" */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
+{
+   mp_int pow2;
+   int    err;
+
+   /* scratch integer, released before every return past this point */
+   err = mp_init(&pow2);
+   if (err != MP_OKAY) {
+      return err;
+   }
+
+   /* pow2 = mp_2expt(bit count of a); only on success: d = s_mp_sub(pow2, a) */
+   err = mp_2expt(&pow2, mp_count_bits(a));
+   if (err == MP_OKAY) {
+      err = s_mp_sub(&pow2, a, d);
+   }
+
+   /* common exit for success and failure: free the scratch, report status */
+   mp_clear(&pow2);
+   return err;
+}
+#endif
+
+/* End: bn_mp_reduce_2k_setup_l.c */
+
 /* Start: bn_mp_reduce_is_2k.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_IS_2K_C
@@ -6467,7 +6566,7 @@ mp_reduce_2k_setup(mp_int *a, mp_digit *d)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* determines if mp_reduce_2k can be used */
@@ -6477,9 +6576,9 @@ int mp_reduce_is_2k(mp_int *a)
    mp_digit iz;
    
    if (a->used == 0) {
-      return 0;
+      return MP_NO;
    } else if (a->used == 1) {
-      return 1;
+      return MP_YES;
    } else if (a->used > 1) {
       iy = mp_count_bits(a);
       iz = 1;
@@ -6488,7 +6587,7 @@ int mp_reduce_is_2k(mp_int *a)
       /* Test every bit from the second digit up, must be 1 */
       for (ix = DIGIT_BIT; ix < iy; ix++) {
           if ((a->dp[iw] & iz) == 0) {
-             return 0;
+             return MP_NO;
           }
           iz <<= 1;
           if (iz > (mp_digit)MP_MASK) {
@@ -6497,13 +6596,57 @@ int mp_reduce_is_2k(mp_int *a)
           }
       }
    }
-   return 1;
+   return MP_YES;
 }
 
 #endif
 
 /* End: bn_mp_reduce_is_2k.c */
 
+/* Start: bn_mp_reduce_is_2k_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* determines if reduce_2k_l can be used */
+int mp_reduce_is_2k_l(mp_int *a)
+{
+   int ix, iy;
+   
+   if (a->used == 0) {
+      return MP_NO;
+   } else if (a->used == 1) {
+      return MP_YES;
+   } else if (a->used > 1) {
+      /* if at least used/2 (rounded down) digits are all ones (MP_MASK) we're sold */
+      for (iy = ix = 0; ix < a->used; ix++) {
+          if (a->dp[ix] == MP_MASK) {
+              ++iy;
+          }
+      }
+      return (iy >= (a->used/2)) ? MP_YES : MP_NO;
+      
+   }
+   return MP_NO;
+}
+
+#endif
+
+/* End: bn_mp_reduce_is_2k_l.c */
+
 /* Start: bn_mp_reduce_setup.c */
 #include <tommath.h>
 #ifdef BN_MP_REDUCE_SETUP_C
@@ -6519,7 +6662,7 @@ int mp_reduce_is_2k(mp_int *a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* pre-calculate the value required for Barrett reduction
@@ -6553,7 +6696,7 @@ int mp_reduce_setup (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shift right a certain amount of digits */
@@ -6625,7 +6768,7 @@ void mp_rshd (mp_int * a, int b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* set to a digit */
@@ -6654,7 +6797,7 @@ void mp_set (mp_int * a, mp_digit b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* set a 32-bit const */
@@ -6702,19 +6845,24 @@ int mp_set_int (mp_int * a, unsigned long b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* shrink a bignum */
 int mp_shrink (mp_int * a)
 {
   mp_digit *tmp;
-  if (a->alloc != a->used && a->used > 0) {
-    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+  int used = 1;
+  
+  if(a->used > 0)
+    used = a->used;
+  
+  if (a->alloc != used) {
+    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * used)) == NULL) {
       return MP_MEM;
     }
     a->dp    = tmp;
-    a->alloc = a->used;
+    a->alloc = used;
   }
   return MP_OKAY;
 }
@@ -6737,7 +6885,7 @@ int mp_shrink (mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* get the size for an signed equivalent */
@@ -6764,7 +6912,7 @@ int mp_signed_bin_size (mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* computes b = a*a */
@@ -6822,7 +6970,7 @@ if (a->used >= KARATSUBA_SQR_CUTOFF) {
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* c = a * a (mod b) */
@@ -6850,6 +6998,7 @@ mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
 
 /* Start: bn_mp_sqrt.c */
 #include <tommath.h>
+
 #ifdef BN_MP_SQRT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -6863,7 +7012,7 @@ mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* this function is less generic than mp_n_root, simpler and faster */
@@ -6944,7 +7093,7 @@ E2: mp_clear(&t1);
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* high level subtraction (handles signs) */
@@ -7003,7 +7152,7 @@ mp_sub (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* single digit subtraction */
@@ -7027,6 +7176,10 @@ mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
      a->sign = MP_ZPOS;
      res     = mp_add_d(a, b, c);
      a->sign = c->sign = MP_NEG;
+
+     /* clamp */
+     mp_clamp(c);
+
      return res;
   }
 
@@ -7092,7 +7245,7 @@ mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* d = a - b (mod c) */
@@ -7134,12 +7287,11 @@ mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* store in signed [big endian] format */
-int
-mp_to_signed_bin (mp_int * a, unsigned char *b)
+int mp_to_signed_bin (mp_int * a, unsigned char *b)
 {
   int     res;
 
@@ -7153,6 +7305,37 @@ mp_to_signed_bin (mp_int * a, unsigned char *b)
 
 /* End: bn_mp_to_signed_bin.c */
 
+/* Start: bn_mp_to_signed_bin_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* store in signed [big endian] format; *outlen = buffer size in, mp_signed_bin_size(a) out, MP_VAL if too small */
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_signed_bin_size(a)) {
+      return MP_VAL;
+   }
+   *outlen = mp_signed_bin_size(a);
+   return mp_to_signed_bin(a, b);
+}
+#endif
+
+/* End: bn_mp_to_signed_bin_n.c */
+
 /* Start: bn_mp_to_unsigned_bin.c */
 #include <tommath.h>
 #ifdef BN_MP_TO_UNSIGNED_BIN_C
@@ -7168,12 +7351,11 @@ mp_to_signed_bin (mp_int * a, unsigned char *b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* store in unsigned [big endian] format */
-int
-mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 {
   int     x, res;
   mp_int  t;
@@ -7202,6 +7384,37 @@ mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 
 /* End: bn_mp_to_unsigned_bin.c */
 
+/* Start: bn_mp_to_unsigned_bin_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
+ */
+
+/* store in unsigned [big endian] format; *outlen = buffer size in, mp_unsigned_bin_size(a) out, MP_VAL if too small */
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_unsigned_bin_size(a)) {
+      return MP_VAL;
+   }
+   *outlen = mp_unsigned_bin_size(a);
+   return mp_to_unsigned_bin(a, b);
+}
+#endif
+
+/* End: bn_mp_to_unsigned_bin_n.c */
+
 /* Start: bn_mp_toom_mul.c */
 #include <tommath.h>
 #ifdef BN_MP_TOOM_MUL_C
@@ -7217,14 +7430,15 @@ mp_to_unsigned_bin (mp_int * a, unsigned char *b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* multiplication using the Toom-Cook 3-way algorithm 
  *
- * Much more complicated than Karatsuba but has a lower asymptotic running time of 
- * O(N**1.464).  This algorithm is only particularly useful on VERY large
- * inputs (we're talking 1000s of digits here...).
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
 */
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
@@ -7500,7 +7714,7 @@ ERR:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* squaring using Toom-Cook 3-way algorithm */
@@ -7726,7 +7940,7 @@ ERR:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* stores a bignum as a ASCII string in a given radix (2..64) */
@@ -7801,7 +8015,7 @@ int mp_toradix (mp_int * a, char *str, int radix)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* stores a bignum as a ASCII string in a given radix (2..64) 
@@ -7816,12 +8030,12 @@ int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
   char   *_s = str;
 
   /* check range of the maxlen, radix */
-  if (maxlen < 3 || radix < 2 || radix > 64) {
+  if (maxlen < 2 || radix < 2 || radix > 64) {
     return MP_VAL;
   }
 
   /* quick out if its zero */
-  if (mp_iszero(a) == 1) {
+  if (mp_iszero(a) == MP_YES) {
      *str++ = '0';
      *str = '\0';
      return MP_OKAY;
@@ -7846,21 +8060,20 @@ int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
 
   digs = 0;
   while (mp_iszero (&t) == 0) {
+    if (--maxlen < 1) {
+       /* no more room */
+       break;
+    }
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
     }
     *str++ = mp_s_rmap[d];
     ++digs;
-
-    if (--maxlen == 1) {
-       /* no more room */
-       break;
-    }
   }
 
   /* reverse the digits of the string.  In this case _s points
-   * to the first digit [exluding the sign] of the number]
+   * to the first digit [excluding the sign] of the number
    */
   bn_reverse ((unsigned char *)_s, digs);
 
@@ -7890,12 +8103,11 @@ int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* get the size for an unsigned equivalent */
-int
-mp_unsigned_bin_size (mp_int * a)
+int mp_unsigned_bin_size (mp_int * a)
 {
   int     size = mp_count_bits (a);
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
@@ -7919,7 +8131,7 @@ mp_unsigned_bin_size (mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* XOR two ints together */
@@ -7944,7 +8156,7 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c)
   }
 
   for (ix = 0; ix < px; ix++) {
-
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
@@ -7970,16 +8182,22 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* set to zero */
-void
-mp_zero (mp_int * a)
+void mp_zero (mp_int * a)
 {
+  int       n;
+  mp_digit *tmp;
+
   a->sign = MP_ZPOS;
   a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+
+  tmp = a->dp;
+  for (n = 0; n < a->alloc; n++) {
+     *tmp++ = 0;
+  }
 }
 #endif
 
@@ -8000,7 +8218,7 @@ mp_zero (mp_int * a)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 const mp_digit ltm_prime_tab[] = {
   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
@@ -8061,7 +8279,7 @@ const mp_digit ltm_prime_tab[] = {
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* reverse an array, used for radix code */
@@ -8100,7 +8318,7 @@ bn_reverse (unsigned char *s, int len)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
@@ -8209,20 +8427,20 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
-
 #ifdef MP_LOW_MEM
    #define TAB_SIZE 32
 #else
    #define TAB_SIZE 256
 #endif
 
-int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
   mp_int  M[TAB_SIZE], res, mu;
   mp_digit buf;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int (*redux)(mp_int*,mp_int*,mp_int*);
 
   /* find window size */
   x = mp_count_bits (X);
@@ -8269,9 +8487,18 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   if ((err = mp_init (&mu)) != MP_OKAY) {
     goto LBL_M;
   }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto LBL_MU;
-  }
+  
+  if (redmode == 0) {
+     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce;
+  } else {
+     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce_2k_l;
+  }    
 
   /* create M table
    *
@@ -8293,11 +8520,14 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   }
 
   for (x = 0; x < (winsize - 1); x++) {
+    /* square it */
     if ((err = mp_sqr (&M[1 << (winsize - 1)], 
                        &M[1 << (winsize - 1)])) != MP_OKAY) {
       goto LBL_MU;
     }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+
+    /* reduce modulo P */
+    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
       goto LBL_MU;
     }
   }
@@ -8309,7 +8539,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
       goto LBL_MU;
     }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
       goto LBL_MU;
     }
   }
@@ -8358,7 +8588,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
         goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
         goto LBL_RES;
       }
       continue;
@@ -8375,7 +8605,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
           goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
           goto LBL_RES;
         }
       }
@@ -8384,7 +8614,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
         goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
         goto LBL_RES;
       }
 
@@ -8402,7 +8632,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
         goto LBL_RES;
       }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
         goto LBL_RES;
       }
 
@@ -8412,7 +8642,7 @@ int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
           goto LBL_RES;
         }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
           goto LBL_RES;
         }
       }
@@ -8449,15 +8679,14 @@ LBL_M:
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* multiplies |a| * |b| and only computes upto digs digits of result
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
  * many digits of output are created.
  */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   mp_int  t;
   int     res, pa, pb, ix, iy;
@@ -8540,7 +8769,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* multiplies |a| * |b| and does not compute the lower digs digits
@@ -8621,12 +8850,11 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
+int s_mp_sqr (mp_int * a, mp_int * b)
 {
   mp_int  t;
   int     res, ix, iy, pa;
@@ -8706,7 +8934,7 @@ s_mp_sqr (mp_int * a, mp_int * b)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
@@ -8795,7 +9023,7 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 
 /* Known optimal configurations
@@ -8803,11 +9031,12 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
  CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
  Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ AMD Athlon64           /GCC v3.4.4   /        80/       120/LTM 0.35
  
 */
 
-int     KARATSUBA_MUL_CUTOFF = 88,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 128,     /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 80,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 120,     /* Min. number of digits before Karatsuba squaring is used. */
         
         TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
         TOOM_SQR_CUTOFF      = 400; 
diff --git a/libtommath/tombc/grammar.txt b/libtommath/tombc/grammar.txt
new file mode 100644
index 0000000..a780e75
--- /dev/null
+++ b/libtommath/tombc/grammar.txt
@@ -0,0 +1,35 @@
+program       := program statement | statement | empty
+statement     := { statement }                                                                              | 
+                 identifier = numexpression;                                                                | 
+                 identifier[numexpression] = numexpression;                                                 |
+                 function(expressionlist);                                                                  | 
+                 for (identifier = numexpression; numexpression; identifier = numexpression) { statement }  |
+                 while (numexpression) { statement }                                                        | 
+                 if (numexpression) { statement } elif                                                      | 
+                 break;                                                                                     | 
+                 continue;                                                                                  
+                 
+elif          := else statement | empty
+function      := abs | countbits | exptmod | jacobi | print | isprime | nextprime | issquare | readinteger | exit
+expressionlist := expressionlist, expression | expression
+
+// LR(1) !!!?
+expression    := string | numexpression
+numexpression := cmpexpr && cmpexpr | cmpexpr \|\| cmpexpr | cmpexpr
+cmpexpr       := boolexpr  < boolexpr | boolexpr  > boolexpr | boolexpr == boolexpr | 
+                 boolexpr <= boolexpr | boolexpr >= boolexpr | boolexpr
+boolexpr      := shiftexpr & shiftexpr | shiftexpr ^ shiftexpr | shiftexpr \| shiftexpr | shiftexpr
+shiftexpr     := addsubexpr << addsubexpr | addsubexpr >> addsubexpr | addsubexpr
+addsubexpr    := mulexpr + mulexpr | mulexpr - mulexpr | mulexpr
+mulexpr       := expr * expr       | expr / expr | expr % expr | expr
+expr          := -nexpr | nexpr 
+nexpr         := integer | identifier | ( numexpression ) | identifier[numexpression] 
+
+identifier    := identifier digits | identifier alpha | alpha
+alpha         := a ... z | A ... Z
+integer       := hexnumber | digits 
+hexnumber     := 0xhexdigits
+hexdigits     := hexdigits hexdigit | hexdigit
+hexdigit      := 0 ... 9 | a ... f | A ... F
+digits        := digits digit | digit 
+digit         := 0 ... 9
diff --git a/libtommath/tommath.h b/libtommath/tommath.h
index 7cc92c2..4b3a76f 100644
--- a/libtommath/tommath.h
+++ b/libtommath/tommath.h
@@ -10,7 +10,7 @@
  * The library is free for all purposes without any express
  * guarantee it works.
  *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
  */
 #ifndef BN_H_
 #define BN_H_
@@ -23,10 +23,13 @@
 
 #include <tommath_class.h>
 
-#undef MIN
-#define MIN(x,y) ((x)<(y)?(x):(y))
-#undef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
+#ifndef MIN
+#   define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+
+#ifndef MAX
+#   define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -44,9 +47,9 @@ extern "C" {
 
 /* detect 64-bit mode if possible */
 #if defined(__x86_64__) 
-   #if !(defined(MP_64BIT) && defined(MP_16BIT) && defined(MP_8BIT))
-      #define MP_64BIT
-   #endif
+#   if !(defined(MP_64BIT) && defined(MP_16BIT) && defined(MP_8BIT))
+#	define MP_64BIT
+#   endif
 #endif
 
 /* some default configurations.
@@ -73,19 +76,19 @@ extern "C" {
    typedef unsigned long      mp_digit;
    typedef unsigned long      mp_word __attribute__ ((mode(TI)));
 
-   #define DIGIT_BIT          60
+#  define DIGIT_BIT          60
 #else
    /* this is the default case, 28-bit digits */
    
    /* this is to make porting into LibTomCrypt easier :-) */
 #ifndef CRYPT
-   #if defined(_MSC_VER) || defined(__BORLANDC__) 
+#  if defined(_MSC_VER) || defined(__BORLANDC__)
       typedef unsigned __int64   ulong64;
       typedef signed __int64     long64;
-   #else
+#  else
       typedef unsigned long long ulong64;
       typedef signed long long   long64;
-   #endif
+#  endif
 #endif
 
    typedef unsigned long      mp_digit;
@@ -93,35 +96,35 @@ extern "C" {
 
 #ifdef MP_31BIT   
    /* this is an extension that uses 31-bit digits */
-   #define DIGIT_BIT          31
+#  define DIGIT_BIT          31
 #else
    /* default case is 28-bit digits, defines MP_28BIT as a handy macro to test */
-   #define DIGIT_BIT          28
-   #define MP_28BIT
+#  define DIGIT_BIT          28
+#  define MP_28BIT
 #endif   
 #endif
 
 /* define heap macros */
 #ifndef CRYPT
    /* default to libc stuff */
-   #ifndef XMALLOC 
-       #define XMALLOC  malloc
-       #define XFREE    free
-       #define XREALLOC realloc
-       #define XCALLOC  calloc
-   #else
+#  ifndef XMALLOC
+#     define XMALLOC  malloc
+#     define XFREE    free
+#     define XREALLOC realloc
+#     define XCALLOC  calloc
+#  else
       /* prototypes for our heap functions */
       extern void *XMALLOC(size_t n);
-      extern void *REALLOC(void *p, size_t n);
+      extern void *XREALLOC(void *p, size_t n);
       extern void *XCALLOC(size_t n, size_t s);
       extern void XFREE(void *p);
-   #endif
+#  endif
 #endif
 
 
 /* otherwise the bits per digit is calculated automatically from the size of a mp_digit */
 #ifndef DIGIT_BIT
-   #define DIGIT_BIT     ((int)((CHAR_BIT * sizeof(mp_digit) - 1)))  /* bits per digit */
+#   define DIGIT_BIT     ((int)((CHAR_BIT * sizeof(mp_digit) - 1)))  /* bits per digit */
 #endif
 
 #define MP_DIGIT_BIT     DIGIT_BIT
@@ -147,7 +150,6 @@ extern "C" {
 /* Primality generation flags */
 #define LTM_PRIME_BBS      0x0001 /* BBS style prime */
 #define LTM_PRIME_SAFE     0x0002 /* Safe prime (p-1)/2 == prime */
-#define LTM_PRIME_2MSB_OFF 0x0004 /* force 2nd MSB to 0 */
 #define LTM_PRIME_2MSB_ON  0x0008 /* force 2nd MSB to 1 */
 
 typedef int           mp_err;
@@ -163,11 +165,11 @@ extern int KARATSUBA_MUL_CUTOFF,
 
 /* default precision */
 #ifndef MP_PREC
-   #ifndef MP_LOW_MEM
-      #define MP_PREC                 64     /* default digits of precision */
-   #else
-      #define MP_PREC                 8      /* default digits of precision */
-   #endif   
+#  ifndef MP_LOW_MEM
+#     define MP_PREC                 32     /* default digits of precision */
+#  else
+#     define MP_PREC                 8      /* default digits of precision */
+#  endif
 #endif
 
 /* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
@@ -217,7 +219,7 @@ int mp_init_size(mp_int *a, int size);
 
 /* ---> Basic Manipulations <--- */
 #define mp_iszero(a) (((a)->used == 0) ? MP_YES : MP_NO)
-#define mp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? MP_YES : MP_NO)
+#define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? MP_YES : MP_NO)
 #define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? MP_YES : MP_NO)
 
 /* set to zero */
@@ -239,7 +241,7 @@ int mp_init_set (mp_int * a, mp_digit b);
 int mp_init_set_int (mp_int * a, unsigned long b);
 
 /* copy, b = a */
-int mp_copy(mp_int *a, mp_int *b);
+int mp_copy(const mp_int *a, mp_int *b);
 
 /* inits and copies, a = b */
 int mp_init_copy(mp_int *a, mp_int *b);
@@ -256,19 +258,19 @@ void mp_rshd(mp_int *a, int b);
 int mp_lshd(mp_int *a, int b);
 
 /* c = a / 2**b */
-int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);
+int mp_div_2d(const mp_int *a, int b, mp_int *c, mp_int *d);
 
 /* b = a/2 */
 int mp_div_2(mp_int *a, mp_int *b);
 
 /* c = a * 2**b */
-int mp_mul_2d(mp_int *a, int b, mp_int *c);
+int mp_mul_2d(const mp_int *a, int b, mp_int *c);
 
 /* b = a*2 */
 int mp_mul_2(mp_int *a, mp_int *b);
 
 /* c = a mod 2**d */
-int mp_mod_2d(mp_int *a, int b, mp_int *c);
+int mp_mod_2d(const mp_int *a, int b, mp_int *c);
 
 /* computes a = 2**b */
 int mp_2expt(mp_int *a, int b);
@@ -294,16 +296,16 @@ int mp_and(mp_int *a, mp_int *b, mp_int *c);
 /* ---> Basic arithmetic <--- */
 
 /* b = -a */
-int mp_neg(mp_int *a, mp_int *b);
+int mp_neg(const mp_int *a, mp_int *b);
 
 /* b = |a| */
 int mp_abs(mp_int *a, mp_int *b);
 
 /* compare a to b */
-int mp_cmp(mp_int *a, mp_int *b);
+int mp_cmp(const mp_int *a, const mp_int *b);
 
 /* compare |a| to |b| */
-int mp_cmp_mag(mp_int *a, mp_int *b);
+int mp_cmp_mag(const mp_int *a, const mp_int *b);
 
 /* c = a + b */
 int mp_add(mp_int *a, mp_int *b, mp_int *c);
@@ -326,7 +328,7 @@ int mp_mod(mp_int *a, mp_int *b, mp_int *c);
 /* ---> single digit functions <--- */
 
 /* compare against a single digit */
-int mp_cmp_d(mp_int *a, mp_digit b);
+int mp_cmp_d(const mp_int *a, mp_digit b);
 
 /* c = a + b */
 int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
@@ -429,6 +431,15 @@ int mp_reduce_2k_setup(mp_int *a, mp_digit *d);
 /* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
 int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d);
 
+/* returns true if a can be reduced with mp_reduce_2k_l */
+int mp_reduce_is_2k_l(mp_int *a);
+
+/* determines k value for 2k reduction */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
+
+/* reduces a modulo n where n is of the form 2**p - d, d from mp_reduce_2k_setup_l [0 <= a] */
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
+
 /* d = a**b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 
@@ -436,9 +447,9 @@ int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 
 /* number of primes */
 #ifdef MP_8BIT
-   #define PRIME_SIZE      31
+#  define PRIME_SIZE      31
 #else
-   #define PRIME_SIZE      256
+#  define PRIME_SIZE      256
 #endif
 
 /* table of first PRIME_SIZE primes */
@@ -506,17 +517,19 @@ int mp_prime_next_prime(mp_int *a, int t, int bbs_style);
 int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat);
 
 /* ---> radix conversion <--- */
-int mp_count_bits(mp_int *a);
+int mp_count_bits(const mp_int *a);
 
 int mp_unsigned_bin_size(mp_int *a);
-int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+int mp_read_unsigned_bin(mp_int *a, const unsigned char *b, int c);
 int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen);
 
 int mp_signed_bin_size(mp_int *a);
-int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
-int mp_to_signed_bin(mp_int *a, unsigned char *b);
+int mp_read_signed_bin(mp_int *a, const unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a,  unsigned char *b);
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen);
 
-int mp_read_radix(mp_int *a, char *str, int radix);
+int mp_read_radix(mp_int *a, const char *str, int radix);
 int mp_toradix(mp_int *a, char *str, int radix);
 int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen);
 int mp_radix_size(mp_int *a, int radix, int *size);
@@ -554,14 +567,13 @@ int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c);
 int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c);
 int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
 int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode);
-int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int mode);
 void bn_reverse(unsigned char *s, int len);
 
 extern const char *mp_s_rmap;
 
 #ifdef __cplusplus
-   }
+}
 #endif
 
 #endif
-
diff --git a/libtommath/tommath.pdf b/libtommath/tommath.pdf
index 88e2dc7..c9571d8 100644
--- a/libtommath/tommath.pdf
+++ b/libtommath/tommath.pdf
diff --git a/libtommath/tommath.src b/libtommath/tommath.src
index 6ee842d..4065822 100644
--- a/libtommath/tommath.src
+++ b/libtommath/tommath.src
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\title{Multi--Precision Math}
 \author{\mbox{
 %\begin{small}
 \begin{tabular}{c}
@@ -66,7 +66,7 @@ QUALCOMM Australia \\
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.30 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.39 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -77,7 +77,7 @@ K2L 1C3
 Canada
 
 Phone: 1-613-836-3160
-Email: tomstdenis@iahu.ca
+Email: tomstdenis@gmail.com
 \end{alltt}
 
 This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
@@ -85,66 +85,32 @@ This text is formatted to the international B5 paper size of 176mm wide by 250mm
 
 \tableofcontents
 \listoffigures
-\chapter*{Prefaces to the Draft Edition}
-I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
-contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
-own multiple precision arithmetic.  The plan I wanted to follow was flesh out all the
-ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time.  Chance
-would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
-text.  
-
-Choosing to not waste any time I dove right into the project even before my spring semester was finished.  I wrote a bit
-off and on at first.  The moment my exams were finished I jumped into long 12 to 16 hour days.  The result after only
-a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted
-to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara.  So far I have
-managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain
-rewarding.
-
-Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
-Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
-finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text then
-Addison-Wesley and Greg is tried another which I don't know the name of.  However, at this point I want to focus my energy
-onto finishing the book not securing a contract.
-
-So why am I writing this text?  It seems like a lot of work right?  Most certainly it is a lot of work writing a textbook.  
-Even the simplest introductory material has to be lined with references and figures.  A lot of the text has to be re-written
-from point form to prose form to ensure an easier read.  Why am I doing all this work for free then?  Simple. My philosophy
-is quite simply ``Open Source.  Open Academia.  Open Minds'' which means that to achieve a goal of open minds, that is,
-people willing to accept new ideas and explore the unknown you have to make available material they can access freely 
-without hinderance.  
-
-I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
-to depend upon.  I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
-software.  Several educational institutions use it as a matter of course and many freelance developers use it as
-part of their projects.  To further my contributions I started the LibTomMath project in December 2002 aimed at providing
-multiple precision arithmetic routines that students could learn from.  That is write routines that are not only easy
-to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.  
-
-The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in.  In the end, when all is
-said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic.  
-
-At this time I feel I should share a little information about myself.  The most common question I was asked at 
-Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended.  The unfortunate
-truth is that I neither teach at or attend a school of academic reputation.  I'm currently at Algonquin College which 
-is what I'd like to call ``somewhat academic but mostly vocational'' college.  In otherwords, job training.
-
-I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen
-computer science fields, a few fields of mathematics and some English).  I look forward to teaching someday but I am
-still far off from that goal.  
-
-Now it would be improper for me to not introduce the rest of the texts co-authors.  While they are only contributing 
-corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out
-in the text so far.  Greg has always been there for me.  He has tracked my LibTom projects since their inception and even
-sent cheques to help pay tuition from time to time.  His background has provided a wonderful source to bounce ideas off
-of and improve the quality of my writing.  Mads is another fellow who has just ``been there''.  I don't even recall what
-his interest in the LibTom projects is but I'm definitely glad he has been around.  His ability to catch logical errors
-in my written English have saved me on several occasions to say the least.
-
-What to expect next?  Well this is still a rough draft.  I've only had the chance to update a few chapters.  However, I've
-been getting the feeling that people are starting to use my text and I owe them some updated material.  My current tenative
-plan is to edit one chapter every two weeks starting January 4th.  It seems insane but my lower course load at college
-should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
-people who will take it.
+\chapter*{Prefaces}
+When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.  
+They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''  
+Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which 
+perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
+others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give 
+back to society in the form of tools and knowledge that can help others in their endeavours.
+
+I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
+code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
+explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
+itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
+of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
+from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
+
+This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
+of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
+length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
+has literally thousands of users and comprises over 100,000 lines of source code, TeX and other material.  People like Mads and Greg 
+were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to 
+continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.  
+
+To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
+honour your kind gestures with this project.
+
+Open Source.  Open Academia.  Open Minds.
 
 \begin{flushright} Tom St Denis \end{flushright}
 
@@ -302,7 +268,7 @@ and fast modular inversion, which we consider practical oversights.  These optim
 any form of useful performance in non-trivial applications.  
 
 To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
-package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used 
+package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.com}} package is used 
 to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field 
 tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text 
 discusses a very large portion of the inner workings of the library.
@@ -937,7 +903,7 @@ assumed to contain undefined values they are initially set to zero.
 
 EXAM,bn_mp_grow.c
 
-A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @23,if@) checks
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @24,alloc@) checks
 if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
 the function skips the re-allocation part thus saving time.
 
@@ -1310,7 +1276,7 @@ After the function is completed, all of the digits are zeroed, the \textbf{used}
 With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
 the absolute value of an mp\_int.
 
-\newpage\begin{figure}[here]
+\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_abs}. \\
@@ -1335,6 +1301,9 @@ logic to handle it.
 
 EXAM,bn_mp_abs.c
 
+This fairly trivial algorithm first eliminates non--required duplications (line @27,a != b@) and then sets the
+\textbf{sign} flag to \textbf{MP\_ZPOS}.
+
 \subsection{Integer Negation}
 With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
 the negative of an mp\_int input.
@@ -1368,11 +1337,15 @@ zero as negative.
 
 EXAM,bn_mp_neg.c
 
+Like mp\_abs() this function avoids non--required duplications (line @21,a != b@) and then sets the sign.  We
+have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
+then the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
+
 \section{Small Constants}
 \subsection{Setting Small Constants}
 Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
 
-\begin{figure}[here]
+\newpage\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_set}. \\
@@ -1397,11 +1370,14 @@ single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adj
 
 EXAM,bn_mp_set.c
 
-Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign.  Line @22,MP_MASK@ copies the digit 
-into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
-reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
-$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line @23,a->used@ will set the \textbf{used} member with respect to the 
-digit actually set. This function will always make the integer positive.
+First we zero (line @21,mp_zero@) the mp\_int to make sure that the other members are initialized for a 
+small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
+is zero.  Next we set the digit and reduce it modulo $\beta$ (line @22,MP_MASK@).  After this step we have to 
+check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
+to zero.
+
+We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with 
+$2^k - 1$ will perform the same operation.
 
 One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
 this function should take that into account.  Only trivially small constants can be set using this function.
@@ -1503,10 +1479,12 @@ the zero'th digit.  If after all of the digits have been compared, no difference
 
 EXAM,bn_mp_cmp_mag.c
 
-The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs.  These two are performed before all of the digits
-are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
-without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
-array of digits.
+The two if statements (lines @24,if@ and @28,if@) compare the number of digits in the two inputs.  These two are 
+performed before all of the digits are compared since it is a very cheap test to perform and can potentially save 
+considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be 
+smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
+
+
 
 \subsection{Signed Comparisons}
 Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
@@ -1539,9 +1517,9 @@ $\vert a \vert < \vert b \vert$.  Step number four will compare the two when the
 
 EXAM,bn_mp_cmp.c
 
-The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison.  If the signs are not the equal then which ever
-has the positive sign is larger.   At line @30,if@, the inputs are compared based on magnitudes.  If the signs were both negative then 
-the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}).  Otherwise, the signs are assumed to 
+The two if statements (lines @22,if@ and @26,if@) perform the initial sign comparison.  If the signs are not equal then whichever
+has the positive sign is larger.   The inputs are compared (line @30,if@) based on magnitudes.  If the signs were both 
+negative then the unsigned comparison is performed in the opposite direction (line @31,mp_cmp_mag@).  Otherwise, the signs are assumed to 
 be both positive and a forward direction unsigned comparison is performed.
 
 \section*{Exercises}
@@ -1664,19 +1642,21 @@ The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are
 
 EXAM,bn_s_mp_add.c
 
-Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
-mp\_int assigned to the largest input, in effect it is a local alias.  Lines @37,init@ to @42,}@ ensure that the destination is grown to 
-accomodate the result of the addition. 
+We first sort (lines @27,if@ to @35,}@) the inputs based on magnitude and determine the $min$ and $max$ variables.
+Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
+grow the destination (lines @37,init@ to @42,}@) to ensure that it can accommodate the result of the addition. 
 
 Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
 lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively.  These aliases are used to ensure the
 compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
 
-The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the 
-implementation.  The initial addition loop begins on line @66,for@ and ends on line @75,}@.  Similarly the conditional addition loop
-begins on line @81,for@ and ends on line @90,}@.  The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@.  
-Note the ``++'' operator on the same line.  After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero.
+The initial carry $u$ will be cleared (line @65,u = 0@), note that $u$ is of type mp\_digit which ensures type 
+compatibility within the implementation.  The initial addition (line @66,for@ to @75,}@) adds digits from
+both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
+(line @81,for@ to @90,}@) adds the remaining digits from the larger of the two inputs.  The addition is finished 
+with the final carry being stored in $tmpc$ (line @94,tmpc++@).  Note the ``++'' operator within the same expression.
+After line @94,tmpc++@, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop (line @97,for@ to @99,}@) which sets any old upper digits to zero.
 
 \subsection{Low Level Subtraction}
 The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principle difference is that the
@@ -1692,7 +1672,7 @@ this algorithm we will assume that the variable $\gamma$ represents the number o
 mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
 
 For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
 
 \newpage\begin{figure}[!here]
 \begin{center}
@@ -1759,20 +1739,23 @@ If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and cop
 
 EXAM,bn_s_mp_sub.c
 
-Line @24,min@ and @25,max@ perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
-used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for 
-$a$, $b$ and $c$ respectively.
+Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded 
+(lines @24,min@ and @25,max@).  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used 
+within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
+(lines @42,tmpa@, @43,tmpb@ and @44,tmpc@) for $a$, $b$ and $c$ respectively.
 
-The first subtraction loop occurs on lines @47,u = 0@ through @61,}@.  The theory behind the subtraction loop is exactly the same as that for
-the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
-(\textit{see line @57, >>@}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
-the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
-occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
-shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
-twos compliment machines which is a safe assumption to make.
+The first subtraction loop (lines @47,u = 0@ through @61,}@) subtracts digits from both inputs until the smaller of
+the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward'' 
+method of extracting the carry (line @57, >>@).  The traditional method for extracting the carry would be to shift 
+by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of 
+the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry 
+extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the 
+most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This 
+optimization only works on two's complement machines which is a safe assumption to make.
 
-If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through
-$a$ and copy the result to $c$.  
+If $a$ has a larger magnitude than $b$ an additional loop (lines @64,for@ through @73,}@) is required to propagate 
+the carry through $a$ and copy the result to $c$.  
 
 \subsection{High Level Addition}
 Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
@@ -2098,10 +2081,11 @@ FIGU,sliding_window,Sliding Window Movement
 
 EXAM,bn_mp_lshd.c
 
-The if statement on line @24,if@ ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
-the copy loop begins.  This elminates the need for an additional variable in the for loop.  The variable $top$ on line @42,top@ is an alias
-for the leading digit while $bottom$ on line @45,bottom@ is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
-over the input.  
+The if statement (line @24,if@) ensures that the $b$ variable is greater than zero since we do not interpret negative
+shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This eliminates 
+the need for an additional variable in the for loop.  The variable $top$ (line @42,top@) is an alias
+for the leading digit while $bottom$ (line @45,bottom@) is an alias for the trailing edge.  The aliases form a 
+window of exactly $b$ digits over the input.  
 
 \subsection{Division by $x$}
 
@@ -2151,9 +2135,9 @@ Once the window copy is complete the upper digits must be zeroed and the \textbf
 
 EXAM,bn_mp_rshd.c
 
-The only noteworthy element of this routine is the lack of a return type.  
-
--- Will update later to give it a return type...Tom
+The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
+form a sliding window except we copy in the other direction.  After the window (line @59,for (;@) we then zero
+the upper digits of the input to make sure the result is correct.
 
 \section{Powers of Two}
 
@@ -2206,7 +2190,7 @@ left.
 
 After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
 required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
-Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
 variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
 
 This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
@@ -2214,7 +2198,15 @@ complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm
 
 EXAM,bn_mp_mul_2d.c
 
-Notes to be revised when code is updated. -- Tom
+The shifting is performed in--place which means the first step (line @24,a != c@) is to copy the input to the 
+destination.  We avoid calling mp\_copy() by making sure the mp\_ints are different.  The destination then
+has to be grown (line @31,grow@) to accommodate the result.
+
+If the shift count $b$ is at least $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples 
+of $lg(\beta)$, leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits.  Inside the actual shift 
+loop (lines @45,if@ to @76,}@) we make use of pre--computed values $shift$ and $mask$.   These are used to
+extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a 
+chain between consecutive iterations to propagate the carry.  
 
 \subsection{Division by Power of Two}
 
@@ -2263,7 +2255,8 @@ ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The
 result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
 the quotient is obtained.
 
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
+the direction of the shifts.
 
 \subsection{Remainder of Division by Power of Two}
 
@@ -2306,7 +2299,13 @@ is copied to $b$, leading digits are removed and the remaining leading digit is
 
 EXAM,bn_mp_mod_2d.c
 
--- Add comments later, Tom.
+We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
+than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
+perform some work to produce the remainder.
+
+Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce 
+the number.  First we zero any digits above the last digit in $2^b$ (line @41,for@).  Next we reduce the 
+leading digit of both (line @45,&=@) and then mp\_clamp().
 
 \section*{Exercises}
 \begin{tabular}{cl}
@@ -2464,33 +2463,46 @@ exceed the precision requested.
 
 EXAM,bn_s_mp_mul_digs.c
 
-Lines @31,if@ to @35,}@ determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
-the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
-the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
+First we determine (line @30,if@) if the Comba method can be used since it's faster.  The conditions for 
+using the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than 
+\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is 
+set to $\delta$ but can be reduced when memory is at a premium.
+
+If we cannot use the Comba method we proceed to set up the baseline routine.  We allocate the destination mp\_int
+$t$ (line @36,init@) to the exact size of the output to avoid further re--allocations.  At this point we now 
+begin the $O(n^2)$ loop.
+
+This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
+digits as output.  In each iteration of the outer loop the $pb$ variable is set (line @48,MIN@) to the maximum 
+number of inner loop iterations.  
 
-Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@.  Note how all of the
-variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
-are used instead of single precision.  The multiplication on line @65,) * (@ makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
-the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
-processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
-example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
+carry from the previous iteration.  A particularly important observation is that most modern optimizing 
+C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that 
+is required for the product.  In x86 terms for example, this means using the MUL instruction.
+
+Each digit of the product is stored in turn (line @68,tmpt@) and the carry propagated (line @71,>>@) to the 
+next iteration.
 
 \subsection{Faster Multiplication by the ``Comba'' Method}
 MARK,COMBA
 
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
-makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
-(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
-carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
-his 1986 paper \cite{BARRETT} written five years before.
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be 
+computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement 
+in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. 
+Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an 
+interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written 
+five years before.
 
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
-the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
-final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight 
+twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products 
+are produced then added together to form the final result.  In the baseline algorithm the columns are added together 
+after each iteration to get the result instantaneously.  
 
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
-simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
-of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at 
+the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated 
+after the nested loop to reduce the amount of work required. Succinctly the first step of the algorithm is to compute 
+the product vector $\vec x$ as follows. 
 
 \begin{equation}
 \vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
@@ -2584,38 +2596,31 @@ $256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which,
 \textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
 \textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
 \hline \\
-Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
 1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
 2.  If step 1 failed return(\textit{MP\_MEM}).\\
 \\
-Zero the temporary array $\hat W$. \\
-3.  for $n$ from $0$ to $digs - 1$ do \\
-\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
+3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
 \\
-Compute the columns. \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
-\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+4.  $\_ \hat W \leftarrow 0$ \\
+5.  for $ix$ from 0 to $pa - 1$ do \\
+\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
+\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
+\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iz}b_{ty-iz}$ \\
+\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
+\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
 \\
-Propagate the carries upwards. \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow digs$ \\
-7.  If $digs > 1$ then do \\
-\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
-\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
-\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
-8.  else do \\
-\hspace{3mm}8.1  $ix \leftarrow 0$ \\
-9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+6.  $oldused \leftarrow c.used$ \\
+7.  $c.used \leftarrow digs$ \\
+8.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}8.1  $c_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}9.1 $c_{ix} \leftarrow 0$ \\
 \\
-Zero excess digits. \\
-10.  If $digs < oldused$ then do \\
-\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\
+10.  Clamp $c$. \\
+11.  Return MP\_OKAY. \\
 \hline
 \end{tabular}
 \end{center}
@@ -2625,15 +2630,24 @@ Zero excess digits. \\
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
-essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
+
+The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the 
+loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
+reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
+
+The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
+$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the distance past $b.used$ the variable
+$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.  
 
-The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
-unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
+The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
+means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
+pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to 
+move $ty$ downwards so the equality remains valid.  The $iy$ variable is the number of iterations until 
+$tx \ge a.used$ or $ty < 0$ occurs.
 
-The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
-a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
-iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
+into the next round by dividing $\_ \hat W$ by $\beta$.
 
 To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
 cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
@@ -2643,20 +2657,20 @@ and addition operations in the nested loop in parallel.
 
 EXAM,bn_fast_s_mp_mul_digs.c
 
-The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
-implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop.  
-In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
+As per the pseudo--code we first calculate $pa$ (line @47,MIN@) as the number of digits to output.  Next we begin the outer loop
+to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines @61,tmpx@, @62,tmpy@) to point
+inside the two multiplicands quickly.  
 
-The inner loop on lines @83,for@, @84,mp_word@ and @85,}@ is where the algorithm will spend the majority of the time, which is why it has been 
-stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
-very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
-(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
-and scheduling the instructions so there are very few dependency stalls.
+The inner loop (lines @70,for@ to @72,}@) of this implementation is where the tradeoff comes into play.  Originally this Comba 
+implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
+the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
+one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
+is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often 
+slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
+compiler has aliased $\_ \hat W$ to a CPU register.
 
-In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
-baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
-digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
-be simultaneously used.  
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines @75,W[ix]@, @78,>>@) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line @82,W[ix]@) as the last digit of the product.  
 
 \subsection{Polynomial Basis Multiplication}
 To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
@@ -2760,26 +2774,25 @@ general purpose multiplication.  Given two polynomial basis representations $f(x
 light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
 
 \begin{equation}
-f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd
+f(x) \cdot g(x) = acx^2 + ((a + b)(c + d) - (ac + bd))x + bd
 \end{equation}
 
 Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
 this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
 out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
-$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+$\zeta_0$, $\zeta_{\infty}$ and $\zeta_{1}$.  Consider the resultant system of equations.
 
 \begin{center}
 \begin{tabular}{rcrcrcrc}
 $\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{1}$ &      $=$ & $w_2$ & $+$ & $w_1$ & $+$ & $w_0$ \\
 $\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
 \end{tabular}
 \end{center}
 
 By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
 of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
-$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -2802,13 +2815,13 @@ Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
 Calculate the three products. \\
 8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
 9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-11.  $x0 \leftarrow y1 - y0$ \\
+10.  $t1 \leftarrow x1 + x0$ (\textit{mp\_add}) \\
+11.  $x0 \leftarrow y1 + y0$ \\
 12.  $t1 \leftarrow t1 \cdot x0$ \\
 \\
 Calculate the middle term. \\
 13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow x0 - t1$ \\
+14.  $t1 \leftarrow t1 - x0$ (\textit{s\_mp\_sub}) \\
 \\
 Calculate the final product. \\
 15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
@@ -2835,7 +2848,7 @@ smallest input \textbf{used} count.  After the radix point is chosen the inputs
 compute the lower halves.  Step 6 and 7 computer the upper halves.  
 
 After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 + x0$ has been computed.  By using $x0$ instead
 of an additional temporary variable, the algorithm can avoid an addition memory allocation operation.
 
 The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
@@ -2976,13 +2989,26 @@ result $a \cdot b$ is produced.
 
 EXAM,bn_mp_toom_mul.c
 
--- Comments to be added during editing phase.
+The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very 
+large numbers.  For example, a 10,000 digit multiplication takes approximately 99,282,205 fewer single precision multiplications with
+Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
+algorithm is not practical as Karatsuba has a much lower cutoff point.
+
+First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines @40,mod@ to @69,rshd@) with 
+combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similarly
+for $b$.  
+
+Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
+we get those out of the way first (lines @72,mul@ and @77,mul@).  Next we compute $w1, w2$ and $w3$ using Horner's method.
+
+After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
+straight forward.  
 
 \subsection{Signed Multiplication}
 Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required.  So far all
 of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -3065,7 +3091,7 @@ Column two of row one is a square and column three is the first unique column.
 The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
 will not handle.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -3121,9 +3147,14 @@ results calculated so far.  This involves expensive carry propagation which will
 
 EXAM,bn_s_mp_sqr.c
 
-Inside the outer loop (\textit{see line @32,for@}) the square term is calculated on line @35,r =@.  Line @42,>>@ extracts the carry from the square
-term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines @45,tmpx@ and @48,tmpt@ respectively.  The doubling is performed using two
-additions (\textit{see line @57,r + r@}) since it is usually faster than shifting,if not at least as fast.  
+Inside the outer loop (line @32,for@) the square term is calculated on line @35,r =@.  The carry (line @42,>>@) has been
+extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized 
+(lines @45,tmpx@ and @48,tmpt@) to simplify the inner loop.  The doubling is performed using two
+additions (line @57,r + r@) since it is usually faster than shifting, if not at least as fast.  
+
+The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
+get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
+square a number. 
 
 \subsection{Faster Squaring by the ``Comba'' Method}
 A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
@@ -3135,9 +3166,9 @@ propagation operations from the inner loop.  However, the inner product must sti
 that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
 $ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
 
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
-arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
-moved to a $O(n)$ work level outside the $O(n^2)$ level.  
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two 
+mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and 
+carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -3147,34 +3178,34 @@ moved to a $O(n)$ work level outside the $O(n^2)$ level.
 \textbf{Input}.   mp\_int $a$ \\
 \textbf{Output}.  $b \leftarrow a^2$ \\
 \hline \\
-Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
 1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
 2.  If step 1 failed return(\textit{MP\_MEM}). \\
-3.  for $ix$ from $0$ to $2a.used + 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}Compute the square.\\
-\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
 \\
-\hspace{3mm}Compute the double products.\\
-\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
-5.  $oldused \leftarrow b.used$ \\
-6.  $b.used \leftarrow 2a.used + 1$ \\
+3.  $pa \leftarrow 2 \cdot a.used$ \\
+4.  $\hat W1 \leftarrow 0$ \\
+5.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
+\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
+\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
+\hspace{3mm}5.6  for $iz$ from $0$ to $iy - 1$ do \\
+\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
+\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
+\hspace{3mm}5.8  if $ix$ is even then \\
+\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
+\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
 \\
-Double the products and propagate the carries simultaneously. \\
-7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
-8.  for $ix$ from $1$ to $2a.used$ do \\
-\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
-\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
-\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
-10.  if $2a.used + 1 < oldused$ then do \\
-\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
-\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
-11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\ 
+6.  $oldused \leftarrow b.used$ \\
+7.  $b.used \leftarrow 2 \cdot a.used$ \\
+8.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa$ to $oldused - 1$ do \\
+\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+11.  Return(\textit{MP\_OKAY}). \\ 
 \hline
 \end{tabular}
 \end{center}
@@ -3183,24 +3214,24 @@ Double the products and propagate the carries simultaneously. \\
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
-the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm 
+s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
 
-This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
-array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
-processors to simply make it a full size array.
+First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
+products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
+additional MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal to
+$a_5 \cdot a_3$.  In the multiplication case both products would be computed, as $5 < a.used$ and $3 \ge 0$ are maintained; here, since we double the sum
+of the products just outside the inner loop, we must avoid computing both.  This is also a good thing since we perform
+fewer multiplications and the routine ends up being faster.
 
-The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
-it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
-computes the sum of the products for each column.  They are not doubled until later.
-
-After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards.  It makes sense to do both
-operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
-squares in place.  
+Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
+only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
 
 EXAM,bn_fast_s_mp_sqr.c
 
--- Write something deep and insightful later, Tom.
+This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for 
+the special case of squaring.  
 
 \subsection{Polynomial Basis Squaring}
 The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
@@ -3213,10 +3244,10 @@ Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  Th
 number with the following equation.
 
 \begin{equation}
-h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+h(x) = a^2x^2 + \left ((a + b)^2 - (a^2 + b^2) \right )x + b^2
 \end{equation}
 
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a + b)^2$.  As in 
 Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
 $O \left ( n^{lg(3)} \right )$.
 
@@ -3248,12 +3279,12 @@ Split the input.  e.g. $a = x1\beta^B + x0$ \\
 Calculate the three squares. \\
 6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
 7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+8.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
 9.  $t1 \leftarrow t1^2$ \\
 \\
 Compute the middle term. \\
 10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t2 - t1$ \\
+11.  $t1 \leftarrow t1 - t2$ \\
 \\
 Compute final product. \\
 12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
@@ -3276,7 +3307,7 @@ The radix point for squaring is simply placed exactly in the middle of the digit
 placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
 as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
 
-By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+By expanding $\left (x1 + x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $(x1 + x0)^2 - (x1^2 + x0^2)  = 2 \cdot x0 \cdot x1$.
 Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
 this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
 
@@ -3312,14 +3343,13 @@ By inlining the copy and shift operations the cutoff point for Karatsuba multipl
 is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
 it is actually below the Comba limit (\textit{at 110 digits}).
 
-This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are redirected to
-the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
-
-\textit{Last paragraph sucks.  re-write! -- Tom}
+This routine uses the same error trap coding style as mp\_karatsuba\_mul.  As the temporary variables are initialized errors are 
+redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and 
+mp\_clears are executed normally.
 
 \subsection{Toom-Cook Squaring}
 The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations..  The reader is encouraged to read the description of the latter algorithm and try to 
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
 derive their own Toom-Cook squaring algorithm.  
 
 \subsection{High Level Squaring}
@@ -3362,12 +3392,9 @@ EXAM,bn_mp_sqr.c
 $\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
                       & that have different number of digits in Karatsuba multiplication. \\
                       & \\
-$\left [ 3 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
+$\left [ 2 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
                       & of double products and at most one square is stated.  Prove this statement. \\
                       & \\                      
-$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
-                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
-                      & \\
 $\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
                       & \\
 $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
@@ -3375,6 +3402,14 @@ $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3
 $\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
                       & required for equation $6.7$ to be true.  \\
                       & \\
+$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
+                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
+                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
+                      &\\
+$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
+                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
+                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
+                      & \\
 \end{tabular}
 
 \chapter{Modular Reduction}
@@ -3394,7 +3429,7 @@ other forms of residues.
 Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
 is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
 RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in 
-Elliptic Curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
 exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
 range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
 algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoeretic problems.  
@@ -3610,7 +3645,7 @@ safe to do so.
 In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
 future use so that the Barrett algorithm can be used without delay.  
 
-\begin{figure}[!here]
+\newpage\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -3695,6 +3730,7 @@ $0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction
 \hline $6$ & $x/2 = 139$ \\
 \hline $7$ & $x + n = 396$, $x/2 = 198$ \\
 \hline $8$ & $x/2 = 99$ \\
+\hline $9$ & $x + n = 356$, $x/2 = 178$ \\
 \hline
 \end{tabular}
 \end{center}
@@ -3703,8 +3739,8 @@ $0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction
 \label{fig:MONT1}
 \end{figure}
 
-Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The result of the algorithm $r = 99$ is
-congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^8$ modulo $257$ the correct residue 
+Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 9$ (note $\beta^k = 512$ which is larger than $n$).  The result of 
+the algorithm $r = 178$ is congruent to the value of $2^{-9} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^9$ modulo $257$ the correct residue 
 $r \equiv 158$ is produced.  
 
 Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
@@ -3716,10 +3752,10 @@ Fortunately there exists an alternative representation of the algorithm.
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ ($2^k > n$) \\
 \textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
 \hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
+1.  for $t$ from $1$ to $k$ do \\
 \hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
 \hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
 2.  Return $x/2^k$. \\
@@ -3747,7 +3783,8 @@ precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a s
 \hline $6$ & $8896$ & $10001011000000$ \\
 \hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
 \hline $8$ & $25344$ & $110001100000000$ \\
-\hline -- & $x/2^k = 99$ & \\
+\hline $9$ & $x + 2^{8}n = 91136$ & $10110010000000000$ \\
+\hline -- & $x/2^k = 178$ & \\
 \hline
 \end{tabular}
 \end{center}
@@ -3756,7 +3793,7 @@ precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a s
 \label{fig:MONT2}
 \end{figure}
 
-Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
+Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 9$. 
 With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
 loop.  Note that for the iterations $t = 2, 5, 6$ and $8$ where the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
 zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
@@ -3770,7 +3807,7 @@ previous algorithm re-written to compute the Montgomery reduction in this new fa
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ ($\beta^k > n$) \\
 \textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
 \hline \\
 1.  for $t$ from $0$ to $k - 1$ do \\
@@ -3998,7 +4035,7 @@ To calculate the variable $\rho$ a relatively simple algorithm will be required.
 \hline \\
 1.  $b \leftarrow n_0$ \\
 2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
 4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
 \hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
 5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
@@ -4902,15 +4939,15 @@ a Left-to-Right algorithm is used to process the remaining few bits.
 
 EXAM,bn_s_mp_exptmod.c
 
-Lines @26,if@ through @40,}@ determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+Lines @31,if@ through @45,}@ determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
 from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
-on line @32,if@ the value of $x$ is already known to be greater than $140$.  
+on line @37,if@ the value of $x$ is already known to be greater than $140$.  
 
 The conditional piece of code beginning on line @42,ifdef@ allows the window size to be restricted to five bits.  This logic is used to ensure
 the table of precomputed powers of $G$ remains relatively small.  
 
-The for loop on line @49,for@ initializes the $M$ array while lines @59,mp_init@ and @62,mp_reduce@ compute the value of $\mu$ required for
-Barrett reduction.  
+The for loop on line @60,for@ initializes the $M$ array while lines @71,mp_init@ and @75,mp_reduce@ through @85,}@ initialize the reduction
+function that will be used for this modulus.
 
 -- More later.
 
@@ -5193,23 +5230,23 @@ algorithm with only the quotient is
 mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
 \end{verbatim}
 
-Lines @37,if@ and @42,if@ handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
-respectively.  After the two trivial cases all of the temporary variables are initialized.  Line @76,neg@ determines the sign of 
-the quotient and line @77,sign@ ensures that both $x$ and $y$ are positive.  
+Lines @108,if@ and @113,if@ handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line @147,neg@ determines the sign of 
+the quotient and line @148,sign@ ensures that both $x$ and $y$ are positive.  
 
-The number of bits in the leading digit is calculated on line @80,norm@.  Implictly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+The number of bits in the leading digit is calculated on line @151,norm@.  Implicitly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
 of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
 exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
 them to the left by $lg(\beta) - 1 - k$ bits.
 
 Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the 
-leading digit of the quotient.  The loop beginning on line @113,for@ will produce the remainder of the quotient digits.
+leading digit of the quotient.  The loop beginning on line @184,for@ will produce the remainder of the quotient digits.
 
-The conditional ``continue'' on line @114,if@ is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
+The conditional ``continue'' on line @186,continue@ is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
 algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
 above the $i$'th position $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.  
 
-Lines @142,t1@, @143,t1@ and @150,t2@ through @152,t2@ manually construct the high accuracy estimations by setting the digits of the two mp\_int 
+Lines @214,t1@, @216,t1@ and @222,t2@ through @225,t2@ manually construct the high accuracy estimations by setting the digits of the two mp\_int 
 variables directly.  
 
 \section{Single Digit Helpers}
@@ -5707,33 +5744,30 @@ and will produce the greatest common divisor.
 \textbf{Input}.   mp\_int $a$ and $b$ \\
 \textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
 \hline \\
-1.  If $a = 0$ and $b \ne 0$ then \\
-\hspace{3mm}1.1  $c \leftarrow b$ \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow \vert b \vert $ \\
 \hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $a \ne 0$ and $b = 0$ then \\
-\hspace{3mm}2.1  $c \leftarrow a$ \\
+2.  If $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow \vert a \vert $ \\
 \hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
-3.  If $a = b = 0$ then \\
-\hspace{3mm}3.1  $c \leftarrow 1$ \\
-\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
-4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
-5.  $k \leftarrow 0$ \\
-6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
-\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-9.  While $v.used > 0$ \\
-\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
-\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
-\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
-\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-10.  $c \leftarrow u \cdot 2^k$ \\
-11.  Return(\textit{MP\_OKAY}). \\
+3.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+4.  $k \leftarrow 0$ \\
+5.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}5.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+8.  While $v.used > 0$ \\
+\hspace{3mm}8.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}8.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}8.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}8.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}8.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  $c \leftarrow u \cdot 2^k$ \\
+10.  Return(\textit{MP\_OKAY}). \\
 \hline
 \end{tabular}
 \end{center}
@@ -5745,17 +5779,17 @@ This algorithm will produce the greatest common divisor of two mp\_ints $a$ and
 Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
 Algorithm B and in practice this appears to be true.  
 
-The first three steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
+The first two steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
 largest input or zero if they are both zero.  If the inputs are not trivial than $u$ and $v$ are assigned the absolute values of 
 $a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
 
-Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step two is no longer a
+Step five will divide out any common factors of two and keep track of the count in the variable $k$.  After this step, two is no longer a
 factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Step 
-seven and eight ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while loops will iterate since 
+six and seven ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while--loops will iterate since 
 they cannot both be even.
 
-By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
-or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a positive and even result.  Step 9.3 removes any
+By step eight both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
+or greater than $u$.  This ensures that the subtraction on step 8.2 will always produce a non-negative and even result.  Step 8.3 removes any
 factors of two from the difference $u$ to ensure that in the next iteration of the loop both are once again odd.
 
 After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
@@ -5766,17 +5800,17 @@ EXAM,bn_mp_gcd.c
 This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the 
 integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
 it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three 
-trivial cases of inputs are handled on lines @25,zero@ through @34,}@.  After those lines the inputs are assumed to be non-zero.
+trivial cases of inputs are handled on lines @23,zero@ through @29,}@.  After those lines the inputs are assumed to be non-zero.
 
-Lines @36,if@ and @40,if@ make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
-must be divided out of the two inputs.  The while loop on line @49,while@ iterates so long as both are even.  The local integer $k$ is used to
-keep track of how many factors of $2$ are pulled out of both values.  It is assumed that the number of factors will not exceed the maximum 
-value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more than entries than are accessible by an ``int'' so this is not 
-a limitation.}.  
+Lines @32,if@ and @36,if@ make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
+must be divided out of the two inputs.  The block starting at line @43,common@ removes common factors of two by first counting the number of trailing
+zero bits in both.  The local integer $k$ is used to keep track of how many factors of $2$ are pulled out of both values.  It is assumed that 
+the number of factors will not exceed the maximum value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more 
+entries than are accessible by an ``int'' so this is not a limitation.}.  
 
-At this point there are no more common factors of two in the two values.  The while loops on lines @60,while@ and @65,while@ remove any independent
-factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
-on line @71, while@ performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
+At this point there are no more common factors of two in the two values.  The divisions by a power of two on lines @60,div_2d@ and @67,div_2d@ remove 
+any independent factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
+on line @72, while@ performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
 place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
 
 \section{Least Common Multiple}
@@ -5818,6 +5852,8 @@ To explain the Jacobi Symbol we shall first discuss the Legendre function\footno
 defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
 equivalent to equation \ref{eqn:legendre}.
 
+\textit{-- TODO (Tom): cite a source for the Legendre function here.}
+
 \begin{equation}
 a^{(p-1)/2} \equiv \begin{array}{rl}
                               -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
diff --git a/libtommath/tommath.tex b/libtommath/tommath.tex
index 9c4dc82..c79a537 100644
--- a/libtommath/tommath.tex
+++ b/libtommath/tommath.tex
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\title{Multi--Precision Math}
 \author{\mbox{
 %\begin{small}
 \begin{tabular}{c}
@@ -66,7 +66,7 @@ QUALCOMM Australia \\
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.30 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.39 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -77,7 +77,7 @@ K2L 1C3
 Canada
 
 Phone: 1-613-836-3160
-Email: tomstdenis@iahu.ca
+Email: tomstdenis@gmail.com
 \end{alltt}
 
 This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
@@ -85,66 +85,32 @@ This text is formatted to the international B5 paper size of 176mm wide by 250mm
 
 \tableofcontents
 \listoffigures
-\chapter*{Prefaces to the Draft Edition}
-I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
-contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
-own multiple precision arithmetic.  The plan I wanted to follow was flesh out all the
-ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time.  Chance
-would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
-text.  
-
-Choosing to not waste any time I dove right into the project even before my spring semester was finished.  I wrote a bit
-off and on at first.  The moment my exams were finished I jumped into long 12 to 16 hour days.  The result after only
-a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted
-to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara.  So far I have
-managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain
-rewarding.
-
-Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
-Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
-finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text then
-Addison-Wesley and Greg is tried another which I don't know the name of.  However, at this point I want to focus my energy
-onto finishing the book not securing a contract.
-
-So why am I writing this text?  It seems like a lot of work right?  Most certainly it is a lot of work writing a textbook.  
-Even the simplest introductory material has to be lined with references and figures.  A lot of the text has to be re-written
-from point form to prose form to ensure an easier read.  Why am I doing all this work for free then?  Simple. My philosophy
-is quite simply ``Open Source.  Open Academia.  Open Minds'' which means that to achieve a goal of open minds, that is,
-people willing to accept new ideas and explore the unknown you have to make available material they can access freely 
-without hinderance.  
-
-I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
-to depend upon.  I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
-software.  Several educational institutions use it as a matter of course and many freelance developers use it as
-part of their projects.  To further my contributions I started the LibTomMath project in December 2002 aimed at providing
-multiple precision arithmetic routines that students could learn from.  That is write routines that are not only easy
-to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.  
-
-The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in.  In the end, when all is
-said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic.  
-
-At this time I feel I should share a little information about myself.  The most common question I was asked at 
-Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended.  The unfortunate
-truth is that I neither teach at or attend a school of academic reputation.  I'm currently at Algonquin College which 
-is what I'd like to call ``somewhat academic but mostly vocational'' college.  In otherwords, job training.
-
-I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen
-computer science fields, a few fields of mathematics and some English).  I look forward to teaching someday but I am
-still far off from that goal.  
-
-Now it would be improper for me to not introduce the rest of the texts co-authors.  While they are only contributing 
-corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out
-in the text so far.  Greg has always been there for me.  He has tracked my LibTom projects since their inception and even
-sent cheques to help pay tuition from time to time.  His background has provided a wonderful source to bounce ideas off
-of and improve the quality of my writing.  Mads is another fellow who has just ``been there''.  I don't even recall what
-his interest in the LibTom projects is but I'm definitely glad he has been around.  His ability to catch logical errors
-in my written English have saved me on several occasions to say the least.
-
-What to expect next?  Well this is still a rough draft.  I've only had the chance to update a few chapters.  However, I've
-been getting the feeling that people are starting to use my text and I owe them some updated material.  My current tenative
-plan is to edit one chapter every two weeks starting January 4th.  It seems insane but my lower course load at college
-should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
-people who will take it.
+\chapter*{Prefaces}
+When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.  
+They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''  
+Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which 
+perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
+others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give 
+back to society in the form of tools and knowledge that can help others in their endeavours.
+
+I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
+code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
+explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
+itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
+of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
+from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
+
+This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
+of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
+length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
+comprises literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg 
+were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to 
+continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.  
+
+To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
+honour your kind gestures with this project.
+
+Open Source.  Open Academia.  Open Minds.
 
 \begin{flushright} Tom St Denis \end{flushright}
 
@@ -302,7 +268,7 @@ and fast modular inversion, which we consider practical oversights.  These optim
 any form of useful performance in non-trivial applications.  
 
 To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
-package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used 
+package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.com}} package is used 
 to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field 
 tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text 
 discusses a very large portion of the inner workings of the library.
@@ -822,32 +788,6 @@ decrementally.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_init.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* init a new mp_int */
-018   int mp_init (mp_int * a)
-019   \{
-020     int i;
-021   
-022     /* allocate memory required and clear it */
-023     a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
-024     if (a->dp == NULL) \{
-025       return MP_MEM;
-026     \}
-027   
-028     /* set the digits to zero */
-029     for (i = 0; i < MP_PREC; i++) \{
-030         a->dp[i] = 0;
-031     \}
-032   
-033     /* set the used to zero, allocated digits to the default precision
-034      * and sign to positive */
-035     a->used  = 0;
-036     a->alloc = MP_PREC;
-037     a->sign  = MP_ZPOS;
-038   
-039     return MP_OKAY;
-040   \}
-041   #endif
 \end{alltt}
 \end{small}
 
@@ -855,7 +795,7 @@ One immediate observation of this initializtion function is that it does not ret
 is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The 
 call to mp\_init() is used only to initialize the members of the structure to a known default state.  
 
-Here we see (line 23) the memory allocation is performed first.  This allows us to exit cleanly and quickly
+Here we see (line 24) the memory allocation is performed first.  This allows us to exit cleanly and quickly
 if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
 was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
 but a macro defined in ``tommath.h``.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
@@ -863,11 +803,11 @@ memory allocation routine.
 
 In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
 accomplished by using calloc() instead of malloc().  However,  to correctly initialize a integer type to a given value in a 
-portable fashion you have to actually assign the value.  The for loop (line 29) performs this required
+portable fashion you have to actually assign the value.  The for loop (line 30) performs this required
 operation.
 
 After the memory has been successfully initialized the remainder of the members are initialized 
-(lines 33 through 34) to their respective default states.  At this point the algorithm has succeeded and
+(lines 34 through 35) to their respective default states.  At this point the algorithm has succeeded and
 a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the 
 mp\_int structure has been properly initialized and is safe to use with other functions within the library.  
 
@@ -912,45 +852,21 @@ with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp
 \hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* clear one (frees)  */
-018   void
-019   mp_clear (mp_int * a)
-020   \{
-021     int i;
-022   
-023     /* only do anything if a hasn't been freed previously */
-024     if (a->dp != NULL) \{
-025       /* first zero the digits */
-026       for (i = 0; i < a->used; i++) \{
-027           a->dp[i] = 0;
-028       \}
-029   
-030       /* free ram */
-031       XFREE(a->dp);
-032   
-033       /* reset members to make debugging easier */
-034       a->dp    = NULL;
-035       a->alloc = a->used = 0;
-036       a->sign  = MP_ZPOS;
-037     \}
-038   \}
-039   #endif
 \end{alltt}
 \end{small}
 
-The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line 24)
+The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line 25)
 checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
 \textbf{NULL} in which case the if statement will evaluate to true.
 
-The digits of the mp\_int are cleared by the for loop (line 26) which assigns a zero to every digit.  Similar to mp\_init()
+The digits of the mp\_int are cleared by the for loop (line 27) which assigns a zero to every digit.  Similar to mp\_init()
 the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.  
 
 The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
 a standard C library function.  In this case the free() function.  Since free() only deallocates the memory the pointer
-still has to be reset to \textbf{NULL} manually (line 34).  
+still has to be reset to \textbf{NULL} manually (line 35).  
 
-Now that the digits have been cleared and deallocated the other members are set to their final values (lines 35 and 36).
+Now that the digits have been cleared and deallocated the other members are set to their final values (lines 36 and 37).
 
 \section{Maintenance Algorithms}
 
@@ -1005,52 +921,15 @@ assumed to contain undefined values they are initially set to zero.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* grow as required */
-018   int mp_grow (mp_int * a, int size)
-019   \{
-020     int     i;
-021     mp_digit *tmp;
-022   
-023     /* if the alloc size is smaller alloc more ram */
-024     if (a->alloc < size) \{
-025       /* ensure there are always at least MP_PREC digits extra on top */
-026       size += (MP_PREC * 2) - (size % MP_PREC);
-027   
-028       /* reallocate the array a->dp
-029        *
-030        * We store the return in a temporary variable
-031        * in case the operation failed we don't want
-032        * to overwrite the dp member of a.
-033        */
-034       tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
-035       if (tmp == NULL) \{
-036         /* reallocation failed but "a" is still valid [can be freed] */
-037         return MP_MEM;
-038       \}
-039   
-040       /* reallocation succeeded so set a->dp */
-041       a->dp = tmp;
-042   
-043       /* zero excess digits */
-044       i        = a->alloc;
-045       a->alloc = size;
-046       for (; i < a->alloc; i++) \{
-047         a->dp[i] = 0;
-048       \}
-049     \}
-050     return MP_OKAY;
-051   \}
-052   #endif
 \end{alltt}
 \end{small}
 
-A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 23) checks
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 24) checks
 if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
 the function skips the re-allocation part thus saving time.
 
 When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
-padded upwards to 2nd multiple of \textbf{MP\_PREC} larger than \textbf{alloc} (line 26).  The XREALLOC function is used
+padded upwards to the 2nd multiple of \textbf{MP\_PREC} larger than \textbf{alloc} (line 25).  The XREALLOC function is used
 to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
 function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
 the re-allocation.  All	that is left is to clear the newly allocated digits and return.
@@ -1102,45 +981,17 @@ correct no further memory re-allocations are required to work with the mp\_int.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* init an mp_init for a given size */
-018   int mp_init_size (mp_int * a, int size)
-019   \{
-020     int x;
-021   
-022     /* pad size so there are always extra digits */
-023     size += (MP_PREC * 2) - (size % MP_PREC);    
-024     
-025     /* alloc mem */
-026     a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
-027     if (a->dp == NULL) \{
-028       return MP_MEM;
-029     \}
-030   
-031     /* set the members */
-032     a->used  = 0;
-033     a->alloc = size;
-034     a->sign  = MP_ZPOS;
-035   
-036     /* zero the digits */
-037     for (x = 0; x < size; x++) \{
-038         a->dp[x] = 0;
-039     \}
-040   
-041     return MP_OKAY;
-042   \}
-043   #endif
 \end{alltt}
 \end{small}
 
-The number of digits $b$ requested is padded (line 23) by first augmenting it to the next multiple of 
+The number of digits $b$ requested is padded (line 24) by first augmenting it to the next multiple of 
 \textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result.  If the memory can be successfully allocated the 
 mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be 
-returned (line 28).  
+returned (line 29).  
 
 The digits are allocated and set to zero at the same time with the calloc() function (line @25,XCALLOC@).  The 
 \textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set 
-to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines 32, 33 and 34).  If the function 
+to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines 33, 34 and 35).  If the function 
 returns succesfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the 
 functions to work with.
 
@@ -1178,45 +1029,6 @@ initialization which allows for quick recovery from runtime errors.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_init\_multi.c
 \vspace{-3mm}
 \begin{alltt}
-016   #include <stdarg.h>
-017   
-018   int mp_init_multi(mp_int *mp, ...) 
-019   \{
-020       mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
-021       int n = 0;                 /* Number of ok inits */
-022       mp_int* cur_arg = mp;
-023       va_list args;
-024   
-025       va_start(args, mp);        /* init args to next argument from caller */
-026       while (cur_arg != NULL) \{
-027           if (mp_init(cur_arg) != MP_OKAY) \{
-028               /* Oops - error! Back-track and mp_clear what we already
-029                  succeeded in init-ing, then return error.
-030               */
-031               va_list clean_args;
-032               
-033               /* end the current list */
-034               va_end(args);
-035               
-036               /* now start cleaning up */            
-037               cur_arg = mp;
-038               va_start(clean_args, mp);
-039               while (n--) \{
-040                   mp_clear(cur_arg);
-041                   cur_arg = va_arg(clean_args, mp_int*);
-042               \}
-043               va_end(clean_args);
-044               res = MP_MEM;
-045               break;
-046           \}
-047           n++;
-048           cur_arg = va_arg(args, mp_int*);
-049       \}
-050       va_end(args);
-051       return res;                /* Assumed ok, if error flagged above. */
-052   \}
-053   
-054   #endif
 \end{alltt}
 \end{small}
 
@@ -1226,8 +1038,8 @@ structures in an actual C array they are simply passed as arguments to the funct
 appended on the right.  
 
 The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function.  A count
-$n$ of succesfully initialized mp\_int structures is maintained (line 47) such that if a failure does occur,
-the algorithm can backtrack and free the previously initialized structures (lines 27 to 46).  
+$n$ of succesfully initialized mp\_int structures is maintained (line 48) such that if a failure does occur,
+the algorithm can backtrack and free the previously initialized structures (lines 28 to 47).  
 
 
 \subsection{Clamping Excess Digits}
@@ -1278,37 +1090,13 @@ when all of the digits are zero to ensure that the mp\_int is valid at all times
 \hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* trim unused digits 
-018    *
-019    * This is used to ensure that leading zero digits are
-020    * trimed and the leading "used" digit will be non-zero
-021    * Typically very fast.  Also fixes the sign if there
-022    * are no more leading digits
-023    */
-024   void
-025   mp_clamp (mp_int * a)
-026   \{
-027     /* decrease used while the most significant digit is
-028      * zero.
-029      */
-030     while (a->used > 0 && a->dp[a->used - 1] == 0) \{
-031       --(a->used);
-032     \}
-033   
-034     /* reset the sign flag if used == 0 */
-035     if (a->used == 0) \{
-036       a->sign = MP_ZPOS;
-037     \}
-038   \}
-039   #endif
 \end{alltt}
 \end{small}
 
-Note on line 27 how to test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
+Note on line 28 how to test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
 language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
 important since if the \textbf{used} is zero the test on the right would fetch below the array.  That is obviously 
-undesirable.  The parenthesis on line 30 is used to make sure the \textbf{used} count is decremented and not
+undesirable.  The parenthesis on line 31 is used to make sure the \textbf{used} count is decremented and not
 the pointer ``a''.  
 
 \section*{Exercises}
@@ -1391,69 +1179,21 @@ implement the pseudo-code.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* copy, b = a */
-018   int
-019   mp_copy (mp_int * a, mp_int * b)
-020   \{
-021     int     res, n;
-022   
-023     /* if dst == src do nothing */
-024     if (a == b) \{
-025       return MP_OKAY;
-026     \}
-027   
-028     /* grow dest */
-029     if (b->alloc < a->used) \{
-030        if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
-031           return res;
-032        \}
-033     \}
-034   
-035     /* zero b and copy the parameters over */
-036     \{
-037       register mp_digit *tmpa, *tmpb;
-038   
-039       /* pointer aliases */
-040   
-041       /* source */
-042       tmpa = a->dp;
-043   
-044       /* destination */
-045       tmpb = b->dp;
-046   
-047       /* copy all the digits */
-048       for (n = 0; n < a->used; n++) \{
-049         *tmpb++ = *tmpa++;
-050       \}
-051   
-052       /* clear high digits */
-053       for (; n < b->used; n++) \{
-054         *tmpb++ = 0;
-055       \}
-056     \}
-057   
-058     /* copy used count and sign */
-059     b->used = a->used;
-060     b->sign = a->sign;
-061     return MP_OKAY;
-062   \}
-063   #endif
 \end{alltt}
 \end{small}
 
 Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output
 mp\_int structures passed to a function are one and the same.  For this case it is optimal to return immediately without 
-copying digits (line 24).  
+copying digits (line 25).  
 
 The mp\_int $b$ must have enough digits to accomodate the used digits of the mp\_int $a$.  If $b.alloc$ is less than
-$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines 29 to 33).  In order to
+$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines 30 to 33).  In order to
 simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits
-of the mp\_ints $a$ and $b$ respectively.  These aliases (lines 42 and 45) allow the compiler to access the digits without first dereferencing the
+of the mp\_ints $a$ and $b$ respectively.  These aliases (lines 43 and 46) allow the compiler to access the digits without first dereferencing the
 mp\_int pointers and then subsequently the pointer to the digits.  
 
-After the aliases are established the digits from $a$ are copied into $b$ (lines 48 to 50) and then the excess 
-digits of $b$ are set to zero (lines 53 to 55).  Both ``for'' loops make use of the pointer aliases and in 
+After the aliases are established the digits from $a$ are copied into $b$ (lines 49 to 51) and then the excess 
+digits of $b$ are set to zero (lines 54 to 56).  Both ``for'' loops make use of the pointer aliases and in 
 fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits.  This optimization 
 allows the alias to stay in a machine register fairly easy between the two loops.
 
@@ -1541,18 +1281,6 @@ such this algorithm will perform two operations in one step.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* creates "a" then copies b into it */
-018   int mp_init_copy (mp_int * a, mp_int * b)
-019   \{
-020     int     res;
-021   
-022     if ((res = mp_init (a)) != MP_OKAY) \{
-023       return res;
-024     \}
-025     return mp_copy (b, a);
-026   \}
-027   #endif
 \end{alltt}
 \end{small}
 
@@ -1588,16 +1316,6 @@ This algorithm simply resets a mp\_int to the default state.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* set to zero */
-018   void
-019   mp_zero (mp_int * a)
-020   \{
-021     a->sign = MP_ZPOS;
-022     a->used = 0;
-023     memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
-024   \}
-025   #endif
 \end{alltt}
 \end{small}
 
@@ -1609,7 +1327,7 @@ After the function is completed, all of the digits are zeroed, the \textbf{used}
 With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
 the absolute value of an mp\_int.
 
-\newpage\begin{figure}[here]
+\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_abs}. \\
@@ -1636,32 +1354,12 @@ logic to handle it.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* b = |a| 
-018    *
-019    * Simple function copies the input and fixes the sign to positive
-020    */
-021   int
-022   mp_abs (mp_int * a, mp_int * b)
-023   \{
-024     int     res;
-025   
-026     /* copy a to b */
-027     if (a != b) \{
-028        if ((res = mp_copy (a, b)) != MP_OKAY) \{
-029          return res;
-030        \}
-031     \}
-032   
-033     /* force the sign of b to positive */
-034     b->sign = MP_ZPOS;
-035   
-036     return MP_OKAY;
-037   \}
-038   #endif
 \end{alltt}
 \end{small}
 
+This fairly trivial algorithm first eliminates non--required duplications (line 28) and then sets the
+\textbf{sign} flag to \textbf{MP\_ZPOS}.
+
 \subsection{Integer Negation}
 With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
 the negative of an mp\_int input.
@@ -1697,28 +1395,18 @@ zero as negative.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* b = -a */
-018   int mp_neg (mp_int * a, mp_int * b)
-019   \{
-020     int     res;
-021     if ((res = mp_copy (a, b)) != MP_OKAY) \{
-022       return res;
-023     \}
-024     if (mp_iszero(b) != MP_YES) \{
-025        b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-026     \}
-027     return MP_OKAY;
-028   \}
-029   #endif
 \end{alltt}
 \end{small}
 
+Like mp\_abs() this function avoids non--required duplications (line 22) and then sets the sign.  We
+have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
+then the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
+
 \section{Small Constants}
 \subsection{Setting Small Constants}
 Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
 
-\begin{figure}[here]
+\newpage\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_set}. \\
@@ -1745,23 +1433,17 @@ single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adj
 \hspace{-5.1mm}{\bf File}: bn\_mp\_set.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* set to a digit */
-018   void mp_set (mp_int * a, mp_digit b)
-019   \{
-020     mp_zero (a);
-021     a->dp[0] = b & MP_MASK;
-022     a->used  = (a->dp[0] != 0) ? 1 : 0;
-023   \}
-024   #endif
 \end{alltt}
 \end{small}
 
-Line 20 calls mp\_zero() to clear the mp\_int and reset the sign.  Line 21 copies the digit 
-into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
-reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
-$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line 22 will set the \textbf{used} member with respect to the 
-digit actually set. This function will always make the integer positive.
+First we zero (line 21) the mp\_int to make sure that the other members are initialized for a 
+small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
+is zero.  Next we set the digit and reduce it modulo $\beta$ (line 22).  After this step we have to 
+check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
+to zero.
+
+We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with 
+$2^k - 1$ will perform the same operation.
 
 One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
 this function should take that into account.  Only trivially small constants can be set using this function.
@@ -1803,41 +1485,13 @@ Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorith
 \hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* set a 32-bit const */
-018   int mp_set_int (mp_int * a, unsigned long b)
-019   \{
-020     int     x, res;
-021   
-022     mp_zero (a);
-023     
-024     /* set four bits at a time */
-025     for (x = 0; x < 8; x++) \{
-026       /* shift the number up four bits */
-027       if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{
-028         return res;
-029       \}
-030   
-031       /* OR in the top four bits of the source */
-032       a->dp[0] |= (b >> 28) & 15;
-033   
-034       /* shift the source up to the next four bits */
-035       b <<= 4;
-036   
-037       /* ensure that digits are not clamped off */
-038       a->used += 1;
-039     \}
-040     mp_clamp (a);
-041     return MP_OKAY;
-042   \}
-043   #endif
 \end{alltt}
 \end{small}
 
 This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
-addition on line 38 ensures that the newly added in bits are added to the number of digits.  While it may not 
-seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27 
-as well as the  call to mp\_clamp() on line 40.  Both functions will clamp excess leading digits which keeps 
+addition on line 39 ensures that the newly added in bits are added to the number of digits.  While it may not 
+seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 28 
+as well as the  call to mp\_clamp() on line 41.  Both functions will clamp excess leading digits which keeps 
 the number of used digits low.
 
 \section{Comparisons}
@@ -1898,48 +1552,15 @@ the zero'th digit.  If after all of the digits have been compared, no difference
 \hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* compare maginitude of two ints (unsigned) */
-018   int mp_cmp_mag (mp_int * a, mp_int * b)
-019   \{
-020     int     n;
-021     mp_digit *tmpa, *tmpb;
-022   
-023     /* compare based on # of non-zero digits */
-024     if (a->used > b->used) \{
-025       return MP_GT;
-026     \}
-027     
-028     if (a->used < b->used) \{
-029       return MP_LT;
-030     \}
-031   
-032     /* alias for a */
-033     tmpa = a->dp + (a->used - 1);
-034   
-035     /* alias for b */
-036     tmpb = b->dp + (a->used - 1);
-037   
-038     /* compare based on digits  */
-039     for (n = 0; n < a->used; ++n, --tmpa, --tmpb) \{
-040       if (*tmpa > *tmpb) \{
-041         return MP_GT;
-042       \}
-043   
-044       if (*tmpa < *tmpb) \{
-045         return MP_LT;
-046       \}
-047     \}
-048     return MP_EQ;
-049   \}
-050   #endif
 \end{alltt}
 \end{small}
 
-The two if statements on lines 24 and 28 compare the number of digits in the two inputs.  These two are performed before all of the digits
-are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
-without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
-array of digits.
+The two if statements (lines 25 and 29) compare the number of digits in the two inputs.  These two are 
+performed before all of the digits are compared since it is a very cheap test to perform and can potentially save 
+considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be 
+smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
+
+
 
 \subsection{Signed Comparisons}
 Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
@@ -1974,35 +1595,12 @@ $\vert a \vert < \vert b \vert$.  Step number four will compare the two when the
 \hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* compare two ints (signed)*/
-018   int
-019   mp_cmp (mp_int * a, mp_int * b)
-020   \{
-021     /* compare based on sign */
-022     if (a->sign != b->sign) \{
-023        if (a->sign == MP_NEG) \{
-024           return MP_LT;
-025        \} else \{
-026           return MP_GT;
-027        \}
-028     \}
-029     
-030     /* compare digits */
-031     if (a->sign == MP_NEG) \{
-032        /* if negative compare opposite direction */
-033        return mp_cmp_mag(b, a);
-034     \} else \{
-035        return mp_cmp_mag(a, b);
-036     \}
-037   \}
-038   #endif
 \end{alltt}
 \end{small}
 
-The two if statements on lines 22 and 23 perform the initial sign comparison.  If the signs are not the equal then which ever
-has the positive sign is larger.   At line 31, the inputs are compared based on magnitudes.  If the signs were both negative then 
-the unsigned comparison is performed in the opposite direction (\textit{line 33}).  Otherwise, the signs are assumed to 
+The two if statements (lines 23 and 24) perform the initial sign comparison.  If the signs are not equal then whichever
+has the positive sign is larger.   The inputs are compared (line 32) based on magnitudes.  If the signs were both 
+negative then the unsigned comparison is performed in the opposite direction (line 34).  Otherwise, the signs are assumed to 
 be both positive and a forward direction unsigned comparison is performed.
 
 \section*{Exercises}
@@ -2126,111 +1724,24 @@ The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are
 \hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* low level addition, based on HAC pp.594, Algorithm 14.7 */
-018   int
-019   s_mp_add (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     mp_int *x;
-022     int     olduse, res, min, max;
-023   
-024     /* find sizes, we let |a| <= |b| which means we have to sort
-025      * them.  "x" will point to the input with the most digits
-026      */
-027     if (a->used > b->used) \{
-028       min = b->used;
-029       max = a->used;
-030       x = a;
-031     \} else \{
-032       min = a->used;
-033       max = b->used;
-034       x = b;
-035     \}
-036   
-037     /* init result */
-038     if (c->alloc < max + 1) \{
-039       if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{
-040         return res;
-041       \}
-042     \}
-043   
-044     /* get old used digit count and set new one */
-045     olduse = c->used;
-046     c->used = max + 1;
-047   
-048     \{
-049       register mp_digit u, *tmpa, *tmpb, *tmpc;
-050       register int i;
-051   
-052       /* alias for digit pointers */
-053   
-054       /* first input */
-055       tmpa = a->dp;
-056   
-057       /* second input */
-058       tmpb = b->dp;
-059   
-060       /* destination */
-061       tmpc = c->dp;
-062   
-063       /* zero the carry */
-064       u = 0;
-065       for (i = 0; i < min; i++) \{
-066         /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-067         *tmpc = *tmpa++ + *tmpb++ + u;
-068   
-069         /* U = carry bit of T[i] */
-070         u = *tmpc >> ((mp_digit)DIGIT_BIT);
-071   
-072         /* take away carry bit from T[i] */
-073         *tmpc++ &= MP_MASK;
-074       \}
-075   
-076       /* now copy higher words if any, that is in A+B 
-077        * if A or B has more digits add those in 
-078        */
-079       if (min != max) \{
-080         for (; i < max; i++) \{
-081           /* T[i] = X[i] + U */
-082           *tmpc = x->dp[i] + u;
-083   
-084           /* U = carry bit of T[i] */
-085           u = *tmpc >> ((mp_digit)DIGIT_BIT);
-086   
-087           /* take away carry bit from T[i] */
-088           *tmpc++ &= MP_MASK;
-089         \}
-090       \}
-091   
-092       /* add carry */
-093       *tmpc++ = u;
-094   
-095       /* clear digits above oldused */
-096       for (i = c->used; i < olduse; i++) \{
-097         *tmpc++ = 0;
-098       \}
-099     \}
-100   
-101     mp_clamp (c);
-102     return MP_OKAY;
-103   \}
-104   #endif
 \end{alltt}
 \end{small}
 
-Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
-mp\_int assigned to the largest input, in effect it is a local alias.  Lines 37 to 42 ensure that the destination is grown to 
-accomodate the result of the addition. 
+We first sort (lines 28 to 36) the inputs based on magnitude and determine the $min$ and $max$ variables.
+Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
+grow the destination (lines 38 to 42) to ensure that it can accommodate the result of the addition. 
 
 Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
-lines 55, 58 and 61 represent the two inputs and destination variables respectively.  These aliases are used to ensure the
+lines 56, 59 and 62 represent the two inputs and destination variables respectively.  These aliases are used to ensure the
 compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
 
-The initial carry $u$ is cleared on line 64, note that $u$ is of type mp\_digit which ensures type compatibility within the 
-implementation.  The initial addition loop begins on line 65 and ends on line 74.  Similarly the conditional addition loop
-begins on line 80 and ends on line 90.  The addition is finished with the final carry being stored in $tmpc$ on line 93.  
-Note the ``++'' operator on the same line.  After line 93 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop on lines 96 to 99 which set any old upper digits to zero.
+The initial carry $u$ will be cleared (line 65), note that $u$ is of type mp\_digit which ensures type 
+compatibility within the implementation.  The initial addition (lines 66 to 75) adds digits from
+both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
+(lines 81 to 90) adds the remaining digits from the larger of the two inputs.  The addition is finished 
+with the final carry being stored in $tmpc$ (line 94).  Note the ``++'' operator within the same expression.
+After line 94, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop (lines 97 to 99) which sets any old upper digits to zero.
 
 \subsection{Low Level Subtraction}
 The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principle difference is that the
@@ -2245,7 +1756,7 @@ this algorithm we will assume that the variable $\gamma$ represents the number o
 mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
 
 For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
 
 \newpage\begin{figure}[!here]
 \begin{center}
@@ -2314,93 +1825,26 @@ If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and cop
 \hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
-018   int
-019   s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     int     olduse, res, min, max;
-022   
-023     /* find sizes */
-024     min = b->used;
-025     max = a->used;
-026   
-027     /* init result */
-028     if (c->alloc < max) \{
-029       if ((res = mp_grow (c, max)) != MP_OKAY) \{
-030         return res;
-031       \}
-032     \}
-033     olduse = c->used;
-034     c->used = max;
-035   
-036     \{
-037       register mp_digit u, *tmpa, *tmpb, *tmpc;
-038       register int i;
-039   
-040       /* alias for digit pointers */
-041       tmpa = a->dp;
-042       tmpb = b->dp;
-043       tmpc = c->dp;
-044   
-045       /* set carry to zero */
-046       u = 0;
-047       for (i = 0; i < min; i++) \{
-048         /* T[i] = A[i] - B[i] - U */
-049         *tmpc = *tmpa++ - *tmpb++ - u;
-050   
-051         /* U = carry bit of T[i]
-052          * Note this saves performing an AND operation since
-053          * if a carry does occur it will propagate all the way to the
-054          * MSB.  As a result a single shift is enough to get the carry
-055          */
-056         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
-057   
-058         /* Clear carry from T[i] */
-059         *tmpc++ &= MP_MASK;
-060       \}
-061   
-062       /* now copy higher words if any, e.g. if A has more digits than B  */
-063       for (; i < max; i++) \{
-064         /* T[i] = A[i] - U */
-065         *tmpc = *tmpa++ - u;
-066   
-067         /* U = carry bit of T[i] */
-068         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
-069   
-070         /* Clear carry from T[i] */
-071         *tmpc++ &= MP_MASK;
-072       \}
-073   
-074       /* clear digits above used (since we may not have grown result above) */
-      
-075       for (i = c->used; i < olduse; i++) \{
-076         *tmpc++ = 0;
-077       \}
-078     \}
-079   
-080     mp_clamp (c);
-081     return MP_OKAY;
-082   \}
-083   
-084   #endif
 \end{alltt}
 \end{small}
 
-Line 24 and 25 perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
-used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines 41, 42 and 43 initialize the aliases for 
-$a$, $b$ and $c$ respectively.
+Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded 
+(lines 25 and 26).  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used 
+within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
+(lines 42, 43 and 44) for $a$, $b$ and $c$ respectively.
 
-The first subtraction loop occurs on lines 46 through 60.  The theory behind the subtraction loop is exactly the same as that for
-the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
-(\textit{see line 56}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
-the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
-occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
-shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
-twos compliment machines which is a safe assumption to make.
+The first subtraction loop (lines 47 through 61) subtracts digits from both inputs until the smaller of
+the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward'' 
+method of extracting the carry (line 57).  The traditional method for extracting the carry would be to shift 
+by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of 
+the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry 
+extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the 
+most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This 
+optimization only works on two's complement machines which is a safe assumption to make.
 
-If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines 63 through 72}) is required to propagate the carry through
-$a$ and copy the result to $c$.  
+If $a$ has a larger magnitude than $b$ an additional loop (lines 64 through 73) is required to propagate 
+the carry through $a$ and copy the result to $c$.  
 
 \subsection{High Level Addition}
 Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
@@ -2483,39 +1927,6 @@ within algorithm s\_mp\_add will force $-0$ to become $0$.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_add.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* high level addition (handles signs) */
-018   int mp_add (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     int     sa, sb, res;
-021   
-022     /* get sign of both inputs */
-023     sa = a->sign;
-024     sb = b->sign;
-025   
-026     /* handle two cases, not four */
-027     if (sa == sb) \{
-028       /* both positive or both negative */
-029       /* add their magnitudes, copy the sign */
-030       c->sign = sa;
-031       res = s_mp_add (a, b, c);
-032     \} else \{
-033       /* one positive, the other negative */
-034       /* subtract the one with the greater magnitude from */
-035       /* the one of the lesser magnitude.  The result gets */
-036       /* the sign of the one with the greater magnitude. */
-037       if (mp_cmp_mag (a, b) == MP_LT) \{
-038         c->sign = sb;
-039         res = s_mp_sub (b, a, c);
-040       \} else \{
-041         c->sign = sa;
-042         res = s_mp_sub (a, b, c);
-043       \}
-044     \}
-045     return res;
-046   \}
-047   
-048   #endif
 \end{alltt}
 \end{small}
 
@@ -2589,50 +2000,11 @@ algorithm from producing $-a - -a = -0$ as a result.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* high level subtraction (handles signs) */
-018   int
-019   mp_sub (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     int     sa, sb, res;
-022   
-023     sa = a->sign;
-024     sb = b->sign;
-025   
-026     if (sa != sb) \{
-027       /* subtract a negative from a positive, OR */
-028       /* subtract a positive from a negative. */
-029       /* In either case, ADD their magnitudes, */
-030       /* and use the sign of the first number. */
-031       c->sign = sa;
-032       res = s_mp_add (a, b, c);
-033     \} else \{
-034       /* subtract a positive from a positive, OR */
-035       /* subtract a negative from a negative. */
-036       /* First, take the difference between their */
-037       /* magnitudes, then... */
-038       if (mp_cmp_mag (a, b) != MP_LT) \{
-039         /* Copy the sign from the first */
-040         c->sign = sa;
-041         /* The first has a larger or equal magnitude */
-042         res = s_mp_sub (a, b, c);
-043       \} else \{
-044         /* The result has the *opposite* sign from */
-045         /* the first number. */
-046         c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-047         /* The second has a larger magnitude */
-048         res = s_mp_sub (b, a, c);
-049       \}
-050     \}
-051     return res;
-052   \}
-053   
-054   #endif
 \end{alltt}
 \end{small}
 
 Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
-and forward it to the end of the function.  On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
+and forward it to the end of the function.  On line 39 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
 ``greater than or equal to'' comparison.  
 
 \section{Bit and Digit Shifting}
@@ -2700,73 +2072,11 @@ Step 8 clears any leading digits of $b$ in case it originally had a larger magni
 \hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* b = a*2 */
-018   int mp_mul_2(mp_int * a, mp_int * b)
-019   \{
-020     int     x, res, oldused;
-021   
-022     /* grow to accomodate result */
-023     if (b->alloc < a->used + 1) \{
-024       if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{
-025         return res;
-026       \}
-027     \}
-028   
-029     oldused = b->used;
-030     b->used = a->used;
-031   
-032     \{
-033       register mp_digit r, rr, *tmpa, *tmpb;
-034   
-035       /* alias for source */
-036       tmpa = a->dp;
-037       
-038       /* alias for dest */
-039       tmpb = b->dp;
-040   
-041       /* carry */
-042       r = 0;
-043       for (x = 0; x < a->used; x++) \{
-044       
-045         /* get what will be the *next* carry bit from the 
-046          * MSB of the current digit 
-047          */
-048         rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
-049         
-050         /* now shift up this digit, add in the carry [from the previous] */
-051         *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
-052         
-053         /* copy the carry that would be from the source 
-054          * digit into the next iteration 
-055          */
-056         r = rr;
-057       \}
-058   
-059       /* new leading digit? */
-060       if (r != 0) \{
-061         /* add a MSB which is always 1 at this point */
-062         *tmpb = 1;
-063         ++(b->used);
-064       \}
-065   
-066       /* now zero any excess digits on the destination 
-067        * that we didn't write to 
-068        */
-069       tmpb = b->dp + b->used;
-070       for (x = b->used; x < oldused; x++) \{
-071         *tmpb++ = 0;
-072       \}
-073     \}
-074     b->sign = a->sign;
-075     return MP_OKAY;
-076   \}
-077   #endif
 \end{alltt}
 \end{small}
 
 This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
-is the use of the logical shift operator on line 51 to perform a single precision doubling.  
+is the use of the logical shift operator on line 52 to perform a single precision doubling.  
 
 \subsection{Division by Two}
 A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
@@ -2814,54 +2124,6 @@ least significant bit not the most significant bit.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* b = a/2 */
-018   int mp_div_2(mp_int * a, mp_int * b)
-019   \{
-020     int     x, res, oldused;
-021   
-022     /* copy */
-023     if (b->alloc < a->used) \{
-024       if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
-025         return res;
-026       \}
-027     \}
-028   
-029     oldused = b->used;
-030     b->used = a->used;
-031     \{
-032       register mp_digit r, rr, *tmpa, *tmpb;
-033   
-034       /* source alias */
-035       tmpa = a->dp + b->used - 1;
-036   
-037       /* dest alias */
-038       tmpb = b->dp + b->used - 1;
-039   
-040       /* carry */
-041       r = 0;
-042       for (x = b->used - 1; x >= 0; x--) \{
-043         /* get the carry for the next iteration */
-044         rr = *tmpa & 1;
-045   
-046         /* shift the current digit, add in carry and store */
-047         *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
-048   
-049         /* forward carry to next iteration */
-050         r = rr;
-051       \}
-052   
-053       /* zero excess digits */
-054       tmpb = b->dp + b->used;
-055       for (x = b->used; x < oldused; x++) \{
-056         *tmpb++ = 0;
-057       \}
-058     \}
-059     b->sign = a->sign;
-060     mp_clamp (b);
-061     return MP_OKAY;
-062   \}
-063   #endif
 \end{alltt}
 \end{small}
 
@@ -2935,60 +2197,14 @@ step 8 sets the lower $b$ digits to zero.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* shift left a certain amount of digits */
-018   int mp_lshd (mp_int * a, int b)
-019   \{
-020     int     x, res;
-021   
-022     /* if its less than zero return */
-023     if (b <= 0) \{
-024       return MP_OKAY;
-025     \}
-026   
-027     /* grow to fit the new digits */
-028     if (a->alloc < a->used + b) \{
-029        if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{
-030          return res;
-031        \}
-032     \}
-033   
-034     \{
-035       register mp_digit *top, *bottom;
-036   
-037       /* increment the used by the shift amount then copy upwards */
-038       a->used += b;
-039   
-040       /* top */
-041       top = a->dp + a->used - 1;
-042   
-043       /* base */
-044       bottom = a->dp + a->used - 1 - b;
-045   
-046       /* much like mp_rshd this is implemented using a sliding window
-047        * except the window goes the otherway around.  Copying from
-048        * the bottom to the top.  see bn_mp_rshd.c for more info.
-049        */
-050       for (x = a->used - 1; x >= b; x--) \{
-051         *top-- = *bottom--;
-052       \}
-053   
-054       /* zero the lower digits */
-055       top = a->dp;
-056       for (x = 0; x < b; x++) \{
-057         *top++ = 0;
-058       \}
-059     \}
-060     return MP_OKAY;
-061   \}
-062   #endif
 \end{alltt}
 \end{small}
 
-The if statement on line 23 ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
-the copy loop begins.  This elminates the need for an additional variable in the for loop.  The variable $top$ on line 41 is an alias
-for the leading digit while $bottom$ on line 44 is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
-over the input.  
+The if statement (line 24) ensures that the $b$ variable is greater than zero since we do not interpret negative
+shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This eliminates 
+the need for an additional variable in the for loop.  The variable $top$ (line 42) is an alias
+for the leading digit while $bottom$ (line 45) is an alias for the trailing edge.  The aliases form a 
+window of exactly $b$ digits over the input.  
 
 \subsection{Division by $x$}
 
@@ -3040,64 +2256,12 @@ Once the window copy is complete the upper digits must be zeroed and the \textbf
 \hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* shift right a certain amount of digits */
-018   void mp_rshd (mp_int * a, int b)
-019   \{
-020     int     x;
-021   
-022     /* if b <= 0 then ignore it */
-023     if (b <= 0) \{
-024       return;
-025     \}
-026   
-027     /* if b > used then simply zero it and return */
-028     if (a->used <= b) \{
-029       mp_zero (a);
-030       return;
-031     \}
-032   
-033     \{
-034       register mp_digit *bottom, *top;
-035   
-036       /* shift the digits down */
-037   
-038       /* bottom */
-039       bottom = a->dp;
-040   
-041       /* top [offset into digits] */
-042       top = a->dp + b;
-043   
-044       /* this is implemented as a sliding window where 
-045        * the window is b-digits long and digits from 
-046        * the top of the window are copied to the bottom
-047        *
-048        * e.g.
-049   
-050        b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
-051                    /\symbol{92}                   |      ---->
-052                     \symbol{92}-------------------/      ---->
-053        */
-054       for (x = 0; x < (a->used - b); x++) \{
-055         *bottom++ = *top++;
-056       \}
-057   
-058       /* zero the top digits */
-059       for (; x < a->used; x++) \{
-060         *bottom++ = 0;
-061       \}
-062     \}
-063     
-064     /* remove excess digits */
-065     a->used -= b;
-066   \}
-067   #endif
 \end{alltt}
 \end{small}
 
-The only noteworthy element of this routine is the lack of a return type.  
-
--- Will update later to give it a return type...Tom
+The only noteworthy element of this routine is the lack of a return value since it cannot fail.  Like mp\_lshd() we
+form a sliding window except we copy in the other direction.  After the window copy (line 60) we then zero
+the upper digits of the input to make sure the result is correct.
 
 \section{Powers of Two}
 
@@ -3150,7 +2314,7 @@ left.
 
 After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
 required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
-Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
 variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
 
 This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
@@ -3160,75 +2324,18 @@ complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm
 \hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* shift left by a certain bit count */
-018   int mp_mul_2d (mp_int * a, int b, mp_int * c)
-019   \{
-020     mp_digit d;
-021     int      res;
-022   
-023     /* copy */
-024     if (a != c) \{
-025        if ((res = mp_copy (a, c)) != MP_OKAY) \{
-026          return res;
-027        \}
-028     \}
-029   
-030     if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) \{
-031        if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) \{
-032          return res;
-033        \}
-034     \}
-035   
-036     /* shift by as many digits in the bit count */
-037     if (b >= (int)DIGIT_BIT) \{
-038       if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{
-039         return res;
-040       \}
-041     \}
-042   
-043     /* shift any bit count < DIGIT_BIT */
-044     d = (mp_digit) (b % DIGIT_BIT);
-045     if (d != 0) \{
-046       register mp_digit *tmpc, shift, mask, r, rr;
-047       register int x;
-048   
-049       /* bitmask for carries */
-050       mask = (((mp_digit)1) << d) - 1;
-051   
-052       /* shift for msbs */
-053       shift = DIGIT_BIT - d;
-054   
-055       /* alias */
-056       tmpc = c->dp;
-057   
-058       /* carry */
-059       r    = 0;
-060       for (x = 0; x < c->used; x++) \{
-061         /* get the higher bits of the current word */
-062         rr = (*tmpc >> shift) & mask;
-063   
-064         /* shift the current word and OR in the carry */
-065         *tmpc = ((*tmpc << d) | r) & MP_MASK;
-066         ++tmpc;
-067   
-068         /* set the carry to the carry bits of the current word */
-069         r = rr;
-070       \}
-071       
-072       /* set final carry */
-073       if (r != 0) \{
-074          c->dp[(c->used)++] = r;
-075       \}
-076     \}
-077     mp_clamp (c);
-078     return MP_OKAY;
-079   \}
-080   #endif
 \end{alltt}
 \end{small}
 
-Notes to be revised when code is updated. -- Tom
+The shifting is performed in--place which means the first step (line 25) is to copy the input to the 
+destination.  We skip the call to mp\_copy() when the source and destination are the same mp\_int.  The destination then
+has to be grown (line 32) to accommodate the result.
+
+If the shift count $b$ is at least $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples 
+of $lg(\beta)$, leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits.  Inside the actual shift 
+loop (lines 46 to 76) we make use of pre--computed values $shift$ and $mask$.   These are used to
+extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a 
+chain between consecutive iterations to propagate the carry.  
 
 \subsection{Division by Power of Two}
 
@@ -3274,85 +2381,6 @@ by using algorithm mp\_mod\_2d.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* shift right by a certain bit count (store quotient in c, optional remaind
-      er in d) */
-018   int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
-019   \{
-020     mp_digit D, r, rr;
-021     int     x, res;
-022     mp_int  t;
-023   
-024   
-025     /* if the shift count is <= 0 then we do no work */
-026     if (b <= 0) \{
-027       res = mp_copy (a, c);
-028       if (d != NULL) \{
-029         mp_zero (d);
-030       \}
-031       return res;
-032     \}
-033   
-034     if ((res = mp_init (&t)) != MP_OKAY) \{
-035       return res;
-036     \}
-037   
-038     /* get the remainder */
-039     if (d != NULL) \{
-040       if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{
-041         mp_clear (&t);
-042         return res;
-043       \}
-044     \}
-045   
-046     /* copy */
-047     if ((res = mp_copy (a, c)) != MP_OKAY) \{
-048       mp_clear (&t);
-049       return res;
-050     \}
-051   
-052     /* shift by as many digits in the bit count */
-053     if (b >= (int)DIGIT_BIT) \{
-054       mp_rshd (c, b / DIGIT_BIT);
-055     \}
-056   
-057     /* shift any bit count < DIGIT_BIT */
-058     D = (mp_digit) (b % DIGIT_BIT);
-059     if (D != 0) \{
-060       register mp_digit *tmpc, mask, shift;
-061   
-062       /* mask */
-063       mask = (((mp_digit)1) << D) - 1;
-064   
-065       /* shift for lsb */
-066       shift = DIGIT_BIT - D;
-067   
-068       /* alias */
-069       tmpc = c->dp + (c->used - 1);
-070   
-071       /* carry */
-072       r = 0;
-073       for (x = c->used - 1; x >= 0; x--) \{
-074         /* get the lower  bits of this word in a temp */
-075         rr = *tmpc & mask;
-076   
-077         /* shift the current word and mix in the carry bits from the previous 
-      word */
-078         *tmpc = (*tmpc >> D) | (r << shift);
-079         --tmpc;
-080   
-081         /* set the carry to the carry bits of the current word found above */
-082         r = rr;
-083       \}
-084     \}
-085     mp_clamp (c);
-086     if (d != NULL) \{
-087       mp_exch (&t, d);
-088     \}
-089     mp_clear (&t);
-090     return MP_OKAY;
-091   \}
-092   #endif
 \end{alltt}
 \end{small}
 
@@ -3361,7 +2389,8 @@ ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The
 result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
 the quotient is obtained.
 
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
+the direction of the shifts.
 
 \subsection{Remainder of Division by Power of Two}
 
@@ -3406,47 +2435,16 @@ is copied to $b$, leading digits are removed and the remaining leading digit is
 \hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* calc a value mod 2**b */
-018   int
-019   mp_mod_2d (mp_int * a, int b, mp_int * c)
-020   \{
-021     int     x, res;
-022   
-023     /* if b is <= 0 then zero the int */
-024     if (b <= 0) \{
-025       mp_zero (c);
-026       return MP_OKAY;
-027     \}
-028   
-029     /* if the modulus is larger than the value than return */
-030     if (b >= (int) (a->used * DIGIT_BIT)) \{
-031       res = mp_copy (a, c);
-032       return res;
-033     \}
-034   
-035     /* copy */
-036     if ((res = mp_copy (a, c)) != MP_OKAY) \{
-037       return res;
-038     \}
-039   
-040     /* zero digits above the last digit of the modulus */
-041     for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x+
-      +) \{
-042       c->dp[x] = 0;
-043     \}
-044     /* clear the digit that is not completely outside/inside the modulus */
-045     c->dp[b / DIGIT_BIT] &=
-046       (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digi
-      t) 1));
-047     mp_clamp (c);
-048     return MP_OKAY;
-049   \}
-050   #endif
 \end{alltt}
 \end{small}
 
--- Add comments later, Tom.
+We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
+than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
+perform some work to produce the remainder.
+
+Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce 
+the number.  First we zero any digits above the last digit in $2^b$ (line 42).  Next we mask off the 
+excess upper bits of the leading digit (line 46) and then mp\_clamp().
 
 \section*{Exercises}
 \begin{tabular}{cl}
@@ -3606,106 +2604,48 @@ exceed the precision requested.
 \hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* multiplies |a| * |b| and only computes upto digs digits of result
-018    * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
-019    * many digits of output are created.
-020    */
-021   int
-022   s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-023   \{
-024     mp_int  t;
-025     int     res, pa, pb, ix, iy;
-026     mp_digit u;
-027     mp_word r;
-028     mp_digit tmpx, *tmpt, *tmpy;
-029   
-030     /* can we use the fast multiplier? */
-031     if (((digs) < MP_WARRAY) &&
-032         MIN (a->used, b->used) < 
-033             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
-034       return fast_s_mp_mul_digs (a, b, c, digs);
-035     \}
-036   
-037     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
-038       return res;
-039     \}
-040     t.used = digs;
-041   
-042     /* compute the digits of the product directly */
-043     pa = a->used;
-044     for (ix = 0; ix < pa; ix++) \{
-045       /* set the carry to zero */
-046       u = 0;
-047   
-048       /* limit ourselves to making digs digits of output */
-049       pb = MIN (b->used, digs - ix);
-050   
-051       /* setup some aliases */
-052       /* copy of the digit from a used within the nested loop */
-053       tmpx = a->dp[ix];
-054       
-055       /* an alias for the destination shifted ix places */
-056       tmpt = t.dp + ix;
-057       
-058       /* an alias for the digits of b */
-059       tmpy = b->dp;
-060   
-061       /* compute the columns of the output and propagate the carry */
-062       for (iy = 0; iy < pb; iy++) \{
-063         /* compute the column as a mp_word */
-064         r       = ((mp_word)*tmpt) +
-065                   ((mp_word)tmpx) * ((mp_word)*tmpy++) +
-066                   ((mp_word) u);
-067   
-068         /* the new column is the lower part of the result */
-069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-070   
-071         /* get the carry word from the result */
-072         u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-073       \}
-074       /* set carry if it is placed below digs */
-075       if (ix + iy < digs) \{
-076         *tmpt = u;
-077       \}
-078     \}
-079   
-080     mp_clamp (&t);
-081     mp_exch (&t, c);
-082   
-083     mp_clear (&t);
-084     return MP_OKAY;
-085   \}
-086   #endif
 \end{alltt}
 \end{small}
 
-Lines 31 to 35 determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
-the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
-the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
+First we determine (line 31) if the Comba method can be used since it's faster.  The conditions for 
+using the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than 
+\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is 
+set to $\delta$ but can be reduced when memory is at a premium.
+
+If we cannot use the Comba method we proceed to setup the baseline routine.  We allocate the destination mp\_int
+$t$ (line 37) to the exact size of the output to avoid further re--allocations.  At this point we now 
+begin the $O(n^2)$ loop.
+
+This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
+digits as output.  In each iteration of the outer loop the $pb$ variable is set (line 49) to the maximum 
+number of inner loop iterations.  
 
-Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66.  Note how all of the
-variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
-are used instead of single precision.  The multiplication on line 65 makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
-the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
-processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
-example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
+carry from the previous iteration.  A particularly important observation is that most modern optimizing 
+C compilers (GCC for instance) can recognize that an $N \times N \rightarrow 2N$ multiplication is all that 
+is required for the product.  In x86 terms for example, this means using the MUL instruction.
+
+Each digit of the product is stored in turn (line 69) and the carry propagated (line 72) to the 
+next iteration.
 
 \subsection{Faster Multiplication by the ``Comba'' Method}
 
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
-makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
-(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
-carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
-his 1986 paper \cite{BARRETT} written five years before.
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be 
+computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement 
+in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. 
+Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an 
+interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written 
+five years before.
 
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
-the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
-final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight 
+twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products 
+are produced then added together to form the final result.  In the baseline algorithm the columns are added together 
+after each iteration to get the result instantaneously.  
 
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
-simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
-of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at 
+the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated 
+after the nested loop to reduce the amount of work required. Succinctly the first step of the algorithm is to compute 
+the product vector $\vec x$ as follows. 
 
 \begin{equation}
 \vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
@@ -3799,38 +2739,31 @@ $256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which,
 \textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
 \textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
 \hline \\
-Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
 1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
 2.  If step 1 failed return(\textit{MP\_MEM}).\\
 \\
-Zero the temporary array $\hat W$. \\
-3.  for $n$ from $0$ to $digs - 1$ do \\
-\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
+3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
 \\
-Compute the columns. \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
-\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+4.  $\_ \hat W \leftarrow 0$ \\
+5.  for $ix$ from 0 to $pa - 1$ do \\
+\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
+\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
+\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iz}b_{ty-iz}$ \\
+\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
+\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
 \\
-Propagate the carries upwards. \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow digs$ \\
-7.  If $digs > 1$ then do \\
-\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
-\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
-\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
-8.  else do \\
-\hspace{3mm}8.1  $ix \leftarrow 0$ \\
-9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+6.  $oldused \leftarrow c.used$ \\
+7.  $c.used \leftarrow digs$ \\
+8.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}8.1  $c_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}9.1 $c_{ix} \leftarrow 0$ \\
 \\
-Zero excess digits. \\
-10.  If $digs < oldused$ then do \\
-\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\
+10.  Clamp $c$. \\
+11.  Return MP\_OKAY. \\
 \hline
 \end{tabular}
 \end{center}
@@ -3840,15 +2773,24 @@ Zero excess digits. \\
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
-essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
+
+The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the 
+loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
+reduces the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
+
+The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
+$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the distance past $b.used$ the variable
+$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.  
 
-The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
-unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
+The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
+means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
+pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to 
+move $ty$ downwards so the equality remains valid.  The $iy$ variable is the number of iterations until 
+$tx \ge a.used$ or $ty < 0$ occurs.
 
-The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
-a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
-iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
+into the next round by dividing $\_ \hat W$ by $\beta$.
 
 To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
 cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
@@ -3860,114 +2802,23 @@ and addition operations in the nested loop in parallel.
 \hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* Fast (comba) multiplier
-018    *
-019    * This is the fast column-array [comba] multiplier.  It is 
-020    * designed to compute the columns of the product first 
-021    * then handle the carries afterwards.  This has the effect 
-022    * of making the nested loops that compute the columns very
-023    * simple and schedulable on super-scalar processors.
-024    *
-025    * This has been modified to produce a variable number of 
-026    * digits of output so if say only a half-product is required 
-027    * you don't have to compute the upper half (a feature 
-028    * required for fast Barrett reduction).
-029    *
-030    * Based on Algorithm 14.12 on pp.595 of HAC.
-031    *
-032    */
-033   int
-034   fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-035   \{
-036     int     olduse, res, pa, ix, iz;
-037     mp_digit W[MP_WARRAY];
-038     register mp_word  _W;
-039   
-040     /* grow the destination as required */
-041     if (c->alloc < digs) \{
-042       if ((res = mp_grow (c, digs)) != MP_OKAY) \{
-043         return res;
-044       \}
-045     \}
-046   
-047     /* number of output digits to produce */
-048     pa = MIN(digs, a->used + b->used);
-049   
-050     /* clear the carry */
-051     _W = 0;
-052     for (ix = 0; ix < pa; ix++) \{ 
-053         int      tx, ty;
-054         int      iy;
-055         mp_digit *tmpx, *tmpy;
-056   
-057         /* get offsets into the two bignums */
-058         ty = MIN(b->used-1, ix);
-059         tx = ix - ty;
-060   
-061         /* setup temp aliases */
-062         tmpx = a->dp + tx;
-063         tmpy = b->dp + ty;
-064   
-065         /* this is the number of times the loop will iterrate, essentially its
-       
-066            while (tx++ < a->used && ty-- >= 0) \{ ... \}
-067          */
-068         iy = MIN(a->used-tx, ty+1);
-069   
-070         /* execute loop */
-071         for (iz = 0; iz < iy; ++iz) \{
-072            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-073         \}
-074   
-075         /* store term */
-076         W[ix] = ((mp_digit)_W) & MP_MASK;
-077   
-078         /* make next carry */
-079         _W = _W >> ((mp_word)DIGIT_BIT);
-080     \}
-081   
-082     /* store final carry */
-083     W[ix] = _W;
-084   
-085     /* setup dest */
-086     olduse  = c->used;
-087     c->used = digs;
-088   
-089     \{
-090       register mp_digit *tmpc;
-091       tmpc = c->dp;
-092       for (ix = 0; ix < digs; ix++) \{
-093         /* now extract the previous digit [below the carry] */
-094         *tmpc++ = W[ix];
-095       \}
-096   
-097       /* clear unused digits [that existed in the old copy of c] */
-098       for (; ix < olduse; ix++) \{
-099         *tmpc++ = 0;
-100       \}
-101     \}
-102     mp_clamp (c);
-103     return MP_OKAY;
-104   \}
-105   #endif
 \end{alltt}
 \end{small}
 
-The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
-implementation a series of aliases (\textit{lines 62, 63 and 76}) are used to simplify the inner $O(n^2)$ loop.  
-In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
+As per the pseudo--code we first calculate $pa$ (line 48) as the number of digits to output.  Next we begin the outer loop
+to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines 62, 63) to point
+inside the two multiplicands quickly.  
 
-The inner loop on lines 92, 79 and 80 is where the algorithm will spend the majority of the time, which is why it has been 
-stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
-very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
-(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
-and scheduling the instructions so there are very few dependency stalls.
+The inner loop (lines 71 to 74) of this implementation is where the tradeoff comes into play.  Originally this comba 
+implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
+the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
+one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
+is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often 
+slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
+compiler has aliased $\_ \hat W$ to a CPU register.
 
-In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
-baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
-digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
-be simultaneously used.  
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 77, 80) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line 77) as the last digit of the product.  
 
 \subsection{Polynomial Basis Multiplication}
 To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
@@ -4071,26 +2922,25 @@ general purpose multiplication.  Given two polynomial basis representations $f(x
 light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
 
 \begin{equation}
-f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd
+f(x) \cdot g(x) = acx^2 + ((a + b)(c + d) - (ac + bd))x + bd
 \end{equation}
 
 Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
 this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
 out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
-$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+$\zeta_0$, $\zeta_{\infty}$ and $\zeta_{1}$.  Consider the resultant system of equations.
 
 \begin{center}
 \begin{tabular}{rcrcrcrc}
 $\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{1}$ &      $=$ & $w_2$ & $+$ & $w_1$ & $+$ & $w_0$ \\
 $\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
 \end{tabular}
 \end{center}
 
 By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
 of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
-$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -4113,13 +2963,13 @@ Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
 Calculate the three products. \\
 8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
 9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-11.  $x0 \leftarrow y1 - y0$ \\
+10.  $t1 \leftarrow x1 + x0$ (\textit{mp\_add}) \\
+11.  $x0 \leftarrow y1 + y0$ \\
 12.  $t1 \leftarrow t1 \cdot x0$ \\
 \\
 Calculate the middle term. \\
 13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow x0 - t1$ \\
+14.  $t1 \leftarrow t1 - x0$ (\textit{s\_mp\_sub}) \\
 \\
 Calculate the final product. \\
 15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
@@ -4146,7 +2996,7 @@ smallest input \textbf{used} count.  After the radix point is chosen the inputs
 compute the lower halves.  Step 6 and 7 computer the upper halves.  
 
 After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 + x0$ has been computed.  By using $x0$ instead
 of an additional temporary variable, the algorithm can avoid an addition memory allocation operation.
 
 The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
@@ -4155,159 +3005,12 @@ The remaining steps 13 through 18 compute the Karatsuba polynomial through a var
 \hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_mul.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* c = |a| * |b| using Karatsuba Multiplication using 
-018    * three half size multiplications
-019    *
-020    * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
-021    * let n represent half of the number of digits in 
-022    * the min(a,b)
-023    *
-024    * a = a1 * B**n + a0
-025    * b = b1 * B**n + b0
-026    *
-027    * Then, a * b => 
-028      a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
-029    *
-030    * Note that a1b1 and a0b0 are used twice and only need to be 
-031    * computed once.  So in total three half size (half # of 
-032    * digit) multiplications are performed, a0b0, a1b1 and 
-033    * (a1-b1)(a0-b0)
-034    *
-035    * Note that a multiplication of half the digits requires
-036    * 1/4th the number of single precision multiplications so in 
-037    * total after one call 25% of the single precision multiplications 
-038    * are saved.  Note also that the call to mp_mul can end up back 
-039    * in this function if the a0, a1, b0, or b1 are above the threshold.  
-040    * This is known as divide-and-conquer and leads to the famous 
-041    * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
-042    * the standard O(N**2) that the baseline/comba methods use.  
-043    * Generally though the overhead of this method doesn't pay off 
-044    * until a certain size (N ~ 80) is reached.
-045    */
-046   int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
-047   \{
-048     mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
-049     int     B, err;
-050   
-051     /* default the return code to an error */
-052     err = MP_MEM;
-053   
-054     /* min # of digits */
-055     B = MIN (a->used, b->used);
-056   
-057     /* now divide in two */
-058     B = B >> 1;
-059   
-060     /* init copy all the temps */
-061     if (mp_init_size (&x0, B) != MP_OKAY)
-062       goto ERR;
-063     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-064       goto X0;
-065     if (mp_init_size (&y0, B) != MP_OKAY)
-066       goto X1;
-067     if (mp_init_size (&y1, b->used - B) != MP_OKAY)
-068       goto Y0;
-069   
-070     /* init temps */
-071     if (mp_init_size (&t1, B * 2) != MP_OKAY)
-072       goto Y1;
-073     if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
-074       goto T1;
-075     if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
-076       goto X0Y0;
-077   
-078     /* now shift the digits */
-079     x0.used = y0.used = B;
-080     x1.used = a->used - B;
-081     y1.used = b->used - B;
-082   
-083     \{
-084       register int x;
-085       register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
-086   
-087       /* we copy the digits directly instead of using higher level functions
-088        * since we also need to shift the digits
-089        */
-090       tmpa = a->dp;
-091       tmpb = b->dp;
-092   
-093       tmpx = x0.dp;
-094       tmpy = y0.dp;
-095       for (x = 0; x < B; x++) \{
-096         *tmpx++ = *tmpa++;
-097         *tmpy++ = *tmpb++;
-098       \}
-099   
-100       tmpx = x1.dp;
-101       for (x = B; x < a->used; x++) \{
-102         *tmpx++ = *tmpa++;
-103       \}
-104   
-105       tmpy = y1.dp;
-106       for (x = B; x < b->used; x++) \{
-107         *tmpy++ = *tmpb++;
-108       \}
-109     \}
-110   
-111     /* only need to clamp the lower words since by definition the 
-112      * upper words x1/y1 must have a known number of digits
-113      */
-114     mp_clamp (&x0);
-115     mp_clamp (&y0);
-116   
-117     /* now calc the products x0y0 and x1y1 */
-118     /* after this x0 is no longer required, free temp [x0==t2]! */
-119     if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
-120       goto X1Y1;          /* x0y0 = x0*y0 */
-121     if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
-122       goto X1Y1;          /* x1y1 = x1*y1 */
-123   
-124     /* now calc x1-x0 and y1-y0 */
-125     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-126       goto X1Y1;          /* t1 = x1 - x0 */
-127     if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
-128       goto X1Y1;          /* t2 = y1 - y0 */
-129     if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-130       goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
-131   
-132     /* add x0y0 */
-133     if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
-134       goto X1Y1;          /* t2 = x0y0 + x1y1 */
-135     if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-136       goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
-137   
-138     /* shift by B */
-139     if (mp_lshd (&t1, B) != MP_OKAY)
-140       goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-141     if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
-142       goto X1Y1;          /* x1y1 = x1y1 << 2*B */
-143   
-144     if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
-145       goto X1Y1;          /* t1 = x0y0 + t1 */
-146     if (mp_add (&t1, &x1y1, c) != MP_OKAY)
-147       goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
-148   
-149     /* Algorithm succeeded set the return code to MP_OKAY */
-150     err = MP_OKAY;
-151   
-152   X1Y1:mp_clear (&x1y1);
-153   X0Y0:mp_clear (&x0y0);
-154   T1:mp_clear (&t1);
-155   Y1:mp_clear (&y1);
-156   Y0:mp_clear (&y0);
-157   X1:mp_clear (&x1);
-158   X0:mp_clear (&x0);
-159   ERR:
-160     return err;
-161   \}
-162   #endif
 \end{alltt}
 \end{small}
 
 The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
 wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
-to handle error recovery with a single piece of code.  Lines 61 to 75 handle initializing all of the temporary variables 
+to handle error recovery with a single piece of code.  Lines 62 to 76 handle initializing all of the temporary variables 
 required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
 the temporaries that have been successfully allocated so far.
 
@@ -4317,13 +3020,13 @@ number of digits for the next section of code.
 
 The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
 to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and 
-\textbf{sign} members are copied first.  The first for loop on line 101 copies the lower halves.  Since they are both the same magnitude it 
-is simpler to calculate both lower halves in a single loop.  The for loop on lines 106 and 106 calculate the upper halves $x1$ and 
+\textbf{sign} members are copied first.  The first for loop on line 96 copies the lower halves.  Since they are both the same magnitude it 
+is simpler to calculate both lower halves in a single loop.  The for loops on lines 102 and 107 calculate the upper halves $x1$ and 
 $y1$ respectively.
 
 By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
 
-When line 150 is reached, the algorithm has completed succesfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
+When line 151 is reached, the algorithm has completed successfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
 the same code that handles errors can be used to clear the temporary variables and return.  
 
 \subsection{Toom-Cook $3$-Way Multiplication}
@@ -4441,280 +3144,29 @@ result $a \cdot b$ is produced.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_toom\_mul.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* multiplication using the Toom-Cook 3-way algorithm 
-018    *
-019    * Much more complicated than Karatsuba but has a lower asymptotic running t
-      ime of 
-020    * O(N**1.464).  This algorithm is only particularly useful on VERY large
-021    * inputs (we're talking 1000s of digits here...).
-022   */
-023   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
-024   \{
-025       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
-026       int res, B;
-027           
-028       /* init temps */
-029       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
-030                                &a0, &a1, &a2, &b0, &b1, 
-031                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
-032          return res;
-033       \}
-034       
-035       /* B */
-036       B = MIN(a->used, b->used) / 3;
-037       
-038       /* a = a2 * B**2 + a1 * B + a0 */
-039       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
-040          goto ERR;
-041       \}
-042   
-043       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
-044          goto ERR;
-045       \}
-046       mp_rshd(&a1, B);
-047       mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
-048   
-049       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
-050          goto ERR;
-051       \}
-052       mp_rshd(&a2, B*2);
-053       
-054       /* b = b2 * B**2 + b1 * B + b0 */
-055       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
-056          goto ERR;
-057       \}
-058   
-059       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
-060          goto ERR;
-061       \}
-062       mp_rshd(&b1, B);
-063       mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
-064   
-065       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
-066          goto ERR;
-067       \}
-068       mp_rshd(&b2, B*2);
-069       
-070       /* w0 = a0*b0 */
-071       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
-072          goto ERR;
-073       \}
-074       
-075       /* w4 = a2 * b2 */
-076       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
-077          goto ERR;
-078       \}
-079       
-080       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
-081       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
-082          goto ERR;
-083       \}
-084       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-085          goto ERR;
-086       \}
-087       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-088          goto ERR;
-089       \}
-090       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
-091          goto ERR;
-092       \}
-093       
-094       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
-095          goto ERR;
-096       \}
-097       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-098          goto ERR;
-099       \}
-100       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-101          goto ERR;
-102       \}
-103       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
-104          goto ERR;
-105       \}
-106       
-107       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
-108          goto ERR;
-109       \}
-110       
-111       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
-112       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
-113          goto ERR;
-114       \}
-115       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-116          goto ERR;
-117       \}
-118       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-119          goto ERR;
-120       \}
-121       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-122          goto ERR;
-123       \}
-124       
-125       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
-126          goto ERR;
-127       \}
-128       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-129          goto ERR;
-130       \}
-131       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-132          goto ERR;
-133       \}
-134       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-135          goto ERR;
-136       \}
-137       
-138       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
-139          goto ERR;
-140       \}
-141       
-142   
-143       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
-144       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
-145          goto ERR;
-146       \}
-147       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-148          goto ERR;
-149       \}
-150       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
-151          goto ERR;
-152       \}
-153       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-154          goto ERR;
-155       \}
-156       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
-157          goto ERR;
-158       \}
-159       
-160       /* now solve the matrix 
-161       
-162          0  0  0  0  1
-163          1  2  4  8  16
-164          1  1  1  1  1
-165          16 8  4  2  1
-166          1  0  0  0  0
-167          
-168          using 12 subtractions, 4 shifts, 
-169                 2 small divisions and 1 small multiplication 
-170        */
-171        
-172        /* r1 - r4 */
-173        if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
-174           goto ERR;
-175        \}
-176        /* r3 - r0 */
-177        if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
-178           goto ERR;
-179        \}
-180        /* r1/2 */
-181        if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
-182           goto ERR;
-183        \}
-184        /* r3/2 */
-185        if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
-186           goto ERR;
-187        \}
-188        /* r2 - r0 - r4 */
-189        if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
-190           goto ERR;
-191        \}
-192        if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
-193           goto ERR;
-194        \}
-195        /* r1 - r2 */
-196        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-197           goto ERR;
-198        \}
-199        /* r3 - r2 */
-200        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-201           goto ERR;
-202        \}
-203        /* r1 - 8r0 */
-204        if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
-205           goto ERR;
-206        \}
-207        if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
-208           goto ERR;
-209        \}
-210        /* r3 - 8r4 */
-211        if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
-212           goto ERR;
-213        \}
-214        if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
-215           goto ERR;
-216        \}
-217        /* 3r2 - r1 - r3 */
-218        if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
-219           goto ERR;
-220        \}
-221        if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
-222           goto ERR;
-223        \}
-224        if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
-225           goto ERR;
-226        \}
-227        /* r1 - r2 */
-228        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-229           goto ERR;
-230        \}
-231        /* r3 - r2 */
-232        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-233           goto ERR;
-234        \}
-235        /* r1/3 */
-236        if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
-237           goto ERR;
-238        \}
-239        /* r3/3 */
-240        if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
-241           goto ERR;
-242        \}
-243        
-244        /* at this point shift W[n] by B*n */
-245        if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
-246           goto ERR;
-247        \}
-248        if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
-249           goto ERR;
-250        \}
-251        if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
-252           goto ERR;
-253        \}
-254        if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
-255           goto ERR;
-256        \}     
-257        
-258        if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
-259           goto ERR;
-260        \}
-261        if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
-262           goto ERR;
-263        \}
-264        if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
-265           goto ERR;
-266        \}
-267        if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
-268           goto ERR;
-269        \}     
-270        
-271   ERR:
-272        mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
-273                       &a0, &a1, &a2, &b0, &b1, 
-274                       &b2, &tmp1, &tmp2, NULL);
-275        return res;
-276   \}     
-277        
-278   #endif
 \end{alltt}
 \end{small}
 
--- Comments to be added during editing phase.
+The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very 
+large numbers.  For example, a 10,000 digit multiplication takes approximately 99,282,205 fewer single precision multiplications with
+Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
+algorithm is not practical as Karatsuba has a much lower cutoff point.
+
+First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines 41 to 70) with 
+combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similarly
+for $b$.  
+
+Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
+we get those out of the way first (lines 73 and 78).  Next we compute $w1, w2$ and $w3$ using Horner's method.
+
+After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
+straight forward.  
 
 \subsection{Signed Multiplication}
 Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required.  So far all
 of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -4754,57 +3206,11 @@ s\_mp\_mul\_digs will clear it.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_mul.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* high level multiplication (handles sign) */
-018   int mp_mul (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     int     res, neg;
-021     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-022   
-023     /* use Toom-Cook? */
-024   #ifdef BN_MP_TOOM_MUL_C
-025     if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) \{
-026       res = mp_toom_mul(a, b, c);
-027     \} else 
-028   #endif
-029   #ifdef BN_MP_KARATSUBA_MUL_C
-030     /* use Karatsuba? */
-031     if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) \{
-032       res = mp_karatsuba_mul (a, b, c);
-033     \} else 
-034   #endif
-035     \{
-036       /* can we use the fast multiplier?
-037        *
-038        * The fast multiplier can be used if the output will 
-039        * have less than MP_WARRAY digits and the number of 
-040        * digits won't affect carry propagation
-041        */
-042       int     digs = a->used + b->used + 1;
-043   
-044   #ifdef BN_FAST_S_MP_MUL_DIGS_C
-045       if ((digs < MP_WARRAY) &&
-046           MIN(a->used, b->used) <= 
-047           (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
-048         res = fast_s_mp_mul_digs (a, b, c, digs);
-049       \} else 
-050   #endif
-051   #ifdef BN_S_MP_MUL_DIGS_C
-052         res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
-053   #else
-054         res = MP_VAL;
-055   #endif
-056   
-057     \}
-058     c->sign = (c->used > 0) ? neg : MP_ZPOS;
-059     return res;
-060   \}
-061   #endif
 \end{alltt}
 \end{small}
 
-The implementation is rather simplistic and is not particularly noteworthy.  Line 23 computes the sign of the result using the ``?'' 
-operator from the C programming language.  Line 47 computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.  
+The implementation is rather simplistic and is not particularly noteworthy.  Line 22 computes the sign of the result using the ``?'' 
+operator from the C programming language.  Line 48 computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.  
 
 \section{Squaring}
 \label{sec:basesquare}
@@ -4847,7 +3253,7 @@ Column two of row one is a square and column three is the first unique column.
 The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
 will not handle.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -4905,77 +3311,17 @@ results calculated so far.  This involves expensive carry propagation which will
 \hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sqr.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-018   int
-019   s_mp_sqr (mp_int * a, mp_int * b)
-020   \{
-021     mp_int  t;
-022     int     res, ix, iy, pa;
-023     mp_word r;
-024     mp_digit u, tmpx, *tmpt;
-025   
-026     pa = a->used;
-027     if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{
-028       return res;
-029     \}
-030   
-031     /* default used is maximum possible size */
-032     t.used = 2*pa + 1;
-033   
-034     for (ix = 0; ix < pa; ix++) \{
-035       /* first calculate the digit at 2*ix */
-036       /* calculate double precision result */
-037       r = ((mp_word) t.dp[2*ix]) +
-038           ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
-039   
-040       /* store lower part in result */
-041       t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-042   
-043       /* get the carry */
-044       u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-045   
-046       /* left hand side of A[ix] * A[iy] */
-047       tmpx        = a->dp[ix];
-048   
-049       /* alias for where to store the results */
-050       tmpt        = t.dp + (2*ix + 1);
-051       
-052       for (iy = ix + 1; iy < pa; iy++) \{
-053         /* first calculate the product */
-054         r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
-055   
-056         /* now calculate the double precision result, note we use
-057          * addition instead of *2 since it's easier to optimize
-058          */
-059         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
-060   
-061         /* store lower part */
-062         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-063   
-064         /* get carry */
-065         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-066       \}
-067       /* propagate upwards */
-068       while (u != ((mp_digit) 0)) \{
-069         r       = ((mp_word) *tmpt) + ((mp_word) u);
-070         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-071         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-072       \}
-073     \}
-074   
-075     mp_clamp (&t);
-076     mp_exch (&t, b);
-077     mp_clear (&t);
-078     return MP_OKAY;
-079   \}
-080   #endif
 \end{alltt}
 \end{small}
 
-Inside the outer loop (\textit{see line 34}) the square term is calculated on line 37.  Line 44 extracts the carry from the square
-term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines 47 and 50 respectively.  The doubling is performed using two
-additions (\textit{see line 59}) since it is usually faster than shifting,if not at least as fast.  
+Inside the outer loop (line 34) the square term is calculated on line 37.  The carry (line 44) has been
+extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized 
+(lines 47 and 50) to simplify the inner loop.  The doubling is performed using two
+additions (line 59) since it is usually faster than shifting, if not at least as fast.  
+
+The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
+get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
+square a number. 
 
 \subsection{Faster Squaring by the ``Comba'' Method}
 A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
@@ -4987,9 +3333,9 @@ propagation operations from the inner loop.  However, the inner product must sti
 that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
 $ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
 
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
-arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
-moved to a $O(n)$ work level outside the $O(n^2)$ level.  
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two 
+mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and 
+carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -4999,34 +3345,34 @@ moved to a $O(n)$ work level outside the $O(n^2)$ level.
 \textbf{Input}.   mp\_int $a$ \\
 \textbf{Output}.  $b \leftarrow a^2$ \\
 \hline \\
-Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
 1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
 2.  If step 1 failed return(\textit{MP\_MEM}). \\
-3.  for $ix$ from $0$ to $2a.used + 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}Compute the square.\\
-\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
 \\
-\hspace{3mm}Compute the double products.\\
-\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
-5.  $oldused \leftarrow b.used$ \\
-6.  $b.used \leftarrow 2a.used + 1$ \\
+3.  $pa \leftarrow 2 \cdot a.used$ \\
+4.  $\hat W1 \leftarrow 0$ \\
+5.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
+\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
+\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
+\hspace{3mm}5.6  for $iz$ from $0$ to $iy - 1$ do \\
+\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
+\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
+\hspace{3mm}5.8  if $ix$ is even then \\
+\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
+\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
 \\
-Double the products and propagate the carries simultaneously. \\
-7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
-8.  for $ix$ from $1$ to $2a.used$ do \\
-\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
-\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
-\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
-10.  if $2a.used + 1 < oldused$ then do \\
-\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
-\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
-11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\ 
+6.  $oldused \leftarrow b.used$ \\
+7.  $b.used \leftarrow 2 \cdot a.used$ \\
+8.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa$ to $oldused - 1$ do \\
+\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+11.  Return(\textit{MP\_OKAY}). \\ 
 \hline
 \end{tabular}
 \end{center}
@@ -5035,146 +3381,29 @@ Double the products and propagate the carries simultaneously. \\
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
-the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm 
+s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
 
-This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
-array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
-processors to simply make it a full size array.
+First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
+products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
+additional MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal to
+$a_5 \cdot a_3$.  In the multiplication case both products would be computed, since the conditions $5 < a.used$ and $3 \ge 0$
+are maintained.  Here we double the sum of the products just outside the inner loop, so we must avoid computing both.  This is
+also a good thing since we perform fewer multiplications and the routine ends up being faster.
 
-The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
-it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
-computes the sum of the products for each column.  They are not doubled until later.
-
-After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards.  It makes sense to do both
-operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
-squares in place.  
+Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
+only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
 
 \vspace{+3mm}\begin{small}
 \hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* fast squaring
-018    *
-019    * This is the comba method where the columns of the product
-020    * are computed first then the carries are computed.  This
-021    * has the effect of making a very simple inner loop that
-022    * is executed the most
-023    *
-024    * W2 represents the outer products and W the inner.
-025    *
-026    * A further optimizations is made because the inner
-027    * products are of the form "A * B * 2".  The *2 part does
-028    * not need to be computed until the end which is good
-029    * because 64-bit shifts are slow!
-030    *
-031    * Based on Algorithm 14.16 on pp.597 of HAC.
-032    *
-033    */
-034   /* the jist of squaring...
-035   
-036   you do like mult except the offset of the tmpx [one that starts closer to ze
-      ro]
-037   can't equal the offset of tmpy.  So basically you set up iy like before then
-       you min it with
-038   (ty-tx) so that it never happens.  You double all those you add in the inner
-       loop
-039   
-040   After that loop you do the squares and add them in.
-041   
-042   Remove W2 and don't memset W
-043   
-044   */
-045   
-046   int fast_s_mp_sqr (mp_int * a, mp_int * b)
-047   \{
-048     int       olduse, res, pa, ix, iz;
-049     mp_digit   W[MP_WARRAY], *tmpx;
-050     mp_word   W1;
-051   
-052     /* grow the destination as required */
-053     pa = a->used + a->used;
-054     if (b->alloc < pa) \{
-055       if ((res = mp_grow (b, pa)) != MP_OKAY) \{
-056         return res;
-057       \}
-058     \}
-059   
-060     /* number of output digits to produce */
-061     W1 = 0;
-062     for (ix = 0; ix < pa; ix++) \{ 
-063         int      tx, ty, iy;
-064         mp_word  _W;
-065         mp_digit *tmpy;
-066   
-067         /* clear counter */
-068         _W = 0;
-069   
-070         /* get offsets into the two bignums */
-071         ty = MIN(a->used-1, ix);
-072         tx = ix - ty;
-073   
-074         /* setup temp aliases */
-075         tmpx = a->dp + tx;
-076         tmpy = a->dp + ty;
-077   
-078         /* this is the number of times the loop will iterrate, essentially its
-       
-079            while (tx++ < a->used && ty-- >= 0) \{ ... \}
-080          */
-081         iy = MIN(a->used-tx, ty+1);
-082   
-083         /* now for squaring tx can never equal ty 
-084          * we halve the distance since they approach at a rate of 2x
-085          * and we have to round because odd cases need to be executed
-086          */
-087         iy = MIN(iy, (ty-tx+1)>>1);
-088   
-089         /* execute loop */
-090         for (iz = 0; iz < iy; iz++) \{
-091            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-092         \}
-093   
-094         /* double the inner product and add carry */
-095         _W = _W + _W + W1;
-096   
-097         /* even columns have the square term in them */
-098         if ((ix&1) == 0) \{
-099            _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
-100         \}
-101   
-102         /* store it */
-103         W[ix] = _W;
-104   
-105         /* make next carry */
-106         W1 = _W >> ((mp_word)DIGIT_BIT);
-107     \}
-108   
-109     /* setup dest */
-110     olduse  = b->used;
-111     b->used = a->used+a->used;
-112   
-113     \{
-114       mp_digit *tmpb;
-115       tmpb = b->dp;
-116       for (ix = 0; ix < pa; ix++) \{
-117         *tmpb++ = W[ix] & MP_MASK;
-118       \}
-119   
-120       /* clear unused digits [that existed in the old copy of c] */
-121       for (; ix < olduse; ix++) \{
-122         *tmpb++ = 0;
-123       \}
-124     \}
-125     mp_clamp (b);
-126     return MP_OKAY;
-127   \}
-128   #endif
 \end{alltt}
 \end{small}
 
--- Write something deep and insightful later, Tom.
+This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for 
+the special case of squaring.  
 
 \subsection{Polynomial Basis Squaring}
 The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
@@ -5187,10 +3416,10 @@ Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  Th
 number with the following equation.
 
 \begin{equation}
-h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+h(x) = a^2x^2 + \left ((a + b)^2 - (a^2 + b^2) \right )x + b^2
 \end{equation}
 
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a + b)^2$.  As in 
 Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
 $O \left ( n^{lg(3)} \right )$.
 
@@ -5222,12 +3451,12 @@ Split the input.  e.g. $a = x1\beta^B + x0$ \\
 Calculate the three squares. \\
 6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
 7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+8.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
 9.  $t1 \leftarrow t1^2$ \\
 \\
 Compute the middle term. \\
 10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t2 - t1$ \\
+11.  $t1 \leftarrow t1 - t2$ \\
 \\
 Compute final product. \\
 12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
@@ -5250,7 +3479,7 @@ The radix point for squaring is simply placed exactly in the middle of the digit
 placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
 as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
 
-By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+By expanding $\left (x1 + x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $(x1 + x0)^2 - (x1^2 + x0^2)  = 2 \cdot x0 \cdot x1$.
 Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
 this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
 
@@ -5279,112 +3508,11 @@ ratio of 1:7.  } than simpler operations such as addition.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_sqr.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* Karatsuba squaring, computes b = a*a using three 
-018    * half size squarings
-019    *
-020    * See comments of karatsuba_mul for details.  It 
-021    * is essentially the same algorithm but merely 
-022    * tuned to perform recursive squarings.
-023    */
-024   int mp_karatsuba_sqr (mp_int * a, mp_int * b)
-025   \{
-026     mp_int  x0, x1, t1, t2, x0x0, x1x1;
-027     int     B, err;
-028   
-029     err = MP_MEM;
-030   
-031     /* min # of digits */
-032     B = a->used;
-033   
-034     /* now divide in two */
-035     B = B >> 1;
-036   
-037     /* init copy all the temps */
-038     if (mp_init_size (&x0, B) != MP_OKAY)
-039       goto ERR;
-040     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-041       goto X0;
-042   
-043     /* init temps */
-044     if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
-045       goto X1;
-046     if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
-047       goto T1;
-048     if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
-049       goto T2;
-050     if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
-051       goto X0X0;
-052   
-053     \{
-054       register int x;
-055       register mp_digit *dst, *src;
-056   
-057       src = a->dp;
-058   
-059       /* now shift the digits */
-060       dst = x0.dp;
-061       for (x = 0; x < B; x++) \{
-062         *dst++ = *src++;
-063       \}
-064   
-065       dst = x1.dp;
-066       for (x = B; x < a->used; x++) \{
-067         *dst++ = *src++;
-068       \}
-069     \}
-070   
-071     x0.used = B;
-072     x1.used = a->used - B;
-073   
-074     mp_clamp (&x0);
-075   
-076     /* now calc the products x0*x0 and x1*x1 */
-077     if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-078       goto X1X1;           /* x0x0 = x0*x0 */
-079     if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-080       goto X1X1;           /* x1x1 = x1*x1 */
-081   
-082     /* now calc (x1-x0)**2 */
-083     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-084       goto X1X1;           /* t1 = x1 - x0 */
-085     if (mp_sqr (&t1, &t1) != MP_OKAY)
-086       goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
-087   
-088     /* add x0y0 */
-089     if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-090       goto X1X1;           /* t2 = x0x0 + x1x1 */
-091     if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-092       goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
-093   
-094     /* shift by B */
-095     if (mp_lshd (&t1, B) != MP_OKAY)
-096       goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
-097     if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-098       goto X1X1;           /* x1x1 = x1x1 << 2*B */
-099   
-100     if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-101       goto X1X1;           /* t1 = x0x0 + t1 */
-102     if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-103       goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
-104   
-105     err = MP_OKAY;
-106   
-107   X1X1:mp_clear (&x1x1);
-108   X0X0:mp_clear (&x0x0);
-109   T2:mp_clear (&t2);
-110   T1:mp_clear (&t1);
-111   X1:mp_clear (&x1);
-112   X0:mp_clear (&x0);
-113   ERR:
-114     return err;
-115   \}
-116   #endif
 \end{alltt}
 \end{small}
 
 This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and 
-shift the input into the two halves.  The loop from line 53 to line 69 has been modified since only one input exists.  The \textbf{used}
+shift the input into the two halves.  The loop from line 54 to line 70 has been modified since only one input exists.  The \textbf{used}
 count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
 to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.  
 
@@ -5392,14 +3520,13 @@ By inlining the copy and shift operations the cutoff point for Karatsuba multipl
 is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
 it is actually below the Comba limit (\textit{at 110 digits}).
 
-This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are redirected to
-the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
-
-\textit{Last paragraph sucks.  re-write! -- Tom}
+This routine uses the same error trap coding style as mp\_karatsuba\_mul.  As the temporary variables are initialized errors are 
+redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and 
+mp\_clears are executed normally.
 
 \subsection{Toom-Cook Squaring}
 The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations..  The reader is encouraged to read the description of the latter algorithm and try to 
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
 derive their own Toom-Cook squaring algorithm.  
 
 \subsection{High Level Squaring}
@@ -5439,44 +3566,6 @@ neither of the polynomial basis algorithms should be used then either the Comba
 \hspace{-5.1mm}{\bf File}: bn\_mp\_sqr.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* computes b = a*a */
-018   int
-019   mp_sqr (mp_int * a, mp_int * b)
-020   \{
-021     int     res;
-022   
-023   #ifdef BN_MP_TOOM_SQR_C
-024     /* use Toom-Cook? */
-025     if (a->used >= TOOM_SQR_CUTOFF) \{
-026       res = mp_toom_sqr(a, b);
-027     /* Karatsuba? */
-028     \} else 
-029   #endif
-030   #ifdef BN_MP_KARATSUBA_SQR_C
-031   if (a->used >= KARATSUBA_SQR_CUTOFF) \{
-032       res = mp_karatsuba_sqr (a, b);
-033     \} else 
-034   #endif
-035     \{
-036   #ifdef BN_FAST_S_MP_SQR_C
-037       /* can we use the fast comba multiplier? */
-038       if ((a->used * 2 + 1) < MP_WARRAY && 
-039            a->used < 
-040            (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) \{
-041         res = fast_s_mp_sqr (a, b);
-042       \} else
-043   #endif
-044   #ifdef BN_S_MP_SQR_C
-045         res = s_mp_sqr (a, b);
-046   #else
-047         res = MP_VAL;
-048   #endif
-049     \}
-050     b->sign = MP_ZPOS;
-051     return res;
-052   \}
-053   #endif
 \end{alltt}
 \end{small}
 
@@ -5485,12 +3574,9 @@ neither of the polynomial basis algorithms should be used then either the Comba
 $\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
                       & that have different number of digits in Karatsuba multiplication. \\
                       & \\
-$\left [ 3 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
+$\left [ 2 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
                       & of double products and at most one square is stated.  Prove this statement. \\
                       & \\                      
-$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
-                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
-                      & \\
 $\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
                       & \\
 $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
@@ -5498,6 +3584,14 @@ $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3
 $\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
                       & required for equation $6.7$ to be true.  \\
                       & \\
+$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
+                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
+                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
+                      &\\
+$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
+                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
+                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
+                      & \\
 \end{tabular}
 
 \chapter{Modular Reduction}
@@ -5516,7 +3610,7 @@ other forms of residues.
 Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
 is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
 RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in 
-Elliptic Curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
 exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
 range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
 algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoeretic problems.  
@@ -5725,87 +3819,6 @@ performed at most twice, and on average once. However, if $a \ge b^2$ than it wi
 \hspace{-5.1mm}{\bf File}: bn\_mp\_reduce.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* reduces x mod m, assumes 0 < x < m**2, mu is 
-018    * precomputed via mp_reduce_setup.
-019    * From HAC pp.604 Algorithm 14.42
-020    */
-021   int
-022   mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
-023   \{
-024     mp_int  q;
-025     int     res, um = m->used;
-026   
-027     /* q = x */
-028     if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{
-029       return res;
-030     \}
-031   
-032     /* q1 = x / b**(k-1)  */
-033     mp_rshd (&q, um - 1);         
-034   
-035     /* according to HAC this optimization is ok */
-036     if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{
-037       if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{
-038         goto CLEANUP;
-039       \}
-040     \} else \{
-041   #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-042       if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
-043         goto CLEANUP;
-044       \}
-045   #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-046       if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
-047         goto CLEANUP;
-048       \}
-049   #else 
-050       \{ 
-051         res = MP_VAL;
-052         goto CLEANUP;
-053       \}
-054   #endif
-055     \}
-056   
-057     /* q3 = q2 / b**(k+1) */
-058     mp_rshd (&q, um + 1);         
-059   
-060     /* x = x mod b**(k+1), quick (no division) */
-061     if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{
-062       goto CLEANUP;
-063     \}
-064   
-065     /* q = q * m mod b**(k+1), quick (no division) */
-066     if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{
-067       goto CLEANUP;
-068     \}
-069   
-070     /* x = x - q */
-071     if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{
-072       goto CLEANUP;
-073     \}
-074   
-075     /* If x < 0, add b**(k+1) to it */
-076     if (mp_cmp_d (x, 0) == MP_LT) \{
-077       mp_set (&q, 1);
-078       if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
-079         goto CLEANUP;
-080       if ((res = mp_add (x, &q, x)) != MP_OKAY)
-081         goto CLEANUP;
-082     \}
-083   
-084     /* Back off if it's too big */
-085     while (mp_cmp (x, m) != MP_LT) \{
-086       if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{
-087         goto CLEANUP;
-088       \}
-089     \}
-090     
-091   CLEANUP:
-092     mp_clear (&q);
-093   
-094     return res;
-095   \}
-096   #endif
 \end{alltt}
 \end{small}
 
@@ -5818,7 +3831,7 @@ safe to do so.
 In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
 future use so that the Barrett algorithm can be used without delay.  
 
-\begin{figure}[!here]
+\newpage\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -5844,20 +3857,6 @@ is equivalent and much faster.  The final value is computed by taking the intege
 \hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_setup.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* pre-calculate the value required for Barrett reduction
-018    * For a given modulus "b" it calulates the value required in "a"
-019    */
-020   int mp_reduce_setup (mp_int * a, mp_int * b)
-021   \{
-022     int     res;
-023     
-024     if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) \{
-025       return res;
-026     \}
-027     return mp_div (a, b, a, NULL);
-028   \}
-029   #endif
 \end{alltt}
 \end{small}
 
@@ -5922,6 +3921,7 @@ $0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction
 \hline $6$ & $x/2 = 139$ \\
 \hline $7$ & $x + n = 396$, $x/2 = 198$ \\
 \hline $8$ & $x/2 = 99$ \\
+\hline $9$ & $x + n = 356$, $x/2 = 178$ \\
 \hline
 \end{tabular}
 \end{center}
@@ -5930,8 +3930,8 @@ $0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction
 \label{fig:MONT1}
 \end{figure}
 
-Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The result of the algorithm $r = 99$ is
-congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^8$ modulo $257$ the correct residue 
+Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 9$ (note $2^k = 512$ which is larger than $n$).  The result of 
+the algorithm $r = 178$ is congruent to the value of $2^{-9} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^9$ modulo $257$ the correct residue 
 $r \equiv 158$ is produced.  
 
 Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
@@ -5943,10 +3943,10 @@ Fortunately there exists an alternative representation of the algorithm.
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ ($2^k > n$) \\
 \textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
 \hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
+1.  for $t$ from $1$ to $k$ do \\
 \hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
 \hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
 2.  Return $x/2^k$. \\
@@ -5974,7 +3974,8 @@ precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a s
 \hline $6$ & $8896$ & $10001011000000$ \\
 \hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
 \hline $8$ & $25344$ & $110001100000000$ \\
-\hline -- & $x/2^k = 99$ & \\
+\hline $9$ & $x + 2^{8}n = 91136$ & $10110010000000000$ \\
+\hline -- & $x/2^k = 178$ & \\
 \hline
 \end{tabular}
 \end{center}
@@ -5983,7 +3984,7 @@ precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a s
 \label{fig:MONT2}
 \end{figure}
 
-Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
+Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 9$. 
 With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
 loop.  Note that for the iterations $t = 2, 5, 6$ and $8$ where the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
 zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
@@ -5997,7 +3998,7 @@ previous algorithm re-written to compute the Montgomery reduction in this new fa
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ ($\beta^k > n$) \\
 \textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
 \hline \\
 1.  for $t$ from $0$ to $k - 1$ do \\
@@ -6115,109 +4116,11 @@ multiplications.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_reduce.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* computes xR**-1 == x (mod N) via Montgomery Reduction */
-018   int
-019   mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-020   \{
-021     int     ix, res, digs;
-022     mp_digit mu;
-023   
-024     /* can the fast reduction [comba] method be used?
-025      *
-026      * Note that unlike in mul you're safely allowed *less*
-027      * than the available columns [255 per default] since carries
-028      * are fixed up in the inner loop.
-029      */
-030     digs = n->used * 2 + 1;
-031     if ((digs < MP_WARRAY) &&
-032         n->used <
-033         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
-034       return fast_mp_montgomery_reduce (x, n, rho);
-035     \}
-036   
-037     /* grow the input as required */
-038     if (x->alloc < digs) \{
-039       if ((res = mp_grow (x, digs)) != MP_OKAY) \{
-040         return res;
-041       \}
-042     \}
-043     x->used = digs;
-044   
-045     for (ix = 0; ix < n->used; ix++) \{
-046       /* mu = ai * rho mod b
-047        *
-048        * The value of rho must be precalculated via
-049        * montgomery_setup() such that
-050        * it equals -1/n0 mod b this allows the
-051        * following inner loop to reduce the
-052        * input one digit at a time
-053        */
-054       mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
-055   
-056       /* a = a + mu * m * b**i */
-057       \{
-058         register int iy;
-059         register mp_digit *tmpn, *tmpx, u;
-060         register mp_word r;
-061   
-062         /* alias for digits of the modulus */
-063         tmpn = n->dp;
-064   
-065         /* alias for the digits of x [the input] */
-066         tmpx = x->dp + ix;
-067   
-068         /* set the carry to zero */
-069         u = 0;
-070   
-071         /* Multiply and add in place */
-072         for (iy = 0; iy < n->used; iy++) \{
-073           /* compute product and sum */
-074           r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
-075                     ((mp_word) u) + ((mp_word) * tmpx);
-076   
-077           /* get carry */
-078           u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-079   
-080           /* fix digit */
-081           *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
-082         \}
-083         /* At this point the ix'th digit of x should be zero */
-084   
-085   
-086         /* propagate carries upwards as required*/
-087         while (u) \{
-088           *tmpx   += u;
-089           u        = *tmpx >> DIGIT_BIT;
-090           *tmpx++ &= MP_MASK;
-091         \}
-092       \}
-093     \}
-094   
-095     /* at this point the n.used'th least
-096      * significant digits of x are all zero
-097      * which means we can shift x to the
-098      * right by n.used digits and the
-099      * residue is unchanged.
-100      */
-101   
-102     /* x = x/b**n.used */
-103     mp_clamp(x);
-104     mp_rshd (x, n->used);
-105   
-106     /* if x >= n then x = x - n */
-107     if (mp_cmp_mag (x, n) != MP_LT) \{
-108       return s_mp_sub (x, n, x);
-109     \}
-110   
-111     return MP_OKAY;
-112   \}
-113   #endif
 \end{alltt}
 \end{small}
 
-This is the baseline implementation of the Montgomery reduction algorithm.  Lines 30 to 35 determine if the Comba based
-routine can be used instead.  Line 48 computes the value of $\mu$ for that particular iteration of the outer loop.  
+This is the baseline implementation of the Montgomery reduction algorithm.  Lines 31 to 36 determine if the Comba based
+routine can be used instead.  Line 47 computes the value of $\mu$ for that particular iteration of the outer loop.  
 
 The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
 the alias $tmpn$ refers to the modulus $n$.  
@@ -6305,159 +4208,6 @@ stored in the destination $x$.
 \hspace{-5.1mm}{\bf File}: bn\_fast\_mp\_montgomery\_reduce.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* computes xR**-1 == x (mod N) via Montgomery Reduction
-018    *
-019    * This is an optimized implementation of montgomery_reduce
-020    * which uses the comba method to quickly calculate the columns of the
-021    * reduction.
-022    *
-023    * Based on Algorithm 14.32 on pp.601 of HAC.
-024   */
-025   int
-026   fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-027   \{
-028     int     ix, res, olduse;
-029     mp_word W[MP_WARRAY];
-030   
-031     /* get old used count */
-032     olduse = x->used;
-033   
-034     /* grow a as required */
-035     if (x->alloc < n->used + 1) \{
-036       if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{
-037         return res;
-038       \}
-039     \}
-040   
-041     /* first we have to get the digits of the input into
-042      * an array of double precision words W[...]
-043      */
-044     \{
-045       register mp_word *_W;
-046       register mp_digit *tmpx;
-047   
-048       /* alias for the W[] array */
-049       _W   = W;
-050   
-051       /* alias for the digits of  x*/
-052       tmpx = x->dp;
-053   
-054       /* copy the digits of a into W[0..a->used-1] */
-055       for (ix = 0; ix < x->used; ix++) \{
-056         *_W++ = *tmpx++;
-057       \}
-058   
-059       /* zero the high words of W[a->used..m->used*2] */
-060       for (; ix < n->used * 2 + 1; ix++) \{
-061         *_W++ = 0;
-062       \}
-063     \}
-064   
-065     /* now we proceed to zero successive digits
-066      * from the least significant upwards
-067      */
-068     for (ix = 0; ix < n->used; ix++) \{
-069       /* mu = ai * m' mod b
-070        *
-071        * We avoid a double precision multiplication (which isn't required)
-072        * by casting the value down to a mp_digit.  Note this requires
-073        * that W[ix-1] have  the carry cleared (see after the inner loop)
-074        */
-075       register mp_digit mu;
-076       mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
-077   
-078       /* a = a + mu * m * b**i
-079        *
-080        * This is computed in place and on the fly.  The multiplication
-081        * by b**i is handled by offseting which columns the results
-082        * are added to.
-083        *
-084        * Note the comba method normally doesn't handle carries in the
-085        * inner loop In this case we fix the carry from the previous
-086        * column since the Montgomery reduction requires digits of the
-087        * result (so far) [see above] to work.  This is
-088        * handled by fixing up one carry after the inner loop.  The
-089        * carry fixups are done in order so after these loops the
-090        * first m->used words of W[] have the carries fixed
-091        */
-092       \{
-093         register int iy;
-094         register mp_digit *tmpn;
-095         register mp_word *_W;
-096   
-097         /* alias for the digits of the modulus */
-098         tmpn = n->dp;
-099   
-100         /* Alias for the columns set by an offset of ix */
-101         _W = W + ix;
-102   
-103         /* inner loop */
-104         for (iy = 0; iy < n->used; iy++) \{
-105             *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
-106         \}
-107       \}
-108   
-109       /* now fix carry for next digit, W[ix+1] */
-110       W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
-111     \}
-112   
-113     /* now we have to propagate the carries and
-114      * shift the words downward [all those least
-115      * significant digits we zeroed].
-116      */
-117     \{
-118       register mp_digit *tmpx;
-119       register mp_word *_W, *_W1;
-120   
-121       /* nox fix rest of carries */
-122   
-123       /* alias for current word */
-124       _W1 = W + ix;
-125   
-126       /* alias for next word, where the carry goes */
-127       _W = W + ++ix;
-128   
-129       for (; ix <= n->used * 2 + 1; ix++) \{
-130         *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
-131       \}
-132   
-133       /* copy out, A = A/b**n
-134        *
-135        * The result is A/b**n but instead of converting from an
-136        * array of mp_word to mp_digit than calling mp_rshd
-137        * we just copy them in the right order
-138        */
-139   
-140       /* alias for destination word */
-141       tmpx = x->dp;
-142   
-143       /* alias for shifted double precision result */
-144       _W = W + n->used;
-145   
-146       for (ix = 0; ix < n->used + 1; ix++) \{
-147         *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
-148       \}
-149   
-150       /* zero oldused digits, if the input a was larger than
-151        * m->used+1 we'll have to clear the digits
-152        */
-153       for (; ix < olduse; ix++) \{
-154         *tmpx++ = 0;
-155       \}
-156     \}
-157   
-158     /* set the max used and clamp */
-159     x->used = n->used + 1;
-160     mp_clamp (x);
-161   
-162     /* if A >= m then A = A - m */
-163     if (mp_cmp_mag (x, n) != MP_LT) \{
-164       return s_mp_sub (x, n, x);
-165     \}
-166     return MP_OKAY;
-167   \}
-168   #endif
 \end{alltt}
 \end{small}
 
@@ -6485,7 +4235,7 @@ To calculate the variable $\rho$ a relatively simple algorithm will be required.
 \hline \\
 1.  $b \leftarrow n_0$ \\
 2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
 4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
 \hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
 5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
@@ -6505,45 +4255,6 @@ to calculate $1/n_0$ when $\beta$ is a power of two.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_setup.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* setups the montgomery reduction stuff */
-018   int
-019   mp_montgomery_setup (mp_int * n, mp_digit * rho)
-020   \{
-021     mp_digit x, b;
-022   
-023   /* fast inversion mod 2**k
-024    *
-025    * Based on the fact that
-026    *
-027    * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
-028    *                    =>  2*X*A - X*X*A*A = 1
-029    *                    =>  2*(1) - (1)     = 1
-030    */
-031     b = n->dp[0];
-032   
-033     if ((b & 1) == 0) \{
-034       return MP_VAL;
-035     \}
-036   
-037     x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-038     x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
-039   #if !defined(MP_8BIT)
-040     x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
-041   #endif
-042   #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
-043     x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
-044   #endif
-045   #ifdef MP_64BIT
-046     x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
-047   #endif
-048   
-049     /* rho = -1/m mod b */
-050     *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
-051   
-052     return MP_OKAY;
-053   \}
-054   #endif
 \end{alltt}
 \end{small}
 
@@ -6736,96 +4447,22 @@ at step 3.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_reduce.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
-018    *
-019    * Based on algorithm from the paper
-020    *
-021    * "Generating Efficient Primes for Discrete Log Cryptosystems"
-022    *                 Chae Hoon Lim, Pil Joong Lee,
-023    *          POSTECH Information Research Laboratories
-024    *
-025    * The modulus must be of a special format [see manual]
-026    *
-027    * Has been modified to use algorithm 7.10 from the LTM book instead
-028    *
-029    * Input x must be in the range 0 <= x <= (n-1)**2
-030    */
-031   int
-032   mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
-033   \{
-034     int      err, i, m;
-035     mp_word  r;
-036     mp_digit mu, *tmpx1, *tmpx2;
-037   
-038     /* m = digits in modulus */
-039     m = n->used;
-040   
-041     /* ensure that "x" has at least 2m digits */
-042     if (x->alloc < m + m) \{
-043       if ((err = mp_grow (x, m + m)) != MP_OKAY) \{
-044         return err;
-045       \}
-046     \}
-047   
-048   /* top of loop, this is where the code resumes if
-049    * another reduction pass is required.
-050    */
-051   top:
-052     /* aliases for digits */
-053     /* alias for lower half of x */
-054     tmpx1 = x->dp;
-055   
-056     /* alias for upper half of x, or x/B**m */
-057     tmpx2 = x->dp + m;
-058   
-059     /* set carry to zero */
-060     mu = 0;
-061   
-062     /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
-063     for (i = 0; i < m; i++) \{
-064         r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
-065         *tmpx1++  = (mp_digit)(r & MP_MASK);
-066         mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
-067     \}
-068   
-069     /* set final carry */
-070     *tmpx1++ = mu;
-071   
-072     /* zero words above m */
-073     for (i = m + 1; i < x->used; i++) \{
-074         *tmpx1++ = 0;
-075     \}
-076   
-077     /* clamp, sub and return */
-078     mp_clamp (x);
-079   
-080     /* if x >= n then subtract and reduce again
-081      * Each successive "recursion" makes the input smaller and smaller.
-082      */
-083     if (mp_cmp_mag (x, n) != MP_LT) \{
-084       s_mp_sub(x, n, x);
-085       goto top;
-086     \}
-087     return MP_OKAY;
-088   \}
-089   #endif
 \end{alltt}
 \end{small}
 
-The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line 51 is where
+The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line 52 is where
 the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function however, the size of
 the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.  
 
 The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
-a division by $\beta^m$ can be simulated virtually for free.  The loop on line 63 performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
+a division by $\beta^m$ can be simulated virtually for free.  The loop on line 64 performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
 in this algorithm.
 
-By line 70 the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line 73 the 
+By line 67 the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line 74 the 
 same pointer will point to the $m+1$'th digit where the zeroes will be placed.  
 
 Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.  
-With the same logic at line 84 the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
+With the same logic at line 81 the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
 as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
 does not need to be checked.
 
@@ -6853,18 +4490,6 @@ completeness.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_setup.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* determines the setup value */
-018   void mp_dr_setup(mp_int *a, mp_digit *d)
-019   \{
-020      /* the casts are required if DIGIT_BIT is one less than
-021       * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
-022       */
-023      *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
-024           ((mp_word)a->dp[0]));
-025   \}
-026   
-027   #endif
 \end{alltt}
 \end{small}
 
@@ -6900,29 +4525,6 @@ step 3 then $n$ must be of Diminished Radix form.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_is\_modulus.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* determines if a number is a valid DR modulus */
-018   int mp_dr_is_modulus(mp_int *a)
-019   \{
-020      int ix;
-021   
-022      /* must be at least two digits */
-023      if (a->used < 2) \{
-024         return 0;
-025      \}
-026   
-027      /* must be of the form b**k - a [a <= b] so all
-028       * but the first digit must be equal to -1 (mod b).
-029       */
-030      for (ix = 1; ix < a->used; ix++) \{
-031          if (a->dp[ix] != MP_MASK) \{
-032             return 0;
-033          \}
-034      \}
-035      return 1;
-036   \}
-037   
-038   #endif
 \end{alltt}
 \end{small}
 
@@ -6966,48 +4568,6 @@ shift which makes the algorithm fairly inexpensive to use.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* reduces a modulo n where n is of the form 2**p - d */
-018   int
-019   mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
-020   \{
-021      mp_int q;
-022      int    p, res;
-023      
-024      if ((res = mp_init(&q)) != MP_OKAY) \{
-025         return res;
-026      \}
-027      
-028      p = mp_count_bits(n);    
-029   top:
-030      /* q = a/2**p, a = a mod 2**p */
-031      if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{
-032         goto ERR;
-033      \}
-034      
-035      if (d != 1) \{
-036         /* q = q * d */
-037         if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) \{ 
-038            goto ERR;
-039         \}
-040      \}
-041      
-042      /* a = a + q */
-043      if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{
-044         goto ERR;
-045      \}
-046      
-047      if (mp_cmp_mag(a, n) != MP_LT) \{
-048         s_mp_sub(a, n, a);
-049         goto top;
-050      \}
-051      
-052   ERR:
-053      mp_clear(&q);
-054      return res;
-055   \}
-056   
-057   #endif
 \end{alltt}
 \end{small}
 
@@ -7050,34 +4610,6 @@ is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit th
 \hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k\_setup.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* determines the setup value */
-018   int 
-019   mp_reduce_2k_setup(mp_int *a, mp_digit *d)
-020   \{
-021      int res, p;
-022      mp_int tmp;
-023      
-024      if ((res = mp_init(&tmp)) != MP_OKAY) \{
-025         return res;
-026      \}
-027      
-028      p = mp_count_bits(a);
-029      if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{
-030         mp_clear(&tmp);
-031         return res;
-032      \}
-033      
-034      if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{
-035         mp_clear(&tmp);
-036         return res;
-037      \}
-038      
-039      *d = tmp.dp[0];
-040      mp_clear(&tmp);
-041      return MP_OKAY;
-042   \}
-043   #endif
 \end{alltt}
 \end{small}
 
@@ -7122,38 +4654,6 @@ This algorithm quickly determines if a modulus is of the form required for algor
 \hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_is\_2k.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* determines if mp_reduce_2k can be used */
-018   int mp_reduce_is_2k(mp_int *a)
-019   \{
-020      int ix, iy, iw;
-021      mp_digit iz;
-022      
-023      if (a->used == 0) \{
-024         return 0;
-025      \} else if (a->used == 1) \{
-026         return 1;
-027      \} else if (a->used > 1) \{
-028         iy = mp_count_bits(a);
-029         iz = 1;
-030         iw = 1;
-031       
-032         /* Test every bit from the second digit up, must be 1 */
-033         for (ix = DIGIT_BIT; ix < iy; ix++) \{
-034             if ((a->dp[iw] & iz) == 0) \{
-035                return 0;
-036             \}
-037             iz <<= 1;
-038             if (iz > (mp_digit)MP_MASK) \{
-039                ++iw;
-040                iz = 1;
-041             \}
-042         \}
-043      \}
-044      return 1;
-045   \}
-046   
-047   #endif
 \end{alltt}
 \end{small}
 
@@ -7326,50 +4826,13 @@ iteration of the loop moves the bits of the exponent $b$ upwards to the most sig
 \hspace{-5.1mm}{\bf File}: bn\_mp\_expt\_d.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* calculate c = a**b  using a square-multiply algorithm */
-018   int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
-019   \{
-020     int     res, x;
-021     mp_int  g;
-022   
-023     if ((res = mp_init_copy (&g, a)) != MP_OKAY) \{
-024       return res;
-025     \}
-026   
-027     /* set initial result */
-028     mp_set (c, 1);
-029   
-030     for (x = 0; x < (int) DIGIT_BIT; x++) \{
-031       /* square */
-032       if ((res = mp_sqr (c, c)) != MP_OKAY) \{
-033         mp_clear (&g);
-034         return res;
-035       \}
-036   
-037       /* if the bit is set multiply */
-038       if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) \{
-039         if ((res = mp_mul (c, &g, c)) != MP_OKAY) \{
-040            mp_clear (&g);
-041            return res;
-042         \}
-043       \}
-044   
-045       /* shift to next bit */
-046       b <<= 1;
-047     \}
-048   
-049     mp_clear (&g);
-050     return MP_OKAY;
-051   \}
-052   #endif
 \end{alltt}
 \end{small}
 
-Line 28 sets the initial value of the result to $1$.  Next the loop on line 30 steps through each bit of the exponent starting from
-the most significant down towards the least significant. The invariant squaring operation placed on line 32 is performed first.  After 
+Line 29 sets the initial value of the result to $1$.  Next the loop on line 31 steps through each bit of the exponent starting from
+the most significant down towards the least significant. The invariant squaring operation placed on line 33 is performed first.  After 
 the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set.  The shift on line
-46 moves all of the bits of the exponent upwards towards the most significant location.  
+47 moves all of the bits of the exponent upwards towards the most significant location.  
 
 \section{$k$-ary Exponentiation}
 When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
@@ -7550,100 +5013,16 @@ algorithm since their arguments are essentially the same (\textit{two mp\_ints a
 \hspace{-5.1mm}{\bf File}: bn\_mp\_exptmod.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   
-018   /* this is a shell function that calls either the normal or Montgomery
-019    * exptmod functions.  Originally the call to the montgomery code was
-020    * embedded in the normal function but that wasted alot of stack space
-021    * for nothing (since 99% of the time the Montgomery code would be called)
-022    */
-023   int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-024   \{
-025     int dr;
-026   
-027     /* modulus P must be positive */
-028     if (P->sign == MP_NEG) \{
-029        return MP_VAL;
-030     \}
-031   
-032     /* if exponent X is negative we have to recurse */
-033     if (X->sign == MP_NEG) \{
-034   #ifdef BN_MP_INVMOD_C
-035        mp_int tmpG, tmpX;
-036        int err;
-037   
-038        /* first compute 1/G mod P */
-039        if ((err = mp_init(&tmpG)) != MP_OKAY) \{
-040           return err;
-041        \}
-042        if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) \{
-043           mp_clear(&tmpG);
-044           return err;
-045        \}
-046   
-047        /* now get |X| */
-048        if ((err = mp_init(&tmpX)) != MP_OKAY) \{
-049           mp_clear(&tmpG);
-050           return err;
-051        \}
-052        if ((err = mp_abs(X, &tmpX)) != MP_OKAY) \{
-053           mp_clear_multi(&tmpG, &tmpX, NULL);
-054           return err;
-055        \}
-056   
-057        /* and now compute (1/G)**|X| instead of G**X [X < 0] */
-058        err = mp_exptmod(&tmpG, &tmpX, P, Y);
-059        mp_clear_multi(&tmpG, &tmpX, NULL);
-060        return err;
-061   #else 
-062        /* no invmod */
-063        return MP_VAL;
-064   #endif
-065     \}
-066   
-067   #ifdef BN_MP_DR_IS_MODULUS_C
-068     /* is it a DR modulus? */
-069     dr = mp_dr_is_modulus(P);
-070   #else
-071     dr = 0;
-072   #endif
-073   
-074   #ifdef BN_MP_REDUCE_IS_2K_C
-075     /* if not, is it a uDR modulus? */
-076     if (dr == 0) \{
-077        dr = mp_reduce_is_2k(P) << 1;
-078     \}
-079   #endif
-080       
-081     /* if the modulus is odd or dr != 0 use the fast method */
-082   #ifdef BN_MP_EXPTMOD_FAST_C
-083     if (mp_isodd (P) == 1 || dr !=  0) \{
-084       return mp_exptmod_fast (G, X, P, Y, dr);
-085     \} else \{
-086   #endif
-087   #ifdef BN_S_MP_EXPTMOD_C
-088       /* otherwise use the generic Barrett reduction technique */
-089       return s_mp_exptmod (G, X, P, Y);
-090   #else
-091       /* no exptmod for evens */
-092       return MP_VAL;
-093   #endif
-094   #ifdef BN_MP_EXPTMOD_FAST_C
-095     \}
-096   #endif
-097   \}
-098   
-099   #endif
 \end{alltt}
 \end{small}
 
-In order to keep the algorithms in a known state the first step on line 28 is to reject any negative modulus as input.  If the exponent is
+In order to keep the algorithms in a known state the first step on line 29 is to reject any negative modulus as input.  If the exponent is
 negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
 the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recuse with these new values with a positive
 exponent.
 
-If the exponent is positive the algorithm resumes the exponentiation.  Line 69 determines if the modulus is of the restricted Diminished Radix 
-form.  If it is not line 77 attempts to determine if it is of a unrestricted Diminished Radix form.  The integer $dr$ will take on one
+If the exponent is positive the algorithm resumes the exponentiation.  Line 77 determines if the modulus is of the restricted Diminished Radix 
+form.  If it is not, line 70 attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
 of three values.
 
 \begin{enumerate}
@@ -7652,7 +5031,7 @@ of three values.
 \item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
 \end{enumerate}
 
-Line 67 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
+Line 69 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
 the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.  
 
 \subsection{Barrett Modular Exponentiation}
@@ -7813,238 +5192,18 @@ a Left-to-Right algorithm is used to process the remaining few bits.
 \hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   #ifdef MP_LOW_MEM
-018      #define TAB_SIZE 32
-019   #else
-020      #define TAB_SIZE 256
-021   #endif
-022   
-023   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-024   \{
-025     mp_int  M[TAB_SIZE], res, mu;
-026     mp_digit buf;
-027     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-028   
-029     /* find window size */
-030     x = mp_count_bits (X);
-031     if (x <= 7) \{
-032       winsize = 2;
-033     \} else if (x <= 36) \{
-034       winsize = 3;
-035     \} else if (x <= 140) \{
-036       winsize = 4;
-037     \} else if (x <= 450) \{
-038       winsize = 5;
-039     \} else if (x <= 1303) \{
-040       winsize = 6;
-041     \} else if (x <= 3529) \{
-042       winsize = 7;
-043     \} else \{
-044       winsize = 8;
-045     \}
-046   
-047   #ifdef MP_LOW_MEM
-048       if (winsize > 5) \{
-049          winsize = 5;
-050       \}
-051   #endif
-052   
-053     /* init M array */
-054     /* init first cell */
-055     if ((err = mp_init(&M[1])) != MP_OKAY) \{
-056        return err; 
-057     \}
-058   
-059     /* now init the second half of the array */
-060     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-061       if ((err = mp_init(&M[x])) != MP_OKAY) \{
-062         for (y = 1<<(winsize-1); y < x; y++) \{
-063           mp_clear (&M[y]);
-064         \}
-065         mp_clear(&M[1]);
-066         return err;
-067       \}
-068     \}
-069   
-070     /* create mu, used for Barrett reduction */
-071     if ((err = mp_init (&mu)) != MP_OKAY) \{
-072       goto LBL_M;
-073     \}
-074     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
-075       goto LBL_MU;
-076     \}
-077   
-078     /* create M table
-079      *
-080      * The M table contains powers of the base, 
-081      * e.g. M[x] = G**x mod P
-082      *
-083      * The first half of the table is not 
-084      * computed though accept for M[0] and M[1]
-085      */
-086     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
-087       goto LBL_MU;
-088     \}
-089   
-090     /* compute the value at M[1<<(winsize-1)] by squaring 
-091      * M[1] (winsize-1) times 
-092      */
-093     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
-094       goto LBL_MU;
-095     \}
-096   
-097     for (x = 0; x < (winsize - 1); x++) \{
-098       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
-099                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
-100         goto LBL_MU;
-101       \}
-102       if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
-103         goto LBL_MU;
-104       \}
-105     \}
-106   
-107     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-108      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-109      */
-110     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
-111       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
-112         goto LBL_MU;
-113       \}
-114       if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) \{
-115         goto LBL_MU;
-116       \}
-117     \}
-118   
-119     /* setup result */
-120     if ((err = mp_init (&res)) != MP_OKAY) \{
-121       goto LBL_MU;
-122     \}
-123     mp_set (&res, 1);
-124   
-125     /* set initial mode and bit cnt */
-126     mode   = 0;
-127     bitcnt = 1;
-128     buf    = 0;
-129     digidx = X->used - 1;
-130     bitcpy = 0;
-131     bitbuf = 0;
-132   
-133     for (;;) \{
-134       /* grab next digit as required */
-135       if (--bitcnt == 0) \{
-136         /* if digidx == -1 we are out of digits */
-137         if (digidx == -1) \{
-138           break;
-139         \}
-140         /* read next digit and reset the bitcnt */
-141         buf    = X->dp[digidx--];
-142         bitcnt = (int) DIGIT_BIT;
-143       \}
-144   
-145       /* grab the next msb from the exponent */
-146       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-147       buf <<= (mp_digit)1;
-148   
-149       /* if the bit is zero and mode == 0 then we ignore it
-150        * These represent the leading zero bits before the first 1 bit
-151        * in the exponent.  Technically this opt is not required but it
-152        * does lower the # of trivial squaring/reductions used
-153        */
-154       if (mode == 0 && y == 0) \{
-155         continue;
-156       \}
-157   
-158       /* if the bit is zero and mode == 1 then we square */
-159       if (mode == 1 && y == 0) \{
-160         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-161           goto LBL_RES;
-162         \}
-163         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-164           goto LBL_RES;
-165         \}
-166         continue;
-167       \}
-168   
-169       /* else we add it to the window */
-170       bitbuf |= (y << (winsize - ++bitcpy));
-171       mode    = 2;
-172   
-173       if (bitcpy == winsize) \{
-174         /* ok window is filled so square as required and multiply  */
-175         /* square first */
-176         for (x = 0; x < winsize; x++) \{
-177           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-178             goto LBL_RES;
-179           \}
-180           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-181             goto LBL_RES;
-182           \}
-183         \}
-184   
-185         /* then multiply */
-186         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
-187           goto LBL_RES;
-188         \}
-189         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-190           goto LBL_RES;
-191         \}
-192   
-193         /* empty window and reset */
-194         bitcpy = 0;
-195         bitbuf = 0;
-196         mode   = 1;
-197       \}
-198     \}
-199   
-200     /* if bits remain then square/multiply */
-201     if (mode == 2 && bitcpy > 0) \{
-202       /* square then multiply if the bit is set */
-203       for (x = 0; x < bitcpy; x++) \{
-204         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-205           goto LBL_RES;
-206         \}
-207         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-208           goto LBL_RES;
-209         \}
-210   
-211         bitbuf <<= 1;
-212         if ((bitbuf & (1 << winsize)) != 0) \{
-213           /* then multiply */
-214           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
-215             goto LBL_RES;
-216           \}
-217           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-218             goto LBL_RES;
-219           \}
-220         \}
-221       \}
-222     \}
-223   
-224     mp_exch (&res, Y);
-225     err = MP_OKAY;
-226   LBL_RES:mp_clear (&res);
-227   LBL_MU:mp_clear (&mu);
-228   LBL_M:
-229     mp_clear(&M[1]);
-230     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-231       mp_clear (&M[x]);
-232     \}
-233     return err;
-234   \}
-235   #endif
 \end{alltt}
 \end{small}
 
-Lines 31 through 41 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+Lines 32 through 46 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
 from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
-on line 33 the value of $x$ is already known to be greater than $140$.  
+on line 38 the value of $x$ is already known to be greater than $140$.  
 
-The conditional piece of code beginning on line 47 allows the window size to be restricted to five bits.  This logic is used to ensure
+The conditional piece of code beginning on line 48 allows the window size to be restricted to five bits.  This logic is used to ensure
 the table of precomputed powers of $G$ remains relatively small.  
 
-The for loop on line 60 initializes the $M$ array while lines 61 and 74 compute the value of $\mu$ required for
-Barrett reduction.  
+The for loop on line 61 initializes the $M$ array while lines 72 and 77 through 86 initialize the reduction
+function that will be used for this modulus.
 
 -- More later.
 
@@ -8078,34 +5237,6 @@ equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two ca
 \hspace{-5.1mm}{\bf File}: bn\_mp\_2expt.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* computes a = 2**b 
-018    *
-019    * Simple algorithm which zeroes the int, grows it then just sets one bit
-020    * as required.
-021    */
-022   int
-023   mp_2expt (mp_int * a, int b)
-024   \{
-025     int     res;
-026   
-027     /* zero a as per default */
-028     mp_zero (a);
-029   
-030     /* grow a to accomodate the single bit */
-031     if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) \{
-032       return res;
-033     \}
-034   
-035     /* set the used count of where the bit will go */
-036     a->used = b / DIGIT_BIT + 1;
-037   
-038     /* put the single bit in its place */
-039     a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
-040   
-041     return MP_OKAY;
-042   \}
-043   #endif
 \end{alltt}
 \end{small}
 
@@ -8354,278 +5485,6 @@ respectively be replaced with a zero.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_div.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   #ifdef BN_MP_DIV_SMALL
-018   
-019   /* slower bit-bang division... also smaller */
-020   int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-021   \{
-022      mp_int ta, tb, tq, q;
-023      int    res, n, n2;
-024   
-025     /* is divisor zero ? */
-026     if (mp_iszero (b) == 1) \{
-027       return MP_VAL;
-028     \}
-029   
-030     /* if a < b then q=0, r = a */
-031     if (mp_cmp_mag (a, b) == MP_LT) \{
-032       if (d != NULL) \{
-033         res = mp_copy (a, d);
-034       \} else \{
-035         res = MP_OKAY;
-036       \}
-037       if (c != NULL) \{
-038         mp_zero (c);
-039       \}
-040       return res;
-041     \}
-042       
-043     /* init our temps */
-044     if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL) != MP_OKAY)) \{
-045        return res;
-046     \}
-047   
-048   
-049     mp_set(&tq, 1);
-050     n = mp_count_bits(a) - mp_count_bits(b);
-051     if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
-052         ((res = mp_abs(b, &tb)) != MP_OKAY) || 
-053         ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
-054         ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) \{
-055         goto LBL_ERR;
-056     \}
-057   
-058     while (n-- >= 0) \{
-059        if (mp_cmp(&tb, &ta) != MP_GT) \{
-060           if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
-061               ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) \{
-062              goto LBL_ERR;
-063           \}
-064        \}
-065        if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
-066            ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) \{
-067              goto LBL_ERR;
-068        \}
-069     \}
-070   
-071     /* now q == quotient and ta == remainder */
-072     n  = a->sign;
-073     n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
-074     if (c != NULL) \{
-075        mp_exch(c, &q);
-076        c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
-077     \}
-078     if (d != NULL) \{
-079        mp_exch(d, &ta);
-080        d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
-081     \}
-082   LBL_ERR:
-083      mp_clear_multi(&ta, &tb, &tq, &q, NULL);
-084      return res;
-085   \}
-086   
-087   #else
-088   
-089   /* integer signed division. 
-090    * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
-091    * HAC pp.598 Algorithm 14.20
-092    *
-093    * Note that the description in HAC is horribly 
-094    * incomplete.  For example, it doesn't consider 
-095    * the case where digits are removed from 'x' in 
-096    * the inner loop.  It also doesn't consider the 
-097    * case that y has fewer than three digits, etc..
-098    *
-099    * The overall algorithm is as described as 
-100    * 14.20 from HAC but fixed to treat these cases.
-101   */
-102   int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-103   \{
-104     mp_int  q, x, y, t1, t2;
-105     int     res, n, t, i, norm, neg;
-106   
-107     /* is divisor zero ? */
-108     if (mp_iszero (b) == 1) \{
-109       return MP_VAL;
-110     \}
-111   
-112     /* if a < b then q=0, r = a */
-113     if (mp_cmp_mag (a, b) == MP_LT) \{
-114       if (d != NULL) \{
-115         res = mp_copy (a, d);
-116       \} else \{
-117         res = MP_OKAY;
-118       \}
-119       if (c != NULL) \{
-120         mp_zero (c);
-121       \}
-122       return res;
-123     \}
-124   
-125     if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) \{
-126       return res;
-127     \}
-128     q.used = a->used + 2;
-129   
-130     if ((res = mp_init (&t1)) != MP_OKAY) \{
-131       goto LBL_Q;
-132     \}
-133   
-134     if ((res = mp_init (&t2)) != MP_OKAY) \{
-135       goto LBL_T1;
-136     \}
-137   
-138     if ((res = mp_init_copy (&x, a)) != MP_OKAY) \{
-139       goto LBL_T2;
-140     \}
-141   
-142     if ((res = mp_init_copy (&y, b)) != MP_OKAY) \{
-143       goto LBL_X;
-144     \}
-145   
-146     /* fix the sign */
-147     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-148     x.sign = y.sign = MP_ZPOS;
-149   
-150     /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
-151     norm = mp_count_bits(&y) % DIGIT_BIT;
-152     if (norm < (int)(DIGIT_BIT-1)) \{
-153        norm = (DIGIT_BIT-1) - norm;
-154        if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) \{
-155          goto LBL_Y;
-156        \}
-157        if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) \{
-158          goto LBL_Y;
-159        \}
-160     \} else \{
-161        norm = 0;
-162     \}
-163   
-164     /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
-165     n = x.used - 1;
-166     t = y.used - 1;
-167   
-168     /* while (x >= y*b**n-t) do \{ q[n-t] += 1; x -= y*b**\{n-t\} \} */
-169     if ((res = mp_lshd (&y, n - t)) != MP_OKAY) \{ /* y = y*b**\{n-t\} */
-170       goto LBL_Y;
-171     \}
-172   
-173     while (mp_cmp (&x, &y) != MP_LT) \{
-174       ++(q.dp[n - t]);
-175       if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) \{
-176         goto LBL_Y;
-177       \}
-178     \}
-179   
-180     /* reset y by shifting it back down */
-181     mp_rshd (&y, n - t);
-182   
-183     /* step 3. for i from n down to (t + 1) */
-184     for (i = n; i >= (t + 1); i--) \{
-185       if (i > x.used) \{
-186         continue;
-187       \}
-188   
-189       /* step 3.1 if xi == yt then set q\{i-t-1\} to b-1, 
-190        * otherwise set q\{i-t-1\} to (xi*b + x\{i-1\})/yt */
-191       if (x.dp[i] == y.dp[t]) \{
-192         q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
-193       \} else \{
-194         mp_word tmp;
-195         tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
-196         tmp |= ((mp_word) x.dp[i - 1]);
-197         tmp /= ((mp_word) y.dp[t]);
-198         if (tmp > (mp_word) MP_MASK)
-199           tmp = MP_MASK;
-200         q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
-201       \}
-202   
-203       /* while (q\{i-t-1\} * (yt * b + y\{t-1\})) > 
-204                xi * b**2 + xi-1 * b + xi-2 
-205        
-206          do q\{i-t-1\} -= 1; 
-207       */
-208       q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
-209       do \{
-210         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
-211   
-212         /* find left hand */
-213         mp_zero (&t1);
-214         t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
-215         t1.dp[1] = y.dp[t];
-216         t1.used = 2;
-217         if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
-218           goto LBL_Y;
-219         \}
-220   
-221         /* find right hand */
-222         t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
-223         t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
-224         t2.dp[2] = x.dp[i];
-225         t2.used = 3;
-226       \} while (mp_cmp_mag(&t1, &t2) == MP_GT);
-227   
-228       /* step 3.3 x = x - q\{i-t-1\} * y * b**\{i-t-1\} */
-229       if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
-230         goto LBL_Y;
-231       \}
-232   
-233       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
-234         goto LBL_Y;
-235       \}
-236   
-237       if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) \{
-238         goto LBL_Y;
-239       \}
-240   
-241       /* if x < 0 then \{ x = x + y*b**\{i-t-1\}; q\{i-t-1\} -= 1; \} */
-242       if (x.sign == MP_NEG) \{
-243         if ((res = mp_copy (&y, &t1)) != MP_OKAY) \{
-244           goto LBL_Y;
-245         \}
-246         if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
-247           goto LBL_Y;
-248         \}
-249         if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) \{
-250           goto LBL_Y;
-251         \}
-252   
-253         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
-254       \}
-255     \}
-256   
-257     /* now q is the quotient and x is the remainder 
-258      * [which we have to normalize] 
-259      */
-260     
-261     /* get sign before writing to c */
-262     x.sign = x.used == 0 ? MP_ZPOS : a->sign;
-263   
-264     if (c != NULL) \{
-265       mp_clamp (&q);
-266       mp_exch (&q, c);
-267       c->sign = neg;
-268     \}
-269   
-270     if (d != NULL) \{
-271       mp_div_2d (&x, norm, &x, NULL);
-272       mp_exch (&x, d);
-273     \}
-274   
-275     res = MP_OKAY;
-276   
-277   LBL_Y:mp_clear (&y);
-278   LBL_X:mp_clear (&x);
-279   LBL_T2:mp_clear (&t2);
-280   LBL_T1:mp_clear (&t1);
-281   LBL_Q:mp_clear (&q);
-282     return res;
-283   \}
-284   
-285   #endif
-286   
-287   #endif
 \end{alltt}
 \end{small}
 
@@ -8637,23 +5496,23 @@ algorithm with only the quotient is
 mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
 \end{verbatim}
 
-Lines 37 and 44 handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
-respectively.  After the two trivial cases all of the temporary variables are initialized.  Line 105 determines the sign of 
-the quotient and line 76 ensures that both $x$ and $y$ are positive.  
+Lines 109 and 113 handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line 148 determines the sign of 
+the quotient and line 148 ensures that both $x$ and $y$ are positive.  
 
-The number of bits in the leading digit is calculated on line 105.  Implictly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+The number of bits in the leading digit is calculated on line 151.  Implicitly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
 of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
 exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
 them to the left by $lg(\beta) - 1 - k$ bits.
 
 Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the 
-leading digit of the quotient.  The loop beginning on line 183 will produce the remainder of the quotient digits.
+leading digit of the quotient.  The loop beginning on line 184 will produce the remainder of the quotient digits.
 
-The conditional ``continue'' on line 114 is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
+The conditional ``continue'' on line 187 is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
 algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
 above the $i$'th position $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.  
 
-Lines 130, 130 and 134 through 134 manually construct the high accuracy estimations by setting the digits of the two mp\_int 
+Lines 214, 216 and 223 through 225 manually construct the high accuracy estimations by setting the digits of the two mp\_int 
 variables directly.  
 
 \section{Single Digit Helpers}
@@ -8691,95 +5550,6 @@ This algorithm initiates a temporary mp\_int with the value of the single digit
 \hspace{-5.1mm}{\bf File}: bn\_mp\_add\_d.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* single digit addition */
-018   int
-019   mp_add_d (mp_int * a, mp_digit b, mp_int * c)
-020   \{
-021     int     res, ix, oldused;
-022     mp_digit *tmpa, *tmpc, mu;
-023   
-024     /* grow c as required */
-025     if (c->alloc < a->used + 1) \{
-026        if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) \{
-027           return res;
-028        \}
-029     \}
-030   
-031     /* if a is negative and |a| >= b, call c = |a| - b */
-032     if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) \{
-033        /* temporarily fix sign of a */
-034        a->sign = MP_ZPOS;
-035   
-036        /* c = |a| - b */
-037        res = mp_sub_d(a, b, c);
-038   
-039        /* fix sign  */
-040        a->sign = c->sign = MP_NEG;
-041   
-042        return res;
-043     \}
-044   
-045     /* old number of used digits in c */
-046     oldused = c->used;
-047   
-048     /* sign always positive */
-049     c->sign = MP_ZPOS;
-050   
-051     /* source alias */
-052     tmpa    = a->dp;
-053   
-054     /* destination alias */
-055     tmpc    = c->dp;
-056   
-057     /* if a is positive */
-058     if (a->sign == MP_ZPOS) \{
-059        /* add digit, after this we're propagating
-060         * the carry.
-061         */
-062        *tmpc   = *tmpa++ + b;
-063        mu      = *tmpc >> DIGIT_BIT;
-064        *tmpc++ &= MP_MASK;
-065   
-066        /* now handle rest of the digits */
-067        for (ix = 1; ix < a->used; ix++) \{
-068           *tmpc   = *tmpa++ + mu;
-069           mu      = *tmpc >> DIGIT_BIT;
-070           *tmpc++ &= MP_MASK;
-071        \}
-072        /* set final carry */
-073        ix++;
-074        *tmpc++  = mu;
-075   
-076        /* setup size */
-077        c->used = a->used + 1;
-078     \} else \{
-079        /* a was negative and |a| < b */
-080        c->used  = 1;
-081   
-082        /* the result is a single digit */
-083        if (a->used == 1) \{
-084           *tmpc++  =  b - a->dp[0];
-085        \} else \{
-086           *tmpc++  =  b;
-087        \}
-088   
-089        /* setup count so the clearing of oldused
-090         * can fall through correctly
-091         */
-092        ix       = 1;
-093     \}
-094   
-095     /* now zero to oldused */
-096     while (ix++ < oldused) \{
-097        *tmpc++ = 0;
-098     \}
-099     mp_clamp(c);
-100   
-101     return MP_OKAY;
-102   \}
-103   
-104   #endif
 \end{alltt}
 \end{small}
 
@@ -8830,64 +5600,6 @@ Unlike the full multiplication algorithms this algorithm does not require any si
 \hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_d.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* multiply by a digit */
-018   int
-019   mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
-020   \{
-021     mp_digit u, *tmpa, *tmpc;
-022     mp_word  r;
-023     int      ix, res, olduse;
-024   
-025     /* make sure c is big enough to hold a*b */
-026     if (c->alloc < a->used + 1) \{
-027       if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) \{
-028         return res;
-029       \}
-030     \}
-031   
-032     /* get the original destinations used count */
-033     olduse = c->used;
-034   
-035     /* set the sign */
-036     c->sign = a->sign;
-037   
-038     /* alias for a->dp [source] */
-039     tmpa = a->dp;
-040   
-041     /* alias for c->dp [dest] */
-042     tmpc = c->dp;
-043   
-044     /* zero carry */
-045     u = 0;
-046   
-047     /* compute columns */
-048     for (ix = 0; ix < a->used; ix++) \{
-049       /* compute product and carry sum for this term */
-050       r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
-051   
-052       /* mask off higher bits to get a single digit */
-053       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
-054   
-055       /* send carry into next iteration */
-056       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-057     \}
-058   
-059     /* store final carry [if any] */
-060     *tmpc++ = u;
-061   
-062     /* now zero digits above the top */
-063     while (ix++ < olduse) \{
-064        *tmpc++ = 0;
-065     \}
-066   
-067     /* set used count */
-068     c->used = a->used + 1;
-069     mp_clamp(c);
-070   
-071     return MP_OKAY;
-072   \}
-073   #endif
 \end{alltt}
 \end{small}
 
@@ -8943,103 +5655,13 @@ from chapter seven.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_div\_d.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   static int s_is_power_of_two(mp_digit b, int *p)
-018   \{
-019      int x;
-020   
-021      for (x = 1; x < DIGIT_BIT; x++) \{
-022         if (b == (((mp_digit)1)<<x)) \{
-023            *p = x;
-024            return 1;
-025         \}
-026      \}
-027      return 0;
-028   \}
-029   
-030   /* single digit division (based on routine from MPI) */
-031   int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
-032   \{
-033     mp_int  q;
-034     mp_word w;
-035     mp_digit t;
-036     int     res, ix;
-037   
-038     /* cannot divide by zero */
-039     if (b == 0) \{
-040        return MP_VAL;
-041     \}
-042   
-043     /* quick outs */
-044     if (b == 1 || mp_iszero(a) == 1) \{
-045        if (d != NULL) \{
-046           *d = 0;
-047        \}
-048        if (c != NULL) \{
-049           return mp_copy(a, c);
-050        \}
-051        return MP_OKAY;
-052     \}
-053   
-054     /* power of two ? */
-055     if (s_is_power_of_two(b, &ix) == 1) \{
-056        if (d != NULL) \{
-057           *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
-058        \}
-059        if (c != NULL) \{
-060           return mp_div_2d(a, ix, c, NULL);
-061        \}
-062        return MP_OKAY;
-063     \}
-064   
-065   #ifdef BN_MP_DIV_3_C
-066     /* three? */
-067     if (b == 3) \{
-068        return mp_div_3(a, c, d);
-069     \}
-070   #endif
-071   
-072     /* no easy answer [c'est la vie].  Just division */
-073     if ((res = mp_init_size(&q, a->used)) != MP_OKAY) \{
-074        return res;
-075     \}
-076     
-077     q.used = a->used;
-078     q.sign = a->sign;
-079     w = 0;
-080     for (ix = a->used - 1; ix >= 0; ix--) \{
-081        w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
-082        
-083        if (w >= b) \{
-084           t = (mp_digit)(w / b);
-085           w -= ((mp_word)t) * ((mp_word)b);
-086         \} else \{
-087           t = 0;
-088         \}
-089         q.dp[ix] = (mp_digit)t;
-090     \}
-091     
-092     if (d != NULL) \{
-093        *d = (mp_digit)w;
-094     \}
-095     
-096     if (c != NULL) \{
-097        mp_clamp(&q);
-098        mp_exch(&q, c);
-099     \}
-100     mp_clear(&q);
-101     
-102     return res;
-103   \}
-104   
-105   #endif
 \end{alltt}
 \end{small}
 
 Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to
 indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d to be created.
 
-The division and remainder on lines 43 and @45,%@ can be replaced often by a single division on most processors.  For example, the 32-bit x86 based 
+The division and remainder on lines 44 and @45,%@ can be replaced often by a single division on most processors.  For example, the 32-bit x86 based 
 processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC 
 compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.  
 
@@ -9107,118 +5729,6 @@ root.  Ideally this algorithm is meant to find the $n$'th root of an input where
 \hspace{-5.1mm}{\bf File}: bn\_mp\_n\_root.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* find the n'th root of an integer 
-018    *
-019    * Result found such that (c)**b <= a and (c+1)**b > a 
-020    *
-021    * This algorithm uses Newton's approximation 
-022    * x[i+1] = x[i] - f(x[i])/f'(x[i]) 
-023    * which will find the root in log(N) time where 
-024    * each step involves a fair bit.  This is not meant to 
-025    * find huge roots [square and cube, etc].
-026    */
-027   int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
-028   \{
-029     mp_int  t1, t2, t3;
-030     int     res, neg;
-031   
-032     /* input must be positive if b is even */
-033     if ((b & 1) == 0 && a->sign == MP_NEG) \{
-034       return MP_VAL;
-035     \}
-036   
-037     if ((res = mp_init (&t1)) != MP_OKAY) \{
-038       return res;
-039     \}
-040   
-041     if ((res = mp_init (&t2)) != MP_OKAY) \{
-042       goto LBL_T1;
-043     \}
-044   
-045     if ((res = mp_init (&t3)) != MP_OKAY) \{
-046       goto LBL_T2;
-047     \}
-048   
-049     /* if a is negative fudge the sign but keep track */
-050     neg     = a->sign;
-051     a->sign = MP_ZPOS;
-052   
-053     /* t2 = 2 */
-054     mp_set (&t2, 2);
-055   
-056     do \{
-057       /* t1 = t2 */
-058       if ((res = mp_copy (&t2, &t1)) != MP_OKAY) \{
-059         goto LBL_T3;
-060       \}
-061   
-062       /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
-063       
-064       /* t3 = t1**(b-1) */
-065       if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) \{   
-066         goto LBL_T3;
-067       \}
-068   
-069       /* numerator */
-070       /* t2 = t1**b */
-071       if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) \{    
-072         goto LBL_T3;
-073       \}
-074   
-075       /* t2 = t1**b - a */
-076       if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) \{  
-077         goto LBL_T3;
-078       \}
-079   
-080       /* denominator */
-081       /* t3 = t1**(b-1) * b  */
-082       if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) \{    
-083         goto LBL_T3;
-084       \}
-085   
-086       /* t3 = (t1**b - a)/(b * t1**(b-1)) */
-087       if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) \{  
-088         goto LBL_T3;
-089       \}
-090   
-091       if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) \{
-092         goto LBL_T3;
-093       \}
-094     \}  while (mp_cmp (&t1, &t2) != MP_EQ);
-095   
-096     /* result can be off by a few so check */
-097     for (;;) \{
-098       if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) \{
-099         goto LBL_T3;
-100       \}
-101   
-102       if (mp_cmp (&t2, a) == MP_GT) \{
-103         if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) \{
-104            goto LBL_T3;
-105         \}
-106       \} else \{
-107         break;
-108       \}
-109     \}
-110   
-111     /* reset the sign of a first */
-112     a->sign = neg;
-113   
-114     /* set the result */
-115     mp_exch (&t1, c);
-116   
-117     /* set the sign of the result */
-118     c->sign = neg;
-119   
-120     res = MP_OKAY;
-121   
-122   LBL_T3:mp_clear (&t3);
-123   LBL_T2:mp_clear (&t2);
-124   LBL_T1:mp_clear (&t1);
-125     return res;
-126   \}
-127   #endif
 \end{alltt}
 \end{small}
 
@@ -9260,41 +5770,6 @@ the integers from $0$ to $\beta - 1$.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_rand.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* makes a pseudo-random int of a given size */
-018   int
-019   mp_rand (mp_int * a, int digits)
-020   \{
-021     int     res;
-022     mp_digit d;
-023   
-024     mp_zero (a);
-025     if (digits <= 0) \{
-026       return MP_OKAY;
-027     \}
-028   
-029     /* first place a random non-zero digit */
-030     do \{
-031       d = ((mp_digit) abs (rand ()));
-032     \} while (d == 0);
-033   
-034     if ((res = mp_add_d (a, d, a)) != MP_OKAY) \{
-035       return res;
-036     \}
-037   
-038     while (digits-- > 0) \{
-039       if ((res = mp_lshd (a, 1)) != MP_OKAY) \{
-040         return res;
-041       \}
-042   
-043       if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) \{
-044         return res;
-045       \}
-046     \}
-047   
-048     return MP_OKAY;
-049   \}
-050   #endif
 \end{alltt}
 \end{small}
 
@@ -9377,68 +5852,6 @@ as part of larger input without any significant problem.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_read\_radix.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* read a string [ASCII] in a given radix */
-018   int mp_read_radix (mp_int * a, char *str, int radix)
-019   \{
-020     int     y, res, neg;
-021     char    ch;
-022   
-023     /* make sure the radix is ok */
-024     if (radix < 2 || radix > 64) \{
-025       return MP_VAL;
-026     \}
-027   
-028     /* if the leading digit is a 
-029      * minus set the sign to negative. 
-030      */
-031     if (*str == '-') \{
-032       ++str;
-033       neg = MP_NEG;
-034     \} else \{
-035       neg = MP_ZPOS;
-036     \}
-037   
-038     /* set the integer to the default of zero */
-039     mp_zero (a);
-040     
-041     /* process each digit of the string */
-042     while (*str) \{
-043       /* if the radix < 36 the conversion is case insensitive
-044        * this allows numbers like 1AB and 1ab to represent the same  value
-045        * [e.g. in hex]
-046        */
-047       ch = (char) ((radix < 36) ? toupper (*str) : *str);
-048       for (y = 0; y < 64; y++) \{
-049         if (ch == mp_s_rmap[y]) \{
-050            break;
-051         \}
-052       \}
-053   
-054       /* if the char was found in the map 
-055        * and is less than the given radix add it
-056        * to the number, otherwise exit the loop. 
-057        */
-058       if (y < radix) \{
-059         if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) \{
-060            return res;
-061         \}
-062         if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) \{
-063            return res;
-064         \}
-065       \} else \{
-066         break;
-067       \}
-068       ++str;
-069     \}
-070     
-071     /* set the sign only if a != 0 */
-072     if (mp_iszero(a) != 1) \{
-073        a->sign = neg;
-074     \}
-075     return MP_OKAY;
-076   \}
-077   #endif
 \end{alltt}
 \end{small}
 
@@ -9503,61 +5916,6 @@ are required instead of a series of $n \times k$ divisions.  One design flaw of
 \hspace{-5.1mm}{\bf File}: bn\_mp\_toradix.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* stores a bignum as a ASCII string in a given radix (2..64) */
-018   int mp_toradix (mp_int * a, char *str, int radix)
-019   \{
-020     int     res, digs;
-021     mp_int  t;
-022     mp_digit d;
-023     char   *_s = str;
-024   
-025     /* check range of the radix */
-026     if (radix < 2 || radix > 64) \{
-027       return MP_VAL;
-028     \}
-029   
-030     /* quick out if its zero */
-031     if (mp_iszero(a) == 1) \{
-032        *str++ = '0';
-033        *str = '\symbol{92}0';
-034        return MP_OKAY;
-035     \}
-036   
-037     if ((res = mp_init_copy (&t, a)) != MP_OKAY) \{
-038       return res;
-039     \}
-040   
-041     /* if it is negative output a - */
-042     if (t.sign == MP_NEG) \{
-043       ++_s;
-044       *str++ = '-';
-045       t.sign = MP_ZPOS;
-046     \}
-047   
-048     digs = 0;
-049     while (mp_iszero (&t) == 0) \{
-050       if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) \{
-051         mp_clear (&t);
-052         return res;
-053       \}
-054       *str++ = mp_s_rmap[d];
-055       ++digs;
-056     \}
-057   
-058     /* reverse the digits of the string.  In this case _s points
-059      * to the first digit [exluding the sign] of the number]
-060      */
-061     bn_reverse ((unsigned char *)_s, digs);
-062   
-063     /* append a NULL so the string is properly terminated */
-064     *str = '\symbol{92}0';
-065   
-066     mp_clear (&t);
-067     return MP_OKAY;
-068   \}
-069   
-070   #endif
 \end{alltt}
 \end{small}
 
@@ -9687,33 +6045,30 @@ and will produce the greatest common divisor.
 \textbf{Input}.   mp\_int $a$ and $b$ \\
 \textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
 \hline \\
-1.  If $a = 0$ and $b \ne 0$ then \\
-\hspace{3mm}1.1  $c \leftarrow b$ \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow \vert b \vert $ \\
 \hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $a \ne 0$ and $b = 0$ then \\
-\hspace{3mm}2.1  $c \leftarrow a$ \\
+2.  If $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow \vert a \vert $ \\
 \hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
-3.  If $a = b = 0$ then \\
-\hspace{3mm}3.1  $c \leftarrow 1$ \\
-\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
-4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
-5.  $k \leftarrow 0$ \\
-6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
-\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-9.  While $v.used > 0$ \\
-\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
-\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
-\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
-\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-10.  $c \leftarrow u \cdot 2^k$ \\
-11.  Return(\textit{MP\_OKAY}). \\
+3.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+4.  $k \leftarrow 0$ \\
+5.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}5.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+8.  While $v.used > 0$ \\
+\hspace{3mm}8.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}8.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}8.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}8.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}8.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  $c \leftarrow u \cdot 2^k$ \\
+10.  Return(\textit{MP\_OKAY}). \\
 \hline
 \end{tabular}
 \end{center}
@@ -9725,17 +6080,17 @@ This algorithm will produce the greatest common divisor of two mp\_ints $a$ and
 Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
 Algorithm B and in practice this appears to be true.  
 
-The first three steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
+The first two steps handle the cases where either one or both of the inputs are zero.  If either input is zero the greatest common divisor is the 
 largest input or zero if they are both zero.  If the inputs are not trivial than $u$ and $v$ are assigned the absolute values of 
 $a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
 
-Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step two is no longer a
+Step five will divide out any common factors of two and keep track of the count in the variable $k$.  After this step, two is no longer a
 factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Step 
-seven and eight ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while loops will iterate since 
+six and seven ensure that $u$ and $v$ respectively have no more factors of two.  At most only one of the while--loops will iterate since 
 they cannot both be even.
 
-By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
-or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a positive and even result.  Step 9.3 removes any
+By step eight both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
+or greater than $u$.  This ensures that the subtraction on step 8.2 will always produce a positive and even result.  Step 8.3 removes any
 factors of two from the difference $u$ to ensure that in the next iteration of the loop both are once again odd.
 
 After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
@@ -9745,116 +6100,23 @@ must be adjusted by multiplying by the common factors of two ($2^k$) removed ear
 \hspace{-5.1mm}{\bf File}: bn\_mp\_gcd.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* Greatest Common Divisor using the binary method */
-018   int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     mp_int  u, v;
-021     int     k, u_lsb, v_lsb, res;
-022   
-023     /* either zero than gcd is the largest */
-024     if (mp_iszero (a) == 1 && mp_iszero (b) == 0) \{
-025       return mp_abs (b, c);
-026     \}
-027     if (mp_iszero (a) == 0 && mp_iszero (b) == 1) \{
-028       return mp_abs (a, c);
-029     \}
-030   
-031     /* optimized.  At this point if a == 0 then
-032      * b must equal zero too
-033      */
-034     if (mp_iszero (a) == 1) \{
-035       mp_zero(c);
-036       return MP_OKAY;
-037     \}
-038   
-039     /* get copies of a and b we can modify */
-040     if ((res = mp_init_copy (&u, a)) != MP_OKAY) \{
-041       return res;
-042     \}
-043   
-044     if ((res = mp_init_copy (&v, b)) != MP_OKAY) \{
-045       goto LBL_U;
-046     \}
-047   
-048     /* must be positive for the remainder of the algorithm */
-049     u.sign = v.sign = MP_ZPOS;
-050   
-051     /* B1.  Find the common power of two for u and v */
-052     u_lsb = mp_cnt_lsb(&u);
-053     v_lsb = mp_cnt_lsb(&v);
-054     k     = MIN(u_lsb, v_lsb);
-055   
-056     if (k > 0) \{
-057        /* divide the power of two out */
-058        if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) \{
-059           goto LBL_V;
-060        \}
-061   
-062        if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) \{
-063           goto LBL_V;
-064        \}
-065     \}
-066   
-067     /* divide any remaining factors of two out */
-068     if (u_lsb != k) \{
-069        if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) \{
-070           goto LBL_V;
-071        \}
-072     \}
-073   
-074     if (v_lsb != k) \{
-075        if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) \{
-076           goto LBL_V;
-077        \}
-078     \}
-079   
-080     while (mp_iszero(&v) == 0) \{
-081        /* make sure v is the largest */
-082        if (mp_cmp_mag(&u, &v) == MP_GT) \{
-083           /* swap u and v to make sure v is >= u */
-084           mp_exch(&u, &v);
-085        \}
-086        
-087        /* subtract smallest from largest */
-088        if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) \{
-089           goto LBL_V;
-090        \}
-091        
-092        /* Divide out all factors of two */
-093        if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) \{
-094           goto LBL_V;
-095        \} 
-096     \} 
-097   
-098     /* multiply by 2**k which we divided out at the beginning */
-099     if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) \{
-100        goto LBL_V;
-101     \}
-102     c->sign = MP_ZPOS;
-103     res = MP_OKAY;
-104   LBL_V:mp_clear (&u);
-105   LBL_U:mp_clear (&v);
-106     return res;
-107   \}
-108   #endif
 \end{alltt}
 \end{small}
 
 This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the 
 integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
 it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three 
-trivial cases of inputs are handled on lines 24 through 37.  After those lines the inputs are assumed to be non-zero.
+trivial cases of inputs are handled on lines 24 through 30.  After those lines the inputs are assumed to be non-zero.
 
-Lines 34 and 40 make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
-must be divided out of the two inputs.  The while loop on line 80 iterates so long as both are even.  The local integer $k$ is used to
-keep track of how many factors of $2$ are pulled out of both values.  It is assumed that the number of factors will not exceed the maximum 
-value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more than entries than are accessible by an ``int'' so this is not 
-a limitation.}.  
+Lines 32 and 37 make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
+must be divided out of the two inputs.  The block starting at line 44 removes common factors of two by first counting the number of trailing
+zero bits in both.  The local integer $k$ is used to keep track of how many factors of $2$ are pulled out of both values.  It is assumed that 
+the number of factors will not exceed the maximum value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more 
+entries than are accessible by an ``int'' so this is not a limitation.}.  
 
-At this point there are no more common factors of two in the two values.  The while loops on lines 80 and 80 remove any independent
-factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
-on line 80 performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
+At this point there are no more common factors of two in the two values.  The divisions by a power of two on lines 62 and 68 remove 
+any independent factors of two such that both $u$ and $v$ are guaranteed to be odd integers before hitting the main body of the algorithm.  The while loop
+on line 73 performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
 place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
 
 \section{Least Common Multiple}
@@ -9893,46 +6155,6 @@ dividing the product of the two inputs by their greatest common divisor.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_lcm.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* computes least common multiple as |a*b|/(a, b) */
-018   int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     int     res;
-021     mp_int  t1, t2;
-022   
-023   
-024     if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) \{
-025       return res;
-026     \}
-027   
-028     /* t1 = get the GCD of the two inputs */
-029     if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) \{
-030       goto LBL_T;
-031     \}
-032   
-033     /* divide the smallest by the GCD */
-034     if (mp_cmp_mag(a, b) == MP_LT) \{
-035        /* store quotient in t2 such that t2 * b is the LCM */
-036        if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) \{
-037           goto LBL_T;
-038        \}
-039        res = mp_mul(b, &t2, c);
-040     \} else \{
-041        /* store quotient in t2 such that t2 * a is the LCM */
-042        if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) \{
-043           goto LBL_T;
-044        \}
-045        res = mp_mul(a, &t2, c);
-046     \}
-047   
-048     /* fix the sign to positive */
-049     c->sign = MP_ZPOS;
-050   
-051   LBL_T:
-052     mp_clear_multi (&t1, &t2, NULL);
-053     return res;
-054   \}
-055   #endif
 \end{alltt}
 \end{small}
 
@@ -9941,6 +6163,8 @@ To explain the Jacobi Symbol we shall first discuss the Legendre function\footno
 defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
 equivalent to equation \ref{eqn:legendre}.
 
+\textit{-- Tom, don't be an ass, cite your source here...!}
+
 \begin{equation}
 a^{(p-1)/2} \equiv \begin{array}{rl}
                               -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
@@ -10090,91 +6314,6 @@ $\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi
 \hspace{-5.1mm}{\bf File}: bn\_mp\_jacobi.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* computes the jacobi c = (a | n) (or Legendre if n is prime)
-018    * HAC pp. 73 Algorithm 2.149
-019    */
-020   int mp_jacobi (mp_int * a, mp_int * p, int *c)
-021   \{
-022     mp_int  a1, p1;
-023     int     k, s, r, res;
-024     mp_digit residue;
-025   
-026     /* if p <= 0 return MP_VAL */
-027     if (mp_cmp_d(p, 0) != MP_GT) \{
-028        return MP_VAL;
-029     \}
-030   
-031     /* step 1.  if a == 0, return 0 */
-032     if (mp_iszero (a) == 1) \{
-033       *c = 0;
-034       return MP_OKAY;
-035     \}
-036   
-037     /* step 2.  if a == 1, return 1 */
-038     if (mp_cmp_d (a, 1) == MP_EQ) \{
-039       *c = 1;
-040       return MP_OKAY;
-041     \}
-042   
-043     /* default */
-044     s = 0;
-045   
-046     /* step 3.  write a = a1 * 2**k  */
-047     if ((res = mp_init_copy (&a1, a)) != MP_OKAY) \{
-048       return res;
-049     \}
-050   
-051     if ((res = mp_init (&p1)) != MP_OKAY) \{
-052       goto LBL_A1;
-053     \}
-054   
-055     /* divide out larger power of two */
-056     k = mp_cnt_lsb(&a1);
-057     if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) \{
-058        goto LBL_P1;
-059     \}
-060   
-061     /* step 4.  if e is even set s=1 */
-062     if ((k & 1) == 0) \{
-063       s = 1;
-064     \} else \{
-065       /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
-066       residue = p->dp[0] & 7;
-067   
-068       if (residue == 1 || residue == 7) \{
-069         s = 1;
-070       \} else if (residue == 3 || residue == 5) \{
-071         s = -1;
-072       \}
-073     \}
-074   
-075     /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-076     if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) \{
-077       s = -s;
-078     \}
-079   
-080     /* if a1 == 1 we're done */
-081     if (mp_cmp_d (&a1, 1) == MP_EQ) \{
-082       *c = s;
-083     \} else \{
-084       /* n1 = n mod a1 */
-085       if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) \{
-086         goto LBL_P1;
-087       \}
-088       if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) \{
-089         goto LBL_P1;
-090       \}
-091       *c = s * r;
-092     \}
-093   
-094     /* done */
-095     res = MP_OKAY;
-096   LBL_P1:mp_clear (&p1);
-097   LBL_A1:mp_clear (&a1);
-098     return res;
-099   \}
-100   #endif
 \end{alltt}
 \end{small}
 
@@ -10189,9 +6328,9 @@ After a local copy of $a$ is made all of the factors of two are divided out and
 bit of $k$ is required, however, it makes the algorithm simpler to follow to perform an addition. In practice an exclusive-or and addition have the same 
 processor requirements and neither is faster than the other.
 
-Line 61 through 70 determines the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero than
+Line 58 through 71 determines the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero than
 $k$ is even and the value is one.  Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight.  The value of
+$(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against $s$ on lines 71 through 74.  
+$(-1)^{(p-1)(a'-1)/4}$ is compute and multiplied against $s$ on lines 71 through 74.  
 
 Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$.  
 
@@ -10300,29 +6439,6 @@ then only a couple of additions or subtractions will be required to adjust the i
 \hspace{-5.1mm}{\bf File}: bn\_mp\_invmod.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* hac 14.61, pp608 */
-018   int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     /* b cannot be negative */
-021     if (b->sign == MP_NEG || mp_iszero(b) == 1) \{
-022       return MP_VAL;
-023     \}
-024   
-025   #ifdef BN_FAST_MP_INVMOD_C
-026     /* if the modulus is odd we can use a faster routine instead */
-027     if (mp_isodd (b) == 1) \{
-028       return fast_mp_invmod (a, b, c);
-029     \}
-030   #endif
-031   
-032   #ifdef BN_MP_INVMOD_SLOW_C
-033     return mp_invmod_slow(a, b, c);
-034   #endif
-035   
-036     return MP_VAL;
-037   \}
-038   #endif
 \end{alltt}
 \end{small}
 
@@ -10394,36 +6510,6 @@ This algorithm attempts to determine if a candidate integer $n$ is composite by
 \hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_is\_divisible.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* determines if an integers is divisible by one 
-018    * of the first PRIME_SIZE primes or not
-019    *
-020    * sets result to 0 if not, 1 if yes
-021    */
-022   int mp_prime_is_divisible (mp_int * a, int *result)
-023   \{
-024     int     err, ix;
-025     mp_digit res;
-026   
-027     /* default to not */
-028     *result = MP_NO;
-029   
-030     for (ix = 0; ix < PRIME_SIZE; ix++) \{
-031       /* what is a mod LBL_prime_tab[ix] */
-032       if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) \{
-033         return err;
-034       \}
-035   
-036       /* is the residue zero? */
-037       if (res == 0) \{
-038         *result = MP_YES;
-039         return MP_OKAY;
-040       \}
-041     \}
-042   
-043     return MP_OKAY;
-044   \}
-045   #endif
 \end{alltt}
 \end{small}
 
@@ -10434,47 +6520,6 @@ mp\_digit.  The table \_\_prime\_tab is defined in the following file.
 \hspace{-5.1mm}{\bf File}: bn\_prime\_tab.c
 \vspace{-3mm}
 \begin{alltt}
-016   const mp_digit ltm_prime_tab[] = \{
-017     0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
-018     0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
-019     0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-020     0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
-021   #ifndef MP_8BIT
-022     0x0083,
-023     0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
-024     0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
-025     0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
-026     0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
-027   
-028     0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
-029     0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
-030     0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
-031     0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
-032     0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
-033     0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
-034     0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
-035     0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
-036   
-037     0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
-038     0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
-039     0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
-040     0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
-041     0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
-042     0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
-043     0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
-044     0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
-045   
-046     0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
-047     0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
-048     0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
-049     0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
-050     0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
-051     0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
-052     0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
-053     0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
-054   #endif
-055   \};
-056   #endif
 \end{alltt}
 \end{small}
 
@@ -10521,48 +6566,6 @@ determine the result.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_fermat.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* performs one Fermat test.
-018    * 
-019    * If "a" were prime then b**a == b (mod a) since the order of
-020    * the multiplicative sub-group would be phi(a) = a-1.  That means
-021    * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
-022    *
-023    * Sets result to 1 if the congruence holds, or zero otherwise.
-024    */
-025   int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
-026   \{
-027     mp_int  t;
-028     int     err;
-029   
-030     /* default to composite  */
-031     *result = MP_NO;
-032   
-033     /* ensure b > 1 */
-034     if (mp_cmp_d(b, 1) != MP_GT) \{
-035        return MP_VAL;
-036     \}
-037   
-038     /* init t */
-039     if ((err = mp_init (&t)) != MP_OKAY) \{
-040       return err;
-041     \}
-042   
-043     /* compute t = b**a mod a */
-044     if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) \{
-045       goto LBL_T;
-046     \}
-047   
-048     /* is it equal to b? */
-049     if (mp_cmp (&t, b) == MP_EQ) \{
-050       *result = MP_YES;
-051     \}
-052   
-053     err = MP_OKAY;
-054   LBL_T:mp_clear (&t);
-055     return err;
-056   \}
-057   #endif
 \end{alltt}
 \end{small}
 
@@ -10615,89 +6618,6 @@ composite then it is \textit{probably} prime.
 \hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_miller\_rabin.c
 \vspace{-3mm}
 \begin{alltt}
-016   
-017   /* Miller-Rabin test of "a" to the base of "b" as described in 
-018    * HAC pp. 139 Algorithm 4.24
-019    *
-020    * Sets result to 0 if definitely composite or 1 if probably prime.
-021    * Randomly the chance of error is no more than 1/4 and often 
-022    * very much lower.
-023    */
-024   int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
-025   \{
-026     mp_int  n1, y, r;
-027     int     s, j, err;
-028   
-029     /* default */
-030     *result = MP_NO;
-031   
-032     /* ensure b > 1 */
-033     if (mp_cmp_d(b, 1) != MP_GT) \{
-034        return MP_VAL;
-035     \}     
-036   
-037     /* get n1 = a - 1 */
-038     if ((err = mp_init_copy (&n1, a)) != MP_OKAY) \{
-039       return err;
-040     \}
-041     if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) \{
-042       goto LBL_N1;
-043     \}
-044   
-045     /* set 2**s * r = n1 */
-046     if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) \{
-047       goto LBL_N1;
-048     \}
-049   
-050     /* count the number of least significant bits
-051      * which are zero
-052      */
-053     s = mp_cnt_lsb(&r);
-054   
-055     /* now divide n - 1 by 2**s */
-056     if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) \{
-057       goto LBL_R;
-058     \}
-059   
-060     /* compute y = b**r mod a */
-061     if ((err = mp_init (&y)) != MP_OKAY) \{
-062       goto LBL_R;
-063     \}
-064     if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) \{
-065       goto LBL_Y;
-066     \}
-067   
-068     /* if y != 1 and y != n1 do */
-069     if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) \{
-070       j = 1;
-071       /* while j <= s-1 and y != n1 */
-072       while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) \{
-073         if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) \{
-074            goto LBL_Y;
-075         \}
-076   
-077         /* if y == 1 then composite */
-078         if (mp_cmp_d (&y, 1) == MP_EQ) \{
-079            goto LBL_Y;
-080         \}
-081   
-082         ++j;
-083       \}
-084   
-085       /* if y != n1 then composite */
-086       if (mp_cmp (&y, &n1) != MP_EQ) \{
-087         goto LBL_Y;
-088       \}
-089     \}
-090   
-091     /* probably prime now */
-092     *result = MP_YES;
-093   LBL_Y:mp_clear (&y);
-094   LBL_R:mp_clear (&r);
-095   LBL_N1:mp_clear (&n1);
-096     return err;
-097   \}
-098   #endif
 \end{alltt}
 \end{small}
 
diff --git a/libtommath/tommath_class.h b/libtommath/tommath_class.h
index 53bfa31..b9cc902 100644
--- a/libtommath/tommath_class.h
+++ b/libtommath/tommath_class.h
@@ -90,8 +90,11 @@
 #define BN_MP_READ_UNSIGNED_BIN_C
 #define BN_MP_REDUCE_C
 #define BN_MP_REDUCE_2K_C
+#define BN_MP_REDUCE_2K_L_C
 #define BN_MP_REDUCE_2K_SETUP_C
+#define BN_MP_REDUCE_2K_SETUP_L_C
 #define BN_MP_REDUCE_IS_2K_C
+#define BN_MP_REDUCE_IS_2K_L_C
 #define BN_MP_REDUCE_SETUP_C
 #define BN_MP_RSHD_C
 #define BN_MP_SET_C
@@ -105,7 +108,9 @@
 #define BN_MP_SUB_D_C
 #define BN_MP_SUBMOD_C
 #define BN_MP_TO_SIGNED_BIN_C
+#define BN_MP_TO_SIGNED_BIN_N_C
 #define BN_MP_TO_UNSIGNED_BIN_C
+#define BN_MP_TO_UNSIGNED_BIN_N_C
 #define BN_MP_TOOM_MUL_C
 #define BN_MP_TOOM_SQR_C
 #define BN_MP_TORADIX_C
@@ -132,7 +137,7 @@
    #define BN_MP_ISEVEN_C
    #define BN_MP_INIT_MULTI_C
    #define BN_MP_COPY_C
-   #define BN_MP_ABS_C
+   #define BN_MP_MOD_C
    #define BN_MP_SET_C
    #define BN_MP_DIV_2_C
    #define BN_MP_ISODD_C
@@ -324,11 +329,12 @@
    #define BN_MP_CLEAR_C
    #define BN_MP_ABS_C
    #define BN_MP_CLEAR_MULTI_C
+   #define BN_MP_REDUCE_IS_2K_L_C
+   #define BN_S_MP_EXPTMOD_C
    #define BN_MP_DR_IS_MODULUS_C
    #define BN_MP_REDUCE_IS_2K_C
    #define BN_MP_ISODD_C
    #define BN_MP_EXPTMOD_FAST_C
-   #define BN_S_MP_EXPTMOD_C
 #endif
 
 #if defined(BN_MP_EXPTMOD_FAST_C)
@@ -360,6 +366,7 @@
    #define BN_MP_DIV_C
    #define BN_MP_MUL_C
    #define BN_MP_SUB_C
+   #define BN_MP_NEG_C
    #define BN_MP_EXCH_C
    #define BN_MP_CLEAR_MULTI_C
 #endif
@@ -434,6 +441,7 @@
 #if defined(BN_MP_INVMOD_SLOW_C)
    #define BN_MP_ISZERO_C
    #define BN_MP_INIT_MULTI_C
+   #define BN_MP_MOD_C
    #define BN_MP_COPY_C
    #define BN_MP_ISEVEN_C
    #define BN_MP_SET_C
@@ -679,6 +687,7 @@
 #if defined(BN_MP_READ_RADIX_C)
    #define BN_MP_ZERO_C
    #define BN_MP_S_RMAP_C
+   #define BN_MP_RADIX_SMAP_C
    #define BN_MP_MUL_D_C
    #define BN_MP_ADD_D_C
    #define BN_MP_ISZERO_C
@@ -725,6 +734,17 @@
    #define BN_MP_CLEAR_C
 #endif
 
+#if defined(BN_MP_REDUCE_2K_L_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_MUL_C
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
 #if defined(BN_MP_REDUCE_2K_SETUP_C)
    #define BN_MP_INIT_C
    #define BN_MP_COUNT_BITS_C
@@ -733,11 +753,22 @@
    #define BN_S_MP_SUB_C
 #endif
 
+#if defined(BN_MP_REDUCE_2K_SETUP_L_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_2EXPT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
 #if defined(BN_MP_REDUCE_IS_2K_C)
    #define BN_MP_REDUCE_2K_C
    #define BN_MP_COUNT_BITS_C
 #endif
 
+#if defined(BN_MP_REDUCE_IS_2K_L_C)
+#endif
+
 #if defined(BN_MP_REDUCE_SETUP_C)
    #define BN_MP_2EXPT_C
    #define BN_MP_DIV_C
@@ -815,6 +846,11 @@
    #define BN_MP_TO_UNSIGNED_BIN_C
 #endif
 
+#if defined(BN_MP_TO_SIGNED_BIN_N_C)
+   #define BN_MP_SIGNED_BIN_SIZE_C
+   #define BN_MP_TO_SIGNED_BIN_C
+#endif
+
 #if defined(BN_MP_TO_UNSIGNED_BIN_C)
    #define BN_MP_INIT_COPY_C
    #define BN_MP_ISZERO_C
@@ -822,6 +858,11 @@
    #define BN_MP_CLEAR_C
 #endif
 
+#if defined(BN_MP_TO_UNSIGNED_BIN_N_C)
+   #define BN_MP_UNSIGNED_BIN_SIZE_C
+   #define BN_MP_TO_UNSIGNED_BIN_C
+#endif
+
 #if defined(BN_MP_TOOM_MUL_C)
    #define BN_MP_INIT_MULTI_C
    #define BN_MP_MOD_2D_C
@@ -902,10 +943,12 @@
    #define BN_MP_INIT_C
    #define BN_MP_CLEAR_C
    #define BN_MP_REDUCE_SETUP_C
+   #define BN_MP_REDUCE_C
+   #define BN_MP_REDUCE_2K_SETUP_L_C
+   #define BN_MP_REDUCE_2K_L_C
    #define BN_MP_MOD_C
    #define BN_MP_COPY_C
    #define BN_MP_SQR_C
-   #define BN_MP_REDUCE_C
    #define BN_MP_MUL_C
    #define BN_MP_SET_C
    #define BN_MP_EXCH_C
diff --git a/libtommath/tommath_superclass.h b/libtommath/tommath_superclass.h
index b50ecb0..e3926df 100644
--- a/libtommath/tommath_superclass.h
+++ b/libtommath/tommath_superclass.h
@@ -4,7 +4,7 @@
 #define LTM_ALL
 
 /* RSA only (does not support DH/DSA/ECC) */
-// #define SC_RSA_1
+/* #define SC_RSA_1 */
 
 /* For reference.... On an Athlon64 optimizing for speed...