Merge trunk. Re-generate tclTomMath.h. Use faster exponentiation-function from libtommath 1.0 (in tclExecute.c).

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2016-11-17 10:46:09 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2016-11-17 10:46:09 (GMT)
commit: c011864b3411bd607efb52ffd86bb9b91e8e1bf3 (patch)
tree: 5ae27db53133eec3cea2523f3df4c28df354b71e /libtommath
parent: 68111aa5bf7fc228dcfda8beb9de265734925b56 (diff)
parent: 3dd86e6ebc0137c3a2c02d3a046de046571e3789 (diff)
download: tcl-c011864b3411bd607efb52ffd86bb9b91e8e1bf3.zip
tcl-c011864b3411bd607efb52ffd86bb9b91e8e1bf3.tar.gz
tcl-c011864b3411bd607efb52ffd86bb9b91e8e1bf3.tar.bz2
79 files changed, 556 insertions, 37631 deletions
diff --git a/libtommath/bn.ilg b/libtommath/bn.ilg
deleted file mode 100644
index 2a14624..0000000
--- a/libtommath/bn.ilg
+++ /dev/null
@@ -1,6 +0,0 @@
-This is makeindex, version 2.15 [TeX Live 2013] (kpathsea + Thai support).
-Scanning input file bn.idx....done (85 entries accepted, 0 rejected).
-Sorting entries....done (554 comparisons).
-Generating output file bn.ind....done (88 lines written, 0 warnings).
-Output written in bn.ind.
-Transcript written in bn.ilg.
diff --git a/libtommath/bn.ind b/libtommath/bn.ind
deleted file mode 100644
index 01cff1a..0000000
--- a/libtommath/bn.ind
+++ /dev/null
@@ -1,88 +0,0 @@
-\begin{theindex}
-
-  \item mp\_add, \hyperpage{24}
-  \item mp\_add\_d, \hyperpage{44}
-  \item mp\_and, \hyperpage{24}
-  \item mp\_clear, \hyperpage{9}
-  \item mp\_clear\_multi, \hyperpage{10}
-  \item mp\_cmp, \hyperpage{19}
-  \item mp\_cmp\_d, \hyperpage{20}
-  \item mp\_cmp\_mag, \hyperpage{18}
-  \item mp\_div, \hyperpage{24}
-  \item mp\_div\_2, \hyperpage{22}
-  \item mp\_div\_2d, \hyperpage{23}
-  \item mp\_div\_d, \hyperpage{44}
-  \item mp\_dr\_reduce, \hyperpage{33}
-  \item mp\_dr\_setup, \hyperpage{33}
-  \item MP\_EQ, \hyperpage{18}
-  \item mp\_error\_to\_string, \hyperpage{7}
-  \item mp\_expt\_d, \hyperpage{35}
-  \item mp\_expt\_d\_ex, \hyperpage{35}
-  \item mp\_exptmod, \hyperpage{35}
-  \item mp\_exteuclid, \hyperpage{43}
-  \item mp\_gcd, \hyperpage{43}
-  \item mp\_get\_int, \hyperpage{16}
-  \item mp\_get\_long, \hyperpage{17}
-  \item mp\_get\_long\_long, \hyperpage{17}
-  \item mp\_grow, \hyperpage{13}
-  \item MP\_GT, \hyperpage{18}
-  \item mp\_init, \hyperpage{8}
-  \item mp\_init\_copy, \hyperpage{10}
-  \item mp\_init\_multi, \hyperpage{10}
-  \item mp\_init\_set, \hyperpage{17}
-  \item mp\_init\_set\_int, \hyperpage{17}
-  \item mp\_init\_size, \hyperpage{11}
-  \item mp\_int, \hyperpage{8}
-  \item mp\_invmod, \hyperpage{44}
-  \item mp\_jacobi, \hyperpage{43}
-  \item mp\_lcm, \hyperpage{43}
-  \item mp\_lshd, \hyperpage{23}
-  \item MP\_LT, \hyperpage{18}
-  \item MP\_MEM, \hyperpage{7}
-  \item mp\_mod, \hyperpage{29}
-  \item mp\_mod\_d, \hyperpage{44}
-  \item mp\_montgomery\_calc\_normalization, \hyperpage{31}
-  \item mp\_montgomery\_reduce, \hyperpage{31}
-  \item mp\_montgomery\_setup, \hyperpage{31}
-  \item mp\_mul, \hyperpage{25}
-  \item mp\_mul\_2, \hyperpage{22}
-  \item mp\_mul\_2d, \hyperpage{23}
-  \item mp\_mul\_d, \hyperpage{44}
-  \item mp\_n\_root, \hyperpage{36}
-  \item mp\_neg, \hyperpage{24}
-  \item MP\_NO, \hyperpage{7}
-  \item MP\_OKAY, \hyperpage{7}
-  \item mp\_or, \hyperpage{24}
-  \item mp\_prime\_fermat, \hyperpage{37}
-  \item mp\_prime\_is\_divisible, \hyperpage{37}
-  \item mp\_prime\_is\_prime, \hyperpage{38}
-  \item mp\_prime\_miller\_rabin, \hyperpage{37}
-  \item mp\_prime\_next\_prime, \hyperpage{38}
-  \item mp\_prime\_rabin\_miller\_trials, \hyperpage{38}
-  \item mp\_prime\_random, \hyperpage{38}
-  \item mp\_prime\_random\_ex, \hyperpage{39}
-  \item mp\_radix\_size, \hyperpage{41}
-  \item mp\_read\_radix, \hyperpage{41}
-  \item mp\_read\_unsigned\_bin, \hyperpage{42}
-  \item mp\_reduce, \hyperpage{30}
-  \item mp\_reduce\_2k, \hyperpage{34}
-  \item mp\_reduce\_2k\_setup, \hyperpage{34}
-  \item mp\_reduce\_setup, \hyperpage{29}
-  \item mp\_rshd, \hyperpage{23}
-  \item mp\_set, \hyperpage{15}
-  \item mp\_set\_int, \hyperpage{16}
-  \item mp\_set\_long, \hyperpage{17}
-  \item mp\_set\_long\_long, \hyperpage{17}
-  \item mp\_shrink, \hyperpage{12}
-  \item mp\_sqr, \hyperpage{26}
-  \item mp\_sqrtmod\_prime, \hyperpage{44}
-  \item mp\_sub, \hyperpage{24}
-  \item mp\_sub\_d, \hyperpage{44}
-  \item mp\_to\_unsigned\_bin, \hyperpage{42}
-  \item mp\_toradix, \hyperpage{41}
-  \item mp\_unsigned\_bin\_size, \hyperpage{41}
-  \item MP\_VAL, \hyperpage{7}
-  \item mp\_xor, \hyperpage{24}
-  \item MP\_YES, \hyperpage{7}
-
-\end{theindex}
diff --git a/libtommath/bn.pdf b/libtommath/bn.pdf
deleted file mode 100644
index 392b649..0000000
--- a/libtommath/bn.pdf
+++ /dev/null
diff --git a/libtommath/bn.tex b/libtommath/bn.tex
deleted file mode 100644
index 8d52075..0000000
--- a/libtommath/bn.tex
+++ /dev/null
@@ -1,1913 +0,0 @@
-\documentclass[b5paper]{book}
-\usepackage{hyperref}
-\usepackage{makeidx}
-\usepackage{amssymb}
-\usepackage{color}
-\usepackage{alltt}
-\usepackage{graphicx}
-\usepackage{layout}
-\def\union{\cup}
-\def\intersect{\cap}
-\def\getsrandom{\stackrel{\rm R}{\gets}}
-\def\cross{\times}
-\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
-\def\catn{$\|$}
-\def\divides{\hspace{0.3em} | \hspace{0.3em}}
-\def\nequiv{\not\equiv}
-\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
-\def\lcm{{\rm lcm}}
-\def\gcd{{\rm gcd}}
-\def\log{{\rm log}}
-\def\ord{{\rm ord}}
-\def\abs{{\mathit abs}}
-\def\rep{{\mathit rep}}
-\def\mod{{\mathit\ mod\ }}
-\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
-\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
-\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
-\def\Or{{\rm\ or\ }}
-\def\And{{\rm\ and\ }}
-\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
-\def\implies{\Rightarrow}
-\def\undefined{{\rm ``undefined"}}
-\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
-\let\oldphi\phi
-\def\phi{\varphi}
-\def\Pr{{\rm Pr}}
-\newcommand{\str}[1]{{\mathbf{#1}}}
-\def\F{{\mathbb F}}
-\def\N{{\mathbb N}}
-\def\Z{{\mathbb Z}}
-\def\R{{\mathbb R}}
-\def\C{{\mathbb C}}
-\def\Q{{\mathbb Q}}
-\definecolor{DGray}{gray}{0.5}
-\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
-\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
-\def\gap{\vspace{0.5ex}}
-\makeindex
-\begin{document}
-\frontmatter
-\pagestyle{empty}
-\title{LibTomMath User Manual \\ v1.0.0}
-\author{Tom St Denis \\ tstdenis82@gmail.com}
-\maketitle
-This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been
-formatted for B5 [176x250] paper using the \LaTeX{} {\em book} macro package.
-
-\vspace{10cm}
-
-\begin{flushright}Open Source.  Open Academia.  Open Minds.
-
-\mbox{ }
-
-Tom St Denis,
-
-Ontario, Canada
-\end{flushright}
-
-\tableofcontents
-\listoffigures
-\mainmatter
-\pagestyle{headings}
-\chapter{Introduction}
-\section{What is LibTomMath?}
-LibTomMath is a library of source code which provides a series of efficient and carefully written functions for manipulating
-large integer numbers.  It was written in portable ISO C source code so that it will build on any platform with a conforming
-C compiler.
-
-In a nutshell the library was written from scratch with verbose comments to help instruct computer science students how
-to implement ``bignum'' math.  However, the resulting code has proven to be very useful.  It has been used by numerous
-universities, commercial and open source software developers.  It has been used on a variety of platforms ranging from
-Linux and Windows based x86 to ARM based Gameboys and PPC based MacOS machines.
-
-\section{License}
-As of the v0.25 the library source code has been placed in the public domain with every new release.  As of the v0.28
-release the textbook ``Implementing Multiple Precision Arithmetic'' has been placed in the public domain with every new
-release as well.  This textbook is meant to complement the project by providing a more solid walkthrough of the development
-algorithms used in the library.
-
-Since both\footnote{Note that the MPI files under mtest/ are copyrighted by Michael Fromberger.  They are not required to use LibTomMath.} are in the
-public domain everyone is entitled to do with them as they see fit.
-
-\section{Building LibTomMath}
-
-LibTomMath is meant to be very ``GCC friendly'' as it comes with a makefile well suited for GCC.  However, the library will
-also build in MSVC and Borland C out of the box.  For any other ISO C compiler a makefile will have to be made by the end
-developer.
-
-\subsection{Static Libraries}
-To build as a static library for GCC issue the following
-\begin{alltt}
-make
-\end{alltt}
-
-command.  This will build the library and archive the object files in ``libtommath.a''.  Now you link against
-that and include ``tommath.h'' within your programs.  Alternatively to build with MSVC issue the following
-\begin{alltt}
-nmake -f makefile.msvc
-\end{alltt}
-
-This will build the library and archive the object files in ``tommath.lib''.  This has been tested with MSVC
-version 6.00 with service pack 5.
-
-\subsection{Shared Libraries}
-To build as a shared library for GCC issue the following
-\begin{alltt}
-make -f makefile.shared
-\end{alltt}
-This requires the ``libtool'' package (common on most Linux/BSD systems).  It will build LibTomMath as both shared
-and static then install (by default) into /usr/lib as well as install the header files in /usr/include.  The shared
-library (resource) will be called ``libtommath.la'' while the static library called ``libtommath.a''.  Generally
-you use libtool to link your application against the shared object.
-
-There is limited support for making a ``DLL'' in windows via the ``makefile.cygwin\_dll'' makefile.  It requires
-Cygwin to work with since it requires the auto-export/import functionality.  The resulting DLL and import library
-``libtommath.dll.a'' can be used to link LibTomMath dynamically to any Windows program using Cygwin.
-
-\subsection{Testing}
-To build the library and the test harness type
-
-\begin{alltt}
-make test
-\end{alltt}
-
-This will build the library, ``test'' and ``mtest/mtest''.  The ``test'' program will accept test vectors and verify the
-results.  ``mtest/mtest'' will generate test vectors using the MPI library by Michael Fromberger\footnote{A copy of MPI
-is included in the package}.  Simply pipe mtest into test using
-
-\begin{alltt}
-mtest/mtest | test
-\end{alltt}
-
-If you do not have a ``/dev/urandom'' style RNG source you will have to write your own PRNG and simply pipe that into
-mtest.  For example, if your PRNG program is called ``myprng'' simply invoke
-
-\begin{alltt}
-myprng | mtest/mtest | test
-\end{alltt}
-
-This will output a row of numbers that are increasing.  Each column is a different test (such as addition, multiplication, etc)
-that is being performed.  The numbers represent how many times the test was invoked.  If an error is detected the program
-will exit with a dump of the relevant numbers it was working with.
-
-\section{Build Configuration}
-LibTomMath can be configured at build time in three phases we shall call ``depends'', ``tweaks'' and ``trims''.
-Each phase changes how the library is built and they are applied one after another respectively.
-
-To make the system more powerful you can tweak the build process.  Classes are defined in the file
-``tommath\_superclass.h''.  By default, the symbol ``LTM\_ALL'' shall be defined which simply
-instructs the system to build all of the functions.  This is how LibTomMath used to be packaged.  This will give you
-access to every function LibTomMath offers.
-
-However, there are cases where such a build is not optimal.  For instance, you want to perform RSA operations.  You
-don't need the vast majority of the library to perform these operations.  Aside from LTM\_ALL there is
-another pre--defined class ``SC\_RSA\_1'' which works in conjunction with the RSA from LibTomCrypt.  Additional
-classes can be defined based on the needs of the user.
-
-\subsection{Build Depends}
-In the file tommath\_class.h you will see a large list of C ``defines'' followed by a series of ``ifdefs''
-which further define symbols.  All of the symbols (technically they're macros $\ldots$) represent a given C source
-file.  For instance, BN\_MP\_ADD\_C represents the file ``bn\_mp\_add.c''.  When a define has been enabled the
-function in the respective file will be compiled and linked into the library.  Accordingly when the define
-is absent the file will not be compiled and not contribute any size to the library.
-
-You will also note that the header tommath\_class.h is actually recursively included (it includes itself twice).
-This is to help resolve as many dependencies as possible.  In the last pass the symbol LTM\_LAST will be defined.
-This is useful for ``trims''.
-
-\subsection{Build Tweaks}
-A tweak is an algorithm ``alternative''.  For example, to provide tradeoffs (usually between speed and size).
-They can be enabled at any pass of the configuration phase.
-
-\begin{small}
-\begin{center}
-\begin{tabular}{|l|l|}
-\hline \textbf{Define} & \textbf{Purpose} \\
-\hline BN\_MP\_DIV\_SMALL & Enables a slower, smaller and equally \\
-                          & functional mp\_div() function \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-
-\subsection{Build Trims}
-A trim is a manner of removing functionality from a function that is not required.  For instance, to perform
-RSA cryptography you only require exponentiation with odd moduli so even moduli support can be safely removed.
-Build trims are meant to be defined on the last pass of the configuration which means they are to be defined
-only if LTM\_LAST has been defined.
-
-\subsubsection{Moduli Related}
-\begin{small}
-\begin{center}
-\begin{tabular}{|l|l|}
-\hline \textbf{Restriction} & \textbf{Undefine} \\
-\hline Exponentiation with odd moduli only & BN\_S\_MP\_EXPTMOD\_C \\
-                                           & BN\_MP\_REDUCE\_C \\
-                                           & BN\_MP\_REDUCE\_SETUP\_C \\
-                                           & BN\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
-                                           & BN\_FAST\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
-\hline Exponentiation with random odd moduli & (The above plus the following) \\
-                                           & BN\_MP\_REDUCE\_2K\_C \\
-                                           & BN\_MP\_REDUCE\_2K\_SETUP\_C \\
-                                           & BN\_MP\_REDUCE\_IS\_2K\_C \\
-                                           & BN\_MP\_DR\_IS\_MODULUS\_C \\
-                                           & BN\_MP\_DR\_REDUCE\_C \\
-                                           & BN\_MP\_DR\_SETUP\_C \\
-\hline Modular inverse odd moduli only     & BN\_MP\_INVMOD\_SLOW\_C \\
-\hline Modular inverse (both, smaller/slower) & BN\_FAST\_MP\_INVMOD\_C \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-
-\subsubsection{Operand Size Related}
-\begin{small}
-\begin{center}
-\begin{tabular}{|l|l|}
-\hline \textbf{Restriction} & \textbf{Undefine} \\
-\hline Moduli $\le 2560$ bits              & BN\_MP\_MONTGOMERY\_REDUCE\_C \\
-                                           & BN\_S\_MP\_MUL\_DIGS\_C \\
-                                           & BN\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
-                                           & BN\_S\_MP\_SQR\_C \\
-\hline Polynomial Schmolynomial            & BN\_MP\_KARATSUBA\_MUL\_C \\
-                                           & BN\_MP\_KARATSUBA\_SQR\_C \\
-                                           & BN\_MP\_TOOM\_MUL\_C \\
-                                           & BN\_MP\_TOOM\_SQR\_C \\
-
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-
-
-\section{Purpose of LibTomMath}
-Unlike  GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath was not written with
-bleeding edge performance in mind.  First and foremost LibTomMath was written to be entirely open.  Not only is the
-source code public domain (unlike various other GPL/etc licensed code), not only is the code freely downloadable but the
-source code is also accessible for computer science students attempting to learn ``BigNum'' or multiple precision
-arithmetic techniques.
-
-LibTomMath was written to be an instructive collection of source code.  This is why there are many comments, only one
-function per source file and often I use a ``middle-road'' approach where I don't cut corners for an extra 2\% speed
-increase.
-
-Source code alone cannot really teach how the algorithms work which is why I also wrote a textbook that accompanies
-the library (beat that!).
-
-So you may be thinking ``should I use LibTomMath?'' and the answer is a definite maybe.  Let me tabulate what I think
-are the pros and cons of LibTomMath by comparing it to the math routines from GnuPG\footnote{GnuPG v1.2.3 versus LibTomMath v0.28}.
-
-\newpage\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|l|c|c|l|}
-\hline \textbf{Criteria} & \textbf{Pro} & \textbf{Con} & \textbf{Notes} \\
-\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 71.97$ \\
-\hline Commented function prototypes & X && GnuPG function names are cryptic. \\
-\hline Speed && X & LibTomMath is slower.  \\
-\hline Totally free & X & & GPL has unfavourable restrictions.\\
-\hline Large function base & X & & GnuPG is barebones. \\
-\hline Five modular reduction algorithms & X & & Faster modular exponentiation for a variety of moduli. \\
-\hline Portable & X & & GnuPG requires configuration to build. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{LibTomMath Valuation}
-\end{figure}
-
-It may seem odd to compare LibTomMath to GnuPG since the math in GnuPG is only a small portion of the entire application.
-However, LibTomMath was written with cryptography in mind.  It provides essentially all of the functions a cryptosystem
-would require when working with large integers.
-
-So it may feel tempting to just rip the math code out of GnuPG (or GnuMP where it was taken from originally) in your
-own application but I think there are reasons not to.  While LibTomMath is slower than libraries such as GnuMP it is
-not normally significantly slower.  On x86 machines the difference is normally a factor of two when performing modular
-exponentiations.  It depends largely on the processor, compiler and the moduli being used.
-
-Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.  However,
-on the other side of the coin LibTomMath offers you a totally free (public domain) well structured math library
-that is very flexible, complete and performs well in resource constrained environments.  Fast RSA for example can
-be performed with as little as 8KB of ram for data (again depending on build options).
-
-\chapter{Getting Started with LibTomMath}
-\section{Building Programs}
-In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library file (typically
-libtommath.a).  There is no library initialization required and the entire library is thread safe.
-
-\section{Return Codes}
-There are three possible return codes a function may return.
-
-\index{MP\_OKAY}\index{MP\_YES}\index{MP\_NO}\index{MP\_VAL}\index{MP\_MEM}
-\begin{figure}[here!]
-\begin{center}
-\begin{small}
-\begin{tabular}{|l|l|}
-\hline \textbf{Code} & \textbf{Meaning} \\
-\hline MP\_OKAY & The function succeeded. \\
-\hline MP\_VAL  & The function input was invalid. \\
-\hline MP\_MEM  & Heap memory exhausted. \\
-\hline &\\
-\hline MP\_YES  & Response is yes. \\
-\hline MP\_NO   & Response is no. \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Return Codes}
-\end{figure}
-
-The last two codes listed are not actually ``return'ed'' by a function.  They are placed in an integer (the caller must
-provide the address of an integer it can store to) which the caller can access.  To convert one of the three return codes
-to a string use the following function.
-
-\index{mp\_error\_to\_string}
-\begin{alltt}
-char *mp_error_to_string(int code);
-\end{alltt}
-
-This will return a pointer to a string which describes the given error code.  It will not work for the return codes
-MP\_YES and MP\_NO.
-
-\section{Data Types}
-The basic ``multiple precision integer'' type is known as the ``mp\_int'' within LibTomMath.  This data type is used to
-organize all of the data required to manipulate the integer it represents.  Within LibTomMath it has been prototyped
-as the following.
-
-\index{mp\_int}
-\begin{alltt}
-typedef struct  \{
-    int used, alloc, sign;
-    mp_digit *dp;
-\} mp_int;
-\end{alltt}
-
-Where ``mp\_digit'' is a data type that represents individual digits of the integer.  By default, an mp\_digit is the
-ISO C ``unsigned long'' data type and each digit is $28-$bits long.  The mp\_digit type can be configured to suit other
-platforms by defining the appropriate macros.
-
-All LTM functions that use the mp\_int type will expect a pointer to mp\_int structure.  You must allocate memory to
-hold the structure itself by yourself (whether off stack or heap it doesn't matter).  The very first thing that must be
-done to use an mp\_int is that it must be initialized.
-
-\section{Function Organization}
-
-The arithmetic functions of the library are all organized to have the same style prototype.  That is source operands
-are passed on the left and the destination is on the right.  For instance,
-
-\begin{alltt}
-mp_add(&a, &b, &c);       /* c = a + b */
-mp_mul(&a, &a, &c);       /* c = a * a */
-mp_div(&a, &b, &c, &d);   /* c = [a/b], d = a mod b */
-\end{alltt}
-
-Another feature of the way the functions have been implemented is that source operands can be destination operands as well.
-For instance,
-
-\begin{alltt}
-mp_add(&a, &b, &b);       /* b = a + b */
-mp_div(&a, &b, &a, &c);   /* a = [a/b], c = a mod b */
-\end{alltt}
-
-This allows operands to be re-used which can make programming simpler.
-
-\section{Initialization}
-\subsection{Single Initialization}
-A single mp\_int can be initialized with the ``mp\_init'' function.
-
-\index{mp\_init}
-\begin{alltt}
-int mp_init (mp_int * a);
-\end{alltt}
-
-This function expects a pointer to an mp\_int structure and will initialize the members of the structure so the mp\_int
-represents the default integer which is zero.  If the function returns MP\_OKAY then the mp\_int is ready to be used
-by the other LibTomMath functions.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   if ((result = mp_init(&number)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* use the number */
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-\subsection{Single Free}
-When you are finished with an mp\_int it is ideal to return the heap it used back to the system.  The following function
-provides this functionality.
-
-\index{mp\_clear}
-\begin{alltt}
-void mp_clear (mp_int * a);
-\end{alltt}
-
-The function expects a pointer to a previously initialized mp\_int structure and frees the heap it uses.  It sets the
-pointer\footnote{The ``dp'' member.} within the mp\_int to \textbf{NULL} which is used to prevent double free situations.
-It is legal to call mp\_clear() twice on the same mp\_int in a row.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   if ((result = mp_init(&number)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* use the number */
-
-   /* We're done with it. */
-   mp_clear(&number);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-\subsection{Multiple Initializations}
-Certain algorithms require more than one large integer.  In these instances it is ideal to initialize all of the mp\_int
-variables in an ``all or nothing'' fashion.  That is, they are either all initialized successfully or they are all
-not initialized.
-
-The  mp\_init\_multi() function provides this functionality.
-
-\index{mp\_init\_multi} \index{mp\_clear\_multi}
-\begin{alltt}
-int mp_init_multi(mp_int *mp, ...);
-\end{alltt}
-
-It accepts a \textbf{NULL} terminated list of pointers to mp\_int structures.  It will attempt to initialize them all
-at once.  If the function returns MP\_OKAY then all of the mp\_int variables are ready to use, otherwise none of them
-are available for use.  A complementary mp\_clear\_multi() function allows multiple mp\_int variables to be free'd
-from the heap at the same time.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int num1, num2, num3;
-   int result;
-
-   if ((result = mp_init_multi(&num1,
-                               &num2,
-                               &num3, NULL)) != MP\_OKAY) \{
-      printf("Error initializing the numbers.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* use the numbers */
-
-   /* We're done with them. */
-   mp_clear_multi(&num1, &num2, &num3, NULL);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-\subsection{Other Initializers}
-To initialize and make a copy of an mp\_int the mp\_init\_copy() function has been provided.
-
-\index{mp\_init\_copy}
-\begin{alltt}
-int mp_init_copy (mp_int * a, mp_int * b);
-\end{alltt}
-
-This function will initialize $a$ and make it a copy of $b$ if all goes well.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int num1, num2;
-   int result;
-
-   /* initialize and do work on num1 ... */
-
-   /* We want a copy of num1 in num2 now */
-   if ((result = mp_init_copy(&num2, &num1)) != MP_OKAY) \{
-     printf("Error initializing the copy.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* now num2 is ready and contains a copy of num1 */
-
-   /* We're done with them. */
-   mp_clear_multi(&num1, &num2, NULL);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-Another less common initializer is mp\_init\_size() which allows the user to initialize an mp\_int with a given
-default number of digits.  By default, all initializers allocate \textbf{MP\_PREC} digits.  This function lets
-you override this behaviour.
-
-\index{mp\_init\_size}
-\begin{alltt}
-int mp_init_size (mp_int * a, int size);
-\end{alltt}
-
-The $size$ parameter must be greater than zero.  If the function succeeds the mp\_int $a$ will be initialized
-to have $size$ digits (which are all initially zero).
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   /* we need a 60-digit number */
-   if ((result = mp_init_size(&number, 60)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* use the number */
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-\section{Maintenance Functions}
-
-\subsection{Reducing Memory Usage}
-When an mp\_int is in a state where it won't be changed again\footnote{A Diffie-Hellman modulus for instance.} excess
-digits can be removed to return memory to the heap with the mp\_shrink() function.
-
-\index{mp\_shrink}
-\begin{alltt}
-int mp_shrink (mp_int * a);
-\end{alltt}
-
-This will remove excess digits of the mp\_int $a$.  If the operation fails the mp\_int should be intact without the
-excess digits being removed.  Note that you can use a shrunk mp\_int in further computations, however, such operations
-will require heap operations which can be slow.  It is not ideal to shrink mp\_int variables that you will further
-modify in the system (unless you are seriously low on memory).
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   if ((result = mp_init(&number)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* use the number [e.g. pre-computation]  */
-
-   /* We're done with it for now. */
-   if ((result = mp_shrink(&number)) != MP_OKAY) \{
-      printf("Error shrinking the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* use it .... */
-
-
-   /* we're done with it. */
-   mp_clear(&number);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-\subsection{Adding additional digits}
-
-Within the mp\_int structure are two parameters which control the limitations of the array of digits that represent
-the integer the mp\_int is meant to equal.   The \textit{used} parameter dictates how many digits are significant, that is,
-contribute to the value of the mp\_int.  The \textit{alloc} parameter dictates how many digits are currently available in
-the array.  If you need to perform an operation that requires more digits you will have to mp\_grow() the mp\_int to
-your desired size.
-
-\index{mp\_grow}
-\begin{alltt}
-int mp_grow (mp_int * a, int size);
-\end{alltt}
-
-This will grow the array of digits of $a$ to $size$.  If the \textit{alloc} parameter is already bigger than
-$size$ the function will not do anything.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   if ((result = mp_init(&number)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* use the number */
-
-   /* We need to add 20 digits to the number  */
-   if ((result = mp_grow(&number, number.alloc + 20)) != MP_OKAY) \{
-      printf("Error growing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-
-   /* use the number */
-
-   /* we're done with it. */
-   mp_clear(&number);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-\chapter{Basic Operations}
-\section{Small Constants}
-Setting mp\_ints to small constants is a relatively common operation.  To accommodate these instances there are two
-small constant assignment functions.  The first function is used to set a single digit constant while the second sets
-an ISO C style ``unsigned long'' constant.  The reason for both functions is efficiency.  Setting a single digit is quick but the
-domain of a digit can change (it's always at least $0 \ldots 127$).
-
-\subsection{Single Digit}
-
-Setting a single digit can be accomplished with the following function.
-
-\index{mp\_set}
-\begin{alltt}
-void mp_set (mp_int * a, mp_digit b);
-\end{alltt}
-
-This will zero the contents of $a$ and make it represent an integer equal to the value of $b$.  Note that this
-function has a return type of \textbf{void}.  It cannot cause an error so it is safe to assume the function
-succeeded.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   if ((result = mp_init(&number)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* set the number to 5 */
-   mp_set(&number, 5);
-
-   /* we're done with it. */
-   mp_clear(&number);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-\subsection{Long Constants}
-
-To set a constant that is the size of an ISO C ``unsigned long'' and larger than a single digit the following function
-can be used.
-
-\index{mp\_set\_int}
-\begin{alltt}
-int mp_set_int (mp_int * a, unsigned long b);
-\end{alltt}
-
-This will assign the value of the 32-bit variable $b$ to the mp\_int $a$.  Unlike mp\_set() this function will always
-accept a 32-bit input regardless of the size of a single digit.  However, since the value may span several digits
-this function can fail if it runs out of heap memory.
-
-To get the ``unsigned long'' copy of an mp\_int the following function can be used.
-
-\index{mp\_get\_int}
-\begin{alltt}
-unsigned long mp_get_int (mp_int * a);
-\end{alltt}
-
-This will return the 32 least significant bits of the mp\_int $a$.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   if ((result = mp_init(&number)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* set the number to 654321 (note this is bigger than 127) */
-   if ((result = mp_set_int(&number, 654321)) != MP_OKAY) \{
-      printf("Error setting the value of the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   printf("number == \%lu", mp_get_int(&number));
-
-   /* we're done with it. */
-   mp_clear(&number);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-This should output the following if the program succeeds.
-
-\begin{alltt}
-number == 654321
-\end{alltt}
-
-\subsection{Long Constants - platform dependent}
-
-\index{mp\_set\_long}
-\begin{alltt}
-int mp_set_long (mp_int * a, unsigned long b);
-\end{alltt}
-
-This will assign the value of the platform-dependent sized variable $b$ to the mp\_int $a$.
-
-To get the ``unsigned long'' copy of an mp\_int the following function can be used.
-
-\index{mp\_get\_long}
-\begin{alltt}
-unsigned long mp_get_long (mp_int * a);
-\end{alltt}
-
-This will return the least significant bits of the mp\_int $a$ that fit into an ``unsigned long''.
-
-\subsection{Long Long Constants}
-
-\index{mp\_set\_long\_long}
-\begin{alltt}
-int mp_set_long_long (mp_int * a, unsigned long long b);
-\end{alltt}
-
-This will assign the value of the 64-bit variable $b$ to the mp\_int $a$.
-
-To get the ``unsigned long long'' copy of an mp\_int the following function can be used.
-
-\index{mp\_get\_long\_long}
-\begin{alltt}
-unsigned long long mp_get_long_long (mp_int * a);
-\end{alltt}
-
-This will return the 64 least significant bits of the mp\_int $a$.
-
-\subsection{Initialize and Setting Constants}
-To both initialize and set small constants the following two functions are available.
-\index{mp\_init\_set} \index{mp\_init\_set\_int}
-\begin{alltt}
-int mp_init_set (mp_int * a, mp_digit b);
-int mp_init_set_int (mp_int * a, unsigned long b);
-\end{alltt}
-
-Both functions work like the previous counterparts except they first mp\_init $a$ before setting the values.
-
-\begin{alltt}
-int main(void)
-\{
-   mp_int number1, number2;
-   int    result;
-
-   /* initialize and set a single digit */
-   if ((result = mp_init_set(&number1, 100)) != MP_OKAY) \{
-      printf("Error setting number1: \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* initialize and set a long */
-   if ((result = mp_init_set_int(&number2, 1023)) != MP_OKAY) \{
-      printf("Error setting number2: \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* display */
-   printf("Number1, Number2 == \%lu, \%lu",
-          mp_get_int(&number1), mp_get_int(&number2));
-
-   /* clear */
-   mp_clear_multi(&number1, &number2, NULL);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt}
-
-If this program succeeds it shall output.
-\begin{alltt}
-Number1, Number2 == 100, 1023
-\end{alltt}
-
-\section{Comparisons}
-
-Comparisons in LibTomMath are always performed in a ``left to right'' fashion.  There are three possible return codes
-for any comparison.
-
-\index{MP\_GT} \index{MP\_EQ} \index{MP\_LT}
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|c|c|}
-\hline \textbf{Result Code} & \textbf{Meaning} \\
-\hline MP\_GT & $a > b$ \\
-\hline MP\_EQ & $a = b$ \\
-\hline MP\_LT & $a < b$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Comparison Codes for $a, b$}
-\label{fig:CMP}
-\end{figure}
-
-In figure \ref{fig:CMP} two integers $a$ and $b$ are being compared.  In this case $a$ is said to be ``to the left'' of
-$b$.
-
-\subsection{Unsigned comparison}
-
-An unsigned comparison considers only the digits themselves and not the associated \textit{sign} flag of the
-mp\_int structures.  This is analogous to an absolute comparison.  The function mp\_cmp\_mag() will compare two
-mp\_int variables based on their digits only.
-
-\index{mp\_cmp\_mag}
-\begin{alltt}
-int mp_cmp_mag(mp_int * a, mp_int * b);
-\end{alltt}
-This will compare $a$ to $b$ placing $a$ to the left of $b$.  This function cannot fail and will return one of the
-three compare codes listed in figure \ref{fig:CMP}.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number1, number2;
-   int result;
-
-   if ((result = mp_init_multi(&number1, &number2, NULL)) != MP_OKAY) \{
-      printf("Error initializing the numbers.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* set the number1 to 5 */
-   mp_set(&number1, 5);
-
-   /* set the number2 to -6 */
-   mp_set(&number2, 6);
-   if ((result = mp_neg(&number2, &number2)) != MP_OKAY) \{
-      printf("Error negating number2.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   switch(mp_cmp_mag(&number1, &number2)) \{
-       case MP_GT:  printf("|number1| > |number2|"); break;
-       case MP_EQ:  printf("|number1| = |number2|"); break;
-       case MP_LT:  printf("|number1| < |number2|"); break;
-   \}
-
-   /* we're done with it. */
-   mp_clear_multi(&number1, &number2, NULL);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-If this program\footnote{This function uses the mp\_neg() function which is discussed in section \ref{sec:NEG}.} completes
-successfully it should print the following.
-
-\begin{alltt}
-|number1| < |number2|
-\end{alltt}
-
-This is because $\vert -6 \vert = 6$ and obviously $5 < 6$.
-
-\subsection{Signed comparison}
-
-To compare two mp\_int variables based on their signed value the mp\_cmp() function is provided.
-
-\index{mp\_cmp}
-\begin{alltt}
-int mp_cmp(mp_int * a, mp_int * b);
-\end{alltt}
-
-This will compare $a$ to the left of $b$.  It will first compare the signs of the two mp\_int variables.  If they
-differ it will return immediately based on their signs.  If the signs are equal then it will compare the digits
-individually.  This function will return one of the compare conditions codes listed in figure \ref{fig:CMP}.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number1, number2;
-   int result;
-
-   if ((result = mp_init_multi(&number1, &number2, NULL)) != MP_OKAY) \{
-      printf("Error initializing the numbers.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* set the number1 to 5 */
-   mp_set(&number1, 5);
-
-   /* set the number2 to -6 */
-   mp_set(&number2, 6);
-   if ((result = mp_neg(&number2, &number2)) != MP_OKAY) \{
-      printf("Error negating number2.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   switch(mp_cmp(&number1, &number2)) \{
-       case MP_GT:  printf("number1 > number2"); break;
-       case MP_EQ:  printf("number1 = number2"); break;
-       case MP_LT:  printf("number1 < number2"); break;
-   \}
-
-   /* we're done with it. */
-   mp_clear_multi(&number1, &number2, NULL);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-If this program\footnote{This function uses the mp\_neg() function which is discussed in section \ref{sec:NEG}.} completes
-successfully it should print the following.
-
-\begin{alltt}
-number1 > number2
-\end{alltt}
-
-\subsection{Single Digit}
-
-To compare a single digit against an mp\_int the following function has been provided.
-
-\index{mp\_cmp\_d}
-\begin{alltt}
-int mp_cmp_d(mp_int * a, mp_digit b);
-\end{alltt}
-
-This will compare $a$ to the left of $b$ using a signed comparison.  Note that it will always treat $b$ as
-positive.  This function is rather handy when you have to compare against small values such as $1$ (which often
-comes up in cryptography).  The function cannot fail and will return one of the three compare condition codes
-listed in figure \ref{fig:CMP}.
-
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   if ((result = mp_init(&number)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* set the number to 5 */
-   mp_set(&number, 5);
-
-   switch(mp_cmp_d(&number, 7)) \{
-       case MP_GT:  printf("number > 7"); break;
-       case MP_EQ:  printf("number = 7"); break;
-       case MP_LT:  printf("number < 7"); break;
-   \}
-
-   /* we're done with it. */
-   mp_clear(&number);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-If this program functions properly it will print out the following.
-
-\begin{alltt}
-number < 7
-\end{alltt}
-
-\section{Logical Operations}
-
-Logical operations are operations that can be performed either with simple shifts or boolean operators such as
-AND, XOR and OR directly.  These operations are very quick.
-
-\subsection{Multiplication by two}
-
-Multiplications and divisions by any power of two can be performed with quick logical shifts either left or
-right depending on the operation.
-
-When multiplying or dividing by two, special case routines can be used, which are as follows.
-\index{mp\_mul\_2} \index{mp\_div\_2}
-\begin{alltt}
-int mp_mul_2(mp_int * a, mp_int * b);
-int mp_div_2(mp_int * a, mp_int * b);
-\end{alltt}
-
-The former will assign twice $a$ to $b$ while the latter will assign half $a$ to $b$.  These functions are fast
-since the shift counts and masks are hardcoded into the routines.
-
-\begin{small} \begin{alltt}
-int main(void)
-\{
-   mp_int number;
-   int result;
-
-   if ((result = mp_init(&number)) != MP_OKAY) \{
-      printf("Error initializing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* set the number to 5 */
-   mp_set(&number, 5);
-
-   /* multiply by two */
-   if ((result = mp\_mul\_2(&number, &number)) != MP_OKAY) \{
-      printf("Error multiplying the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-   switch(mp_cmp_d(&number, 7)) \{
-       case MP_GT:  printf("2*number > 7"); break;
-       case MP_EQ:  printf("2*number = 7"); break;
-       case MP_LT:  printf("2*number < 7"); break;
-   \}
-
-   /* now divide by two */
-   if ((result = mp\_div\_2(&number, &number)) != MP_OKAY) \{
-      printf("Error dividing the number.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-   switch(mp_cmp_d(&number, 7)) \{
-       case MP_GT:  printf("2*number/2 > 7"); break;
-       case MP_EQ:  printf("2*number/2 = 7"); break;
-       case MP_LT:  printf("2*number/2 < 7"); break;
-   \}
-
-   /* we're done with it. */
-   mp_clear(&number);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt} \end{small}
-
-If this program is successful it will print out the following text.
-
-\begin{alltt}
-2*number > 7
-2*number/2 < 7
-\end{alltt}
-
-Since $10 > 7$ and $5 < 7$.
-
-To multiply by a power of two the following function can be used.
-
-\index{mp\_mul\_2d}
-\begin{alltt}
-int mp_mul_2d(mp_int * a, int b, mp_int * c);
-\end{alltt}
-
-This will multiply $a$ by $2^b$ and store the result in ``c''.  If the value of $b$ is less than or equal to
-zero the function will copy $a$ to ``c'' without performing any further actions.  The multiplication itself
-is implemented as a left-shift operation of $a$ by $b$ bits.
-
-To divide by a power of two use the following.
-
-\index{mp\_div\_2d}
-\begin{alltt}
-int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d);
-\end{alltt}
-Which will divide $a$ by $2^b$, store the quotient in ``c'' and the remainder in ``d''.  If $b \le 0$ then the
-function simply copies $a$ over to ``c'' and zeroes $d$.  The variable $d$ may be passed as a \textbf{NULL}
-value to signal that the remainder is not desired.  The division itself is implemented as a right-shift
-operation of $a$ by $b$ bits.
-
-\subsection{Polynomial Basis Operations}
-
-Strictly speaking the organization of the integers within the mp\_int structures is what is known as a
-``polynomial basis''.  This simply means a field element is stored by divisions of a radix.  For example, if
-$f(x) = \sum_{i=0}^{k} y_ix^i$ for any vector $\vec y$ then the array of digits in $\vec y$ are said to be
-the polynomial basis representation of $z$ if $f(\beta) = z$ for a given radix $\beta$.
-
-To multiply by the polynomial $g(x) = x$ all you have to do is shift the digits of the basis left one place.  The
-following function provides this operation.
-
-\index{mp\_lshd}
-\begin{alltt}
-int mp_lshd (mp_int * a, int b);
-\end{alltt}
-
-This will multiply $a$ in place by $x^b$ which is equivalent to shifting the digits left $b$ places and inserting zeroes
-in the least significant digits.  Similarly to divide by a power of $x$ the following function is provided.
-
-\index{mp\_rshd}
-\begin{alltt}
-void mp_rshd (mp_int * a, int b)
-\end{alltt}
-This will divide $a$ in place by $x^b$ and discard the remainder.  This function cannot fail as it performs the operations
-in place and no new digits are required to complete it.
-
-\subsection{AND, OR and XOR Operations}
-
-While AND, OR and XOR operations are not typical ``bignum functions'' they can be useful in several instances.  The
-three functions are prototyped as follows.
-
-\index{mp\_or} \index{mp\_and} \index{mp\_xor}
-\begin{alltt}
-int mp_or  (mp_int * a, mp_int * b, mp_int * c);
-int mp_and (mp_int * a, mp_int * b, mp_int * c);
-int mp_xor (mp_int * a, mp_int * b, mp_int * c);
-\end{alltt}
-
-Which compute $c = a \odot b$ where $\odot$ is one of OR, AND or XOR.
-
-\section{Addition and Subtraction}
-
-To compute an addition or subtraction the following two functions can be used.
-
-\index{mp\_add} \index{mp\_sub}
-\begin{alltt}
-int mp_add (mp_int * a, mp_int * b, mp_int * c);
-int mp_sub (mp_int * a, mp_int * b, mp_int * c);
-\end{alltt}
-
-Which perform $c = a \odot b$ where $\odot$ is one of signed addition or subtraction.  The operations are fully sign
-aware.
-
-\section{Sign Manipulation}
-\subsection{Negation}
-\label{sec:NEG}
-Simple integer negation can be performed with the following.
-
-\index{mp\_neg}
-\begin{alltt}
-int mp_neg (mp_int * a, mp_int * b);
-\end{alltt}
-
-Which assigns $-a$ to $b$.
-
-\subsection{Absolute}
-Simple integer absolutes can be performed with the following.
-
-\index{mp\_abs}
-\begin{alltt}
-int mp_abs (mp_int * a, mp_int * b);
-\end{alltt}
-
-Which assigns $\vert a \vert$ to $b$.
-
-\section{Integer Division and Remainder}
-To perform a complete and general integer division with remainder use the following function.
-
-\index{mp\_div}
-\begin{alltt}
-int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d);
-\end{alltt}
-
-This divides $a$ by $b$ and stores the quotient in $c$ and the remainder in $d$.  The signed quotient is computed such that
-$bc + d = a$.  Note that either of $c$ or $d$ can be set to \textbf{NULL} if their value is not required.  If
-$b$ is zero the function returns \textbf{MP\_VAL}.
-
-
-\chapter{Multiplication and Squaring}
-\section{Multiplication}
-A full signed integer multiplication can be performed with the following.
-\index{mp\_mul}
-\begin{alltt}
-int mp_mul (mp_int * a, mp_int * b, mp_int * c);
-\end{alltt}
-Which assigns the full signed product $ab$ to $c$.  This function actually breaks into one of four cases which are
-specific multiplication routines optimized for given parameters.  First there are the Toom-Cook multiplications which
-should only be used with very large inputs.  This is followed by the Karatsuba multiplications which are for moderate
-sized inputs.  Then followed by the Comba and baseline multipliers.
-
-Fortunately for the developer you don't really need to know this unless you really want to fine tune the system.  mp\_mul()
-will determine on its own\footnote{Some tweaking may be required.} what routine to use automatically when it is called.
-
-\begin{alltt}
-int main(void)
-\{
-   mp_int number1, number2;
-   int result;
-
-   /* Initialize the numbers */
-   if ((result = mp_init_multi(&number1,
-                               &number2, NULL)) != MP_OKAY) \{
-      printf("Error initializing the numbers.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* set the terms */
-   if ((result = mp_set_int(&number1, 257)) != MP_OKAY) \{
-      printf("Error setting number1.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   if ((result = mp_set_int(&number2, 1023)) != MP_OKAY) \{
-      printf("Error setting number2.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* multiply them */
-   if ((result = mp_mul(&number1, &number2,
-                        &number1)) != MP_OKAY) \{
-      printf("Error multiplying terms.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* display */
-   printf("number1 * number2 == \%lu", mp_get_int(&number1));
-
-   /* free terms and return */
-   mp_clear_multi(&number1, &number2, NULL);
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt}
-
-If this program succeeds it shall output the following.
-
-\begin{alltt}
-number1 * number2 == 262911
-\end{alltt}
-
-\section{Squaring}
-Since squaring can be performed faster than multiplication it is performed by its own function instead of just using
-mp\_mul().
-
-\index{mp\_sqr}
-\begin{alltt}
-int mp_sqr (mp_int * a, mp_int * b);
-\end{alltt}
-
-Will square $a$ and store it in $b$.  Like the case of multiplication there are four different squaring
-algorithms all of which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms because
-of the speed difference.
-
-\section{Tuning Polynomial Basis Routines}
-
-Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
-the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectively they require
-considerably less work.  For example, a 10000-digit multiplication would take roughly 724,000 single precision
-multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor
-of 138).
-
-So why not always use Karatsuba or Toom-Cook?   The simple answer is that they have so much overhead that they're not
-actually faster than Comba until you hit distinct  ``cutoff'' points.  For Karatsuba with the default configuration,
-GCC 3.3.1 and an Athlon XP processor the cutoff point is roughly 110 digits (about 70 for the Intel P4).  That is, at
-110 digits Karatsuba and Comba multiplications just about break even and for 110+ digits Karatsuba is faster.
-
-Toom-Cook has incredible overhead and is probably only useful for very large inputs.  So far no known cutoff points
-exist and for the most part I just set the cutoff points very high to make sure they're not called.
-
-A demo program in the ``etc/'' directory of the project called ``tune.c'' can be used to find the cutoff points.  This
-can be built with GCC as follows
-
-\begin{alltt}
-make XXX
-\end{alltt}
-Where ``XXX'' is one of the following entries from the table \ref{fig:tuning}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|l|l|}
-\hline \textbf{Value of XXX} & \textbf{Meaning} \\
-\hline tune & Builds portable tuning application \\
-\hline tune86 & Builds x86 (pentium and up) program for COFF \\
-\hline tune86c & Builds x86 program for Cygwin \\
-\hline tune86l & Builds x86 program for Linux (ELF format) \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Build Names for Tuning Programs}
-\label{fig:tuning}
-\end{figure}
-
-When the program is running it will output a series of measurements for different cutoff points.  It will first find
-good Karatsuba squaring and multiplication points.  Then it proceeds to find Toom-Cook points.  Note that the Toom-Cook
-tuning takes a very long time as the cutoff points are likely to be very high.
-
-\chapter{Modular Reduction}
-
-Modular reduction is the process of taking the remainder of one quantity divided by another.  Expressed
-as (\ref{eqn:mod}) the modular reduction is equivalent to the remainder of $b$ divided by $c$.
-
-\begin{equation}
-a \equiv b \mbox{ (mod }c\mbox{)}
-\label{eqn:mod}
-\end{equation}
-
-Of particular interest to cryptography are reductions where $b$ is limited to the range $0 \le b < c^2$ since particularly
-fast reduction algorithms can be written for the limited range.
-
-Note that one of the four optimized reduction algorithms is automatically chosen in the modular exponentiation
-algorithm mp\_exptmod when an appropriate modulus is detected.
-
-\section{Straight Division}
-In order to effect an arbitrary modular reduction the following algorithm is provided.
-
-\index{mp\_mod}
-\begin{alltt}
-int mp_mod(mp_int *a, mp_int *b, mp_int *c);
-\end{alltt}
-
-This reduces $a$ modulo $b$ and stores the result in $c$.  The sign of $c$ shall agree with the sign
-of $b$.  This algorithm accepts an input $a$ of any range and is not limited by $0 \le a < b^2$.
-
-\section{Barrett Reduction}
-
-Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve
-a decent speedup over straight division.  First a $\mu$ value must be precomputed with the following function.
-
-\index{mp\_reduce\_setup}
-\begin{alltt}
-int mp_reduce_setup(mp_int *a, mp_int *b);
-\end{alltt}
-
-Given a modulus in $b$ this produces the required $\mu$ value in $a$.  For any given modulus this only has to
-be computed once.  Modular reduction can now be performed with the following.
-
-\index{mp\_reduce}
-\begin{alltt}
-int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
-\end{alltt}
-
-This will reduce $a$ in place modulo $b$ with the precomputed $\mu$ value in $c$.  $a$ must be in the range
-$0 \le a < b^2$.
-
-\begin{alltt}
-int main(void)
-\{
-   mp_int   a, b, c, mu;
-   int      result;
-
-   /* initialize a,b to desired values, mp_init mu,
-    * c and set c to 1...we want to compute a^3 mod b
-    */
-
-   /* get mu value */
-   if ((result = mp_reduce_setup(&mu, &b)) != MP_OKAY) \{
-      printf("Error getting mu.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* square a to get c = a^2 */
-   if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{
-      printf("Error squaring.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* now reduce `c' modulo b */
-   if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{
-      printf("Error reducing.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* multiply a to get c = a^3 */
-   if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{
-      printf("Error reducing.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* now reduce `c' modulo b  */
-   if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{
-      printf("Error reducing.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* c now equals a^3 mod b */
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt}
-
-This program will calculate $a^3 \mbox{ mod }b$ if all the functions succeed.
-
-\section{Montgomery Reduction}
-
-Montgomery is a specialized reduction algorithm for any odd moduli.  Like Barrett reduction a pre--computation
-step is required.  This is accomplished with the following.
-
-\index{mp\_montgomery\_setup}
-\begin{alltt}
-int mp_montgomery_setup(mp_int *a, mp_digit *mp);
-\end{alltt}
-
-For the given odd modulus $a$ the precomputation value is placed in $mp$.  The reduction is computed with the
-following.
-
-\index{mp\_montgomery\_reduce}
-\begin{alltt}
-int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
-\end{alltt}
-This reduces $a$ in place modulo $m$ with the pre--computed value $mp$.   $a$ must be in the range
-$0 \le a < m^2$.
-
-Montgomery reduction is faster than Barrett reduction for moduli smaller than the ``comba'' limit.  With the default
-setup for instance, the limit is $127$ digits ($3556$--bits).   Note that this function is not limited to
-$127$ digits just that it falls back to a baseline algorithm after that point.
-
-An important observation is that this reduction does not return $a \mbox{ mod }m$ but $aR^{-1} \mbox{ mod }m$
-where $R = \beta^n$, $n$ is the number of digits in $m$ and $\beta$ is the radix used (default is $2^{28}$).
-
-To quickly calculate $R$ the following function was provided.
-
-\index{mp\_montgomery\_calc\_normalization}
-\begin{alltt}
-int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
-\end{alltt}
-Which calculates $a = R$ for the odd modulus $b$ without using multiplication or division.
-
-The normal modus operandi for Montgomery reductions is to normalize the integers before entering the system.  For
-example, to calculate $a^3 \mbox { mod }b$ using Montgomery reduction the value of $a$ can be normalized by
-multiplying it by $R$.  Consider the following code snippet.
-
-\begin{alltt}
-int main(void)
-\{
-   mp_int   a, b, c, R;
-   mp_digit mp;
-   int      result;
-
-   /* initialize a,b to desired values,
-    * mp_init R, c and set c to 1....
-    */
-
-   /* get normalization */
-   if ((result = mp_montgomery_calc_normalization(&R, &b)) != MP_OKAY) \{
-      printf("Error getting norm.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* get mp value */
-   if ((result = mp_montgomery_setup(&b, &mp)) != MP_OKAY) \{
-      printf("Error setting up montgomery.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* normalize `a' so now a is equal to aR */
-   if ((result = mp_mulmod(&a, &R, &b, &a)) != MP_OKAY) \{
-      printf("Error computing aR.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* square a to get c = a^2R^2 */
-   if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{
-      printf("Error squaring.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* now reduce `c' back down to c = a^2R^2 * R^-1 == a^2R */
-   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
-      printf("Error reducing.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* multiply a to get c = a^3R^2 */
-   if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{
-      printf("Error reducing.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* now reduce `c' back down to c = a^3R^2 * R^-1 == a^3R */
-   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
-      printf("Error reducing.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* now reduce (again) `c' back down to c = a^3R * R^-1 == a^3 */
-   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
-      printf("Error reducing.  \%s",
-             mp_error_to_string(result));
-      return EXIT_FAILURE;
-   \}
-
-   /* c now equals a^3 mod b */
-
-   return EXIT_SUCCESS;
-\}
-\end{alltt}
-
-This particular example does not look too efficient but it demonstrates the point of the algorithm.  By
-normalizing the inputs the reduced results are always of the form $aR$ for some variable $a$.  This allows
-a single final reduction to correct for the normalization and the fast reduction used within the algorithm.
-
-For more details consider examining the file \textit{bn\_mp\_exptmod\_fast.c}.
-
-\section{Restricted Diminished Radix}
-
-``Diminished Radix'' reduction refers to reduction with respect to moduli that are amenable to simple
-digit shifting and small multiplications.  In this case the ``restricted'' variant refers to moduli of the
-form $\beta^k - p$ for some $k \ge 0$ and $0 < p < \beta$ where $\beta$ is the radix (default to $2^{28}$).
-
-As in the case of Montgomery reduction there is a pre--computation phase required for a given modulus.
-
-\index{mp\_dr\_setup}
-\begin{alltt}
-void mp_dr_setup(mp_int *a, mp_digit *d);
-\end{alltt}
-
-This computes the value required for the modulus $a$ and stores it in $d$.  This function cannot fail
-and does not return any error codes.  After the pre--computation a reduction can be performed with the
-following.
-
-\index{mp\_dr\_reduce}
-\begin{alltt}
-int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);
-\end{alltt}
-
-This reduces $a$ in place modulo $b$ with the pre--computed value $mp$.  $b$ must be of a restricted
-diminished radix form and $a$ must be in the range $0 \le a < b^2$.  Diminished radix reductions are
-much faster than both Barrett and Montgomery reductions as they have a much lower asymptotic running time.
-
-Since the moduli are restricted this algorithm is not particularly useful for something like Rabin, RSA or
-BBS cryptographic purposes.  This reduction algorithm is useful for Diffie-Hellman and ECC where fixed
-primes are acceptable.
-
-Note that unlike Montgomery reduction there is no normalization process.  The result of this function is
-equal to the correct residue.
-
-\section{Unrestricted Diminished Radix}
-
-Unrestricted reductions work much like the restricted counterparts except in this case the modulus is of the
-form $2^k - p$ for $0 < p < \beta$.  In this sense the unrestricted reductions are more flexible as they
-can be applied to a wider range of numbers.
-
-\index{mp\_reduce\_2k\_setup}
-\begin{alltt}
-int mp_reduce_2k_setup(mp_int *a, mp_digit *d);
-\end{alltt}
-
-This will compute the required $d$ value for the given modulus $a$.
-
-\index{mp\_reduce\_2k}
-\begin{alltt}
-int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d);
-\end{alltt}
-
-This will reduce $a$ in place modulo $n$ with the pre--computed value $d$.  From my experience this routine is
-slower than mp\_dr\_reduce but faster for most moduli sizes than the Montgomery reduction.
-
-\chapter{Exponentiation}
-\section{Single Digit Exponentiation}
-\index{mp\_expt\_d\_ex}
-\begin{alltt}
-int mp_expt_d_ex (mp_int * a, mp_digit b, mp_int * c, int fast)
-\end{alltt}
-This function computes $c = a^b$.
-
-With parameter \textit{fast} set to $0$ the old version of the algorithm is used,
-when \textit{fast} is $1$, a faster but not statically timed version of the algorithm is used.
-
-The old version uses a simple binary left-to-right algorithm.
-It is faster than repeated multiplications by $a$ for all values of $b$ greater than three.
-
-The new version uses a binary right-to-left algorithm.
-
-The difference between the old and the new version is that the old version always
-executes $DIGIT\_BIT$ iterations. The new algorithm executes only $n$ iterations
-where $n$ is equal to the position of the highest bit that is set in $b$.
-
-\index{mp\_expt\_d}
-\begin{alltt}
-int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
-\end{alltt}
-mp\_expt\_d(a, b, c) is a wrapper function to mp\_expt\_d\_ex(a, b, c, 0).
-
-\section{Modular Exponentiation}
-\index{mp\_exptmod}
-\begin{alltt}
-int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-\end{alltt}
-This computes $Y \equiv G^X \mbox{ (mod }P\mbox{)}$ using a variable width sliding window algorithm.  This function
-will automatically detect the fastest modular reduction technique to use during the operation.  For negative values of
-$X$ the operation is performed as $Y \equiv (G^{-1} \mbox{ mod }P)^{\vert X \vert} \mbox{ (mod }P\mbox{)}$ provided that
-$gcd(G, P) = 1$.
-
-This function is actually a shell around the two internal exponentiation functions.  This routine will automatically
-detect when Barrett, Montgomery, Restricted and Unrestricted Diminished Radix based exponentiation can be used.  Generally
-moduli of the ``restricted diminished radix'' form lead to the fastest modular exponentiations.  Followed by Montgomery
-and the other two algorithms.
-
-\section{Root Finding}
-\index{mp\_n\_root}
-\begin{alltt}
-int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
-\end{alltt}
-This computes $c = a^{1/b}$ such that $c^b \le a$ and $(c+1)^b > a$.  The implementation of this function is not
-ideal for values of $b$ greater than three.  It will work but become very slow.  So unless you are working with very small
-numbers (less than 1000 bits) I'd avoid $b > 3$ situations.  Will return a positive root only for even roots and return
-a root with the sign of the input for odd roots.  For example, performing $4^{1/2}$ will return $2$ whereas $(-8)^{1/3}$
-will return $-2$.
-
-This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly.  Since
-the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large
-values of $b$.  If particularly large roots are required then a factor method could be used instead.  For example,
-$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$ or simply
-$\left ( \left ( \left ( a^{1/2} \right )^{1/2} \right )^{1/2} \right )^{1/2}$
-
-\chapter{Prime Numbers}
-\section{Trial Division}
-\index{mp\_prime\_is\_divisible}
-\begin{alltt}
-int mp_prime_is_divisible (mp_int * a, int *result)
-\end{alltt}
-This will attempt to evenly divide $a$ by a list of primes\footnote{Default is the first 256 primes.} and store the
-outcome in ``result''.  That is if $result = 0$ then $a$ is not divisible by the primes, otherwise it is.  Note that
-if the function does not return \textbf{MP\_OKAY} the value in ``result'' should be considered undefined\footnote{Currently
-the default is to set it to zero first.}.
-
-\section{Fermat Test}
-\index{mp\_prime\_fermat}
-\begin{alltt}
-int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
-\end{alltt}
-Performs a Fermat primality test to the base $b$.  That is it computes $b^a \mbox{ mod }a$ and tests whether the value is
-equal to $b$ or not.  If the values are equal then $a$ is probably prime and $result$ is set to one.  Otherwise $result$
-is set to zero.
-
-\section{Miller-Rabin Test}
-\index{mp\_prime\_miller\_rabin}
-\begin{alltt}
-int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
-\end{alltt}
-Performs a Miller-Rabin test to the base $b$ of $a$.  This test is much stronger than the Fermat test and is very hard to
-fool (besides with Carmichael numbers).  If $a$ passes the test (therefore is probably prime) $result$ is set to one.
-Otherwise $result$ is set to zero.
-
-Note that is suggested that you use the Miller-Rabin test instead of the Fermat test since all of the failures of
-Miller-Rabin are a subset of the failures of the Fermat test.
-
-\subsection{Required Number of Tests}
-Generally to ensure a number is very likely to be prime you have to perform the Miller-Rabin with at least a half-dozen
-or so unique bases.  However, it has been proven that the probability of failure goes down as the size of the input goes up.
-This is why a simple function has been provided to help out.
-
-\index{mp\_prime\_rabin\_miller\_trials}
-\begin{alltt}
-int mp_prime_rabin_miller_trials(int size)
-\end{alltt}
-This returns the number of trials required for a $2^{-96}$ (or lower) probability of failure for a given ``size'' expressed
-in bits.  This comes in handy specially since larger numbers are slower to test.  For example, a 512-bit number would
-require ten tests whereas a 1024-bit number would only require four tests.
-
-You should always still perform a trial division before a Miller-Rabin test though.
-
-\section{Primality Testing}
-\index{mp\_prime\_is\_prime}
-\begin{alltt}
-int mp_prime_is_prime (mp_int * a, int t, int *result)
-\end{alltt}
-This will perform a trial division followed by $t$ rounds of Miller-Rabin tests on $a$ and store the result in $result$.
-If $a$ passes all of the tests $result$ is set to one, otherwise it is set to zero.  Note that $t$ is bounded by
-$1 \le t < PRIME\_SIZE$ where $PRIME\_SIZE$ is the number of primes in the prime number table (by default this is $256$).
-
-\section{Next Prime}
-\index{mp\_prime\_next\_prime}
-\begin{alltt}
-int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
-\end{alltt}
-This finds the next prime after $a$ that passes mp\_prime\_is\_prime() with $t$ tests.  Set $bbs\_style$ to one if you
-want only the next prime congruent to $3 \mbox{ mod } 4$, otherwise set it to zero to find any next prime.
-
-\section{Random Primes}
-\index{mp\_prime\_random}
-\begin{alltt}
-int mp_prime_random(mp_int *a, int t, int size, int bbs,
-                    ltm_prime_callback cb, void *dat)
-\end{alltt}
-This will find a prime greater than $256^{size}$ which can be ``bbs\_style'' or not depending on $bbs$ and must pass
-$t$ rounds of tests.  The ``ltm\_prime\_callback'' is a typedef for
-
-\begin{alltt}
-typedef int ltm_prime_callback(unsigned char *dst, int len, void *dat);
-\end{alltt}
-
-Which is a function that must read $len$ bytes (and return the amount stored) into $dst$.  The $dat$ variable is simply
-copied from the original input.  It can be used to pass RNG context data to the callback.  The function
-mp\_prime\_random() is more suitable for generating primes which must be secret (as in the case of RSA) since there
-is no skew on the least significant bits.
-
-\textit{Note:}  As of v0.30 of the LibTomMath library this function has been deprecated.  It is still available
-but users are encouraged to use the new mp\_prime\_random\_ex() function instead.
-
-\subsection{Extended Generation}
-\index{mp\_prime\_random\_ex}
-\begin{alltt}
-int mp_prime_random_ex(mp_int *a,    int t,
-                       int     size, int flags,
-                       ltm_prime_callback cb, void *dat);
-\end{alltt}
-This will generate a prime in $a$ using $t$ tests of the primality testing algorithms.  The variable $size$
-specifies the bit length of the prime desired.  The variable $flags$ specifies one of several options available
-(see fig. \ref{fig:primeopts}) which can be OR'ed together.  The callback parameters are used as in
-mp\_prime\_random().
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|r|l|}
-\hline \textbf{Flag}         & \textbf{Meaning} \\
-\hline LTM\_PRIME\_BBS       & Make the prime congruent to $3$ modulo $4$ \\
-\hline LTM\_PRIME\_SAFE      & Make a prime $p$ such that $(p - 1)/2$ is also prime. \\
-                             & This option implies LTM\_PRIME\_BBS as well. \\
-\hline LTM\_PRIME\_2MSB\_OFF & Makes sure that the bit adjacent to the most significant bit \\
-                             & Is forced to zero.  \\
-\hline LTM\_PRIME\_2MSB\_ON  & Makes sure that the bit adjacent to the most significant bit \\
-                             & Is forced to one. \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Primality Generation Options}
-\label{fig:primeopts}
-\end{figure}
-
-\chapter{Input and Output}
-\section{ASCII Conversions}
-\subsection{To ASCII}
-\index{mp\_toradix}
-\begin{alltt}
-int mp_toradix (mp_int * a, char *str, int radix);
-\end{alltt}
-This still store $a$ in ``str'' as a base-``radix'' string of ASCII chars.  This function appends a NUL character
-to terminate the string.  Valid values of ``radix'' line in the range $[2, 64]$.  To determine the size (exact) required
-by the conversion before storing any data use the following function.
-
-\index{mp\_radix\_size}
-\begin{alltt}
-int mp_radix_size (mp_int * a, int radix, int *size)
-\end{alltt}
-This stores in ``size'' the number of characters (including space for the NUL terminator) required.  Upon error this
-function returns an error code and ``size'' will be zero.
-
-\subsection{From ASCII}
-\index{mp\_read\_radix}
-\begin{alltt}
-int mp_read_radix (mp_int * a, char *str, int radix);
-\end{alltt}
-This will read the base-``radix'' NUL terminated string from ``str'' into $a$.  It will stop reading when it reads a
-character it does not recognize (which happens to include th NUL char... imagine that...).  A single leading $-$ sign
-can be used to denote a negative number.
-
-\section{Binary Conversions}
-
-Converting an mp\_int to and from binary is another keen idea.
-
-\index{mp\_unsigned\_bin\_size}
-\begin{alltt}
-int mp_unsigned_bin_size(mp_int *a);
-\end{alltt}
-
-This will return the number of bytes (octets) required to store the unsigned copy of the integer $a$.
-
-\index{mp\_to\_unsigned\_bin}
-\begin{alltt}
-int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
-\end{alltt}
-This will store $a$ into the buffer $b$ in big--endian format.  Fortunately this is exactly what DER (or is it ASN?)
-requires.  It does not store the sign of the integer.
-
-\index{mp\_read\_unsigned\_bin}
-\begin{alltt}
-int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
-\end{alltt}
-This will read in an unsigned big--endian array of bytes (octets) from $b$ of length $c$ into $a$.  The resulting
-integer $a$ will always be positive.
-
-For those who acknowledge the existence of negative numbers (heretic!) there are ``signed'' versions of the
-previous functions.
-
-\begin{alltt}
-int mp_signed_bin_size(mp_int *a);
-int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
-int mp_to_signed_bin(mp_int *a, unsigned char *b);
-\end{alltt}
-They operate essentially the same as the unsigned copies except they prefix the data with zero or non--zero
-byte depending on the sign.  If the sign is zpos (e.g. not negative) the prefix is zero, otherwise the prefix
-is non--zero.
-
-\chapter{Algebraic Functions}
-\section{Extended Euclidean Algorithm}
-\index{mp\_exteuclid}
-\begin{alltt}
-int mp_exteuclid(mp_int *a, mp_int *b,
-                 mp_int *U1, mp_int *U2, mp_int *U3);
-\end{alltt}
-
-This finds the triple U1/U2/U3 using the Extended Euclidean algorithm such that the following equation holds.
-
-\begin{equation}
-a \cdot U1 + b \cdot U2 = U3
-\end{equation}
-
-Any of the U1/U2/U3 paramters can be set to \textbf{NULL} if they are not desired.
-
-\section{Greatest Common Divisor}
-\index{mp\_gcd}
-\begin{alltt}
-int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
-\end{alltt}
-This will compute the greatest common divisor of $a$ and $b$ and store it in $c$.
-
-\section{Least Common Multiple}
-\index{mp\_lcm}
-\begin{alltt}
-int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
-\end{alltt}
-This will compute the least common multiple of $a$ and $b$ and store it in $c$.
-
-\section{Jacobi Symbol}
-\index{mp\_jacobi}
-\begin{alltt}
-int mp_jacobi (mp_int * a, mp_int * p, int *c)
-\end{alltt}
-This will compute the Jacobi symbol for $a$ with respect to $p$.  If $p$ is prime this essentially computes the Legendre
-symbol.  The result is stored in $c$ and can take on one of three values $\lbrace -1, 0, 1 \rbrace$.  If $p$ is prime
-then the result will be $-1$ when $a$ is not a quadratic residue modulo $p$.  The result will be $0$ if $a$ divides $p$
-and the result will be $1$ if $a$ is a quadratic residue modulo $p$.
-
-\section{Modular square root}
-\index{mp\_sqrtmod\_prime}
-\begin{alltt}
-int mp_sqrtmod_prime(mp_int *n, mp_int *p, mp_int *r)
-\end{alltt}
-
-This will solve the modular equatioon $r^2 = n \mod p$ where $p$ is a prime number greater than 2 (odd prime).
-The result is returned in the third argument $r$, the function returns \textbf{MP\_OKAY} on success,
-other return values indicate failure.
-
-The implementation is split for two different cases:
-
-1. if $p \mod 4 == 3$ we apply \href{http://cacr.uwaterloo.ca/hac/}{Handbook of Applied Cryptography algorithm 3.36} and compute $r$ directly as
-$r = n^{(p+1)/4} \mod p$
-
-2. otherwise we use \href{https://en.wikipedia.org/wiki/Tonelli-Shanks_algorithm}{Tonelli-Shanks algorithm}
-
-The function does not check the primality of parameter $p$ thus it is up to the caller to assure that this parameter
-is a prime number. When $p$ is a composite the function behaviour is undefined, it may even return a false-positive
-\textbf{MP\_OKAY}.
-
-\section{Modular Inverse}
-\index{mp\_invmod}
-\begin{alltt}
-int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-\end{alltt}
-Computes the multiplicative inverse of $a$ modulo $b$ and stores the result in $c$ such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$.
-
-\section{Single Digit Functions}
-
-For those using small numbers (\textit{snicker snicker}) there are several ``helper'' functions
-
-\index{mp\_add\_d} \index{mp\_sub\_d} \index{mp\_mul\_d} \index{mp\_div\_d} \index{mp\_mod\_d}
-\begin{alltt}
-int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
-int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
-int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
-int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
-int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
-\end{alltt}
-
-These work like the full mp\_int capable variants except the second parameter $b$ is a mp\_digit.  These
-functions fairly handy if you have to work with relatively small numbers since you will not have to allocate
-an entire mp\_int to store a number like $1$ or $2$.
-
-\input{bn.ind}
-
-\end{document}
diff --git a/libtommath/bn_mp_export.c b/libtommath/bn_mp_export.c
new file mode 100644
index 0000000..ac4c2f9
--- /dev/null
+++ b/libtommath/bn_mp_export.c
@@ -0,0 +1,88 @@
+#include <tommath_private.h>
+#ifdef BN_MP_EXPORT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tstdenis82@gmail.com, http://libtom.org
+ */
+
+/* Export op as count words of size bytes each, like gmp's mpz_export,
+ * see http://gmplib.org/manual/Integer-Import-and-Export.html
+ * order:  -1 least significant word first, else most significant first
+ * endian: -1 least significant byte first, 0 host order, else most first
+ * nails:  high bits of each word written as zero, must be < size * 8
+ * countp: if not NULL receives count; rop must hold count * size bytes
+ * Returns MP_OKAY, MP_VAL if no value bits remain in a word, or the error
+ * of a failing libtommath call.  The sign of op is not written. */
+int mp_export(void* rop, size_t* countp, int order, size_t size,
+                                int endian, size_t nails, mp_int* op) {
+	int result;
+	size_t odd_nails, nail_bytes, value_bits, i, j, bits, count;
+	unsigned char odd_nail_mask;
+	mp_int t;
+
+	/* size == 0 or nails >= size * 8 would make the word count divide by zero */
+	nail_bytes = nails / 8;
+	if (nail_bytes >= size) {
+		return MP_VAL;
+	}
+	if ((result = mp_init_copy(&t, op)) != MP_OKAY) {
+		return result;
+	}
+	if (endian == 0) {
+		/* host order requested: probe which byte of an int is stored first */
+		union {
+			unsigned int i;
+			char c[4];
+		} lint;
+		lint.i = 0x01020304;
+		endian = (lint.c[0] == 4) ? -1 : 1;
+	}
+	/* mask that clears the top odd_nails bits of a byte */
+	odd_nails = (nails % 8);
+	odd_nail_mask = 0xff;
+	for (i = 0; i < odd_nails; ++i) {
+		odd_nail_mask ^= (1 << (7 - i));
+	}
+	bits = mp_count_bits(&t);
+	value_bits = (size * 8) - nails;
+	count = (bits / value_bits) + (((bits % value_bits) != 0) ? 1 : 0);
+	for (i = 0; i < count; ++i) {
+		for (j = 0; j < size; ++j) {
+			unsigned char* byte = (
+				(unsigned char*)rop +
+				(((order == -1) ? i : ((count - 1) - i)) * size) +
+				((endian == -1) ? j : ((size - 1) - j))
+			);
+			if (j >= (size - nail_bytes)) {
+				*byte = 0;
+				continue;
+			}
+			*byte = (unsigned char)((j == ((size - nail_bytes) - 1)) ? (t.dp[0] & odd_nail_mask) : (t.dp[0] & 0xFF));
+			if ((result = mp_div_2d(&t, ((j == ((size - nail_bytes) - 1)) ? (8 - odd_nails) : 8), &t, NULL)) != MP_OKAY) {
+				mp_clear(&t);
+				return result;
+			}
+		}
+	}
+	mp_clear(&t);
+	if (countp != NULL) {
+		*countp = count;
+	}
+	return MP_OKAY;
+}
+
+#endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/libtommath/bn_mp_get_long.c b/libtommath/bn_mp_get_long.c
new file mode 100644
index 0000000..7c3d0fe
--- /dev/null
+++ b/libtommath/bn_mp_get_long.c
@@ -0,0 +1,41 @@
+#include <tommath_private.h>
+#ifdef BN_MP_GET_LONG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tstdenis82@gmail.com, http://libtom.org
+ */
+
+/* Return the low-order digits of a packed into an unsigned long; a->sign is not read. */
+unsigned long mp_get_long(mp_int * a)
+{
+  /* digits needed to cover every bit of an unsigned long */
+  const int wanted = (int)(((sizeof(unsigned long) * CHAR_BIT) + DIGIT_BIT - 1) / DIGIT_BIT);
+  unsigned long value;
+  int pos;
+
+  if (a->used == 0) {
+     return 0;
+  }
+  /* begin with the highest digit that contributes to the result */
+  pos   = MIN(a->used, wanted) - 1;
+  value = DIGIT(a,pos);
+
+#if (ULONG_MAX != 0xffffffffuL) || (DIGIT_BIT < 32)
+  /* append the lower digits, most significant first */
+  for (pos = pos - 1; pos >= 0; --pos) {
+    value = (value << DIGIT_BIT) | DIGIT(a,pos);
+  }
+#endif
+  return value;
+}
+#endif
diff --git a/libtommath/bn_mp_get_long_long.c b/libtommath/bn_mp_get_long_long.c
new file mode 100644
index 0000000..4b959e6
--- /dev/null
+++ b/libtommath/bn_mp_get_long_long.c
@@ -0,0 +1,41 @@
+#include <tommath_private.h>
+#ifdef BN_MP_GET_LONG_LONG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tstdenis82@gmail.com, http://libtom.org
+ */
+
+/* Return the low-order digits of a packed into an unsigned long long; a->sign is not read. */
+unsigned long long mp_get_long_long (mp_int * a)
+{
+  /* digits needed to cover every bit of an unsigned long long */
+  const int wanted = (int)(((sizeof(unsigned long long) * CHAR_BIT) + DIGIT_BIT - 1) / DIGIT_BIT);
+  unsigned long long value;
+  int pos;
+
+  if (a->used == 0) {
+     return 0;
+  }
+  /* begin with the highest digit that contributes to the result */
+  pos   = MIN(a->used, wanted) - 1;
+  value = DIGIT(a,pos);
+
+#if DIGIT_BIT < 64
+  /* append the lower digits, most significant first */
+  for (pos = pos - 1; pos >= 0; --pos) {
+    value = (value << DIGIT_BIT) | DIGIT(a,pos);
+  }
+#endif
+  return value;
+}
+#endif
diff --git a/libtommath/bn_mp_import.c b/libtommath/bn_mp_import.c
new file mode 100644
index 0000000..dd4b8e6
--- /dev/null
+++ b/libtommath/bn_mp_import.c
@@ -0,0 +1,73 @@
+#include <tommath_private.h>
+#ifdef BN_MP_IMPORT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tstdenis82@gmail.com, http://libtom.org
+ */
+
+/* Import count words of size bytes each from op into rop, like gmp's
+ * mpz_import, see http://gmplib.org/manual/Integer-Import-and-Export.html
+ * order:  1 most significant word first, else least significant first
+ * endian: 1 most significant byte first, 0 host order, else least first
+ * nails:  high bits of each word that are skipped while reading
+ * Returns MP_OKAY, MP_VAL if nails / 8 exceeds size (rop is then left
+ * untouched), or the error of a failing mp_mul_2d. */
+int mp_import(mp_int* rop, size_t count, int order, size_t size,
+                            int endian, size_t nails, const void* op) {
+	int result;
+	size_t odd_nails, nail_bytes, i, j;
+	unsigned char odd_nail_mask;
+	const unsigned char* src = (const unsigned char*)op;
+	/* size - nail_bytes is unsigned: unchecked it wraps and the loop reads outside op */
+	nail_bytes = nails / 8;
+	if (nail_bytes > size) {
+		return MP_VAL;
+	}
+	mp_zero(rop);
+	if (endian == 0) {
+		union {
+			unsigned int i;
+			char c[4];
+		} lint;
+		lint.i = 0x01020304;
+		endian = (lint.c[0] == 4) ? -1 : 1;
+	}
+	/* mask that clears the top odd_nails bits of a byte */
+	odd_nails = (nails % 8);
+	odd_nail_mask = 0xff;
+	for (i = 0; i < odd_nails; ++i) {
+		odd_nail_mask ^= (1 << (7 - i));
+	}
+	for (i = 0; i < count; ++i) {
+		for (j = 0; j < (size - nail_bytes); ++j) {
+			unsigned char byte = src[
+				(((order == 1) ? i : ((count - 1) - i)) * size) +
+				((endian == 1) ? (j + nail_bytes) : (((size - 1) - j) - nail_bytes))
+			];
+			/* shift up what was read so far; a word's first byte adds only 8 - odd_nails bits */
+			if ((result = mp_mul_2d(rop, ((j == 0) ? (8 - odd_nails) : 8), rop)) != MP_OKAY) {
+				return result;
+			}
+			rop->dp[0] |= (j == 0) ? (byte & odd_nail_mask) : byte;
+			rop->used  += 1;
+		}
+	}
+	mp_clamp(rop);
+	return MP_OKAY;
+}
+
+#endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/libtommath/bn_mp_n_root_ex.c b/libtommath/bn_mp_n_root_ex.c
new file mode 100644
index 0000000..79d1dfb
--- /dev/null
+++ b/libtommath/bn_mp_n_root_ex.c
@@ -0,0 +1,132 @@
+#include <tommath_private.h>
+#ifdef BN_MP_N_ROOT_EX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tstdenis82@gmail.com, http://libtom.org
+ */
+
+/* find the b'th root of a and store it in c
+ *
+ * Result found such that (c)**b <= a and (c+1)**b > a
+ * Uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]);
+ * it is not meant to find huge roots [square and cube, etc].
+ *
+ * fast is handed unchanged to mp_expt_d_ex.  Returns MP_VAL for a negative a
+ * with an even b, the error of a failing libtommath call, else MP_OKAY.
+ * The sign of a is cleared while computing and restored on every exit path.
+ */
+int mp_n_root_ex (mp_int * a, mp_digit b, mp_int * c, int fast)
+{
+  mp_int  t1, t2, t3;
+  int     res, neg;
+
+  /* input must be positive if b is even */
+  if (((b & 1) == 0) && (a->sign == MP_NEG)) {
+    return MP_VAL;
+  }
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto LBL_T1;
+  }
+
+  if ((res = mp_init (&t3)) != MP_OKAY) {
+    goto LBL_T2;
+  }
+
+  /* if a is negative fudge the sign but keep track; undone at LBL_T3 */
+  neg     = a->sign;
+  a->sign = MP_ZPOS;
+
+  /* t2 = 2 */
+  mp_set (&t2, 2);
+
+  do {
+    /* t1 = t2 */
+    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
+
+    /* t3 = t1**(b-1) */
+    if ((res = mp_expt_d_ex (&t1, b - 1, &t3, fast)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    /* numerator */
+    /* t2 = t1**b */
+    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    /* t2 = t1**b - a */
+    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    /* denominator */
+    /* t3 = t1**(b-1) * b  */
+    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    /* t3 = (t1**b - a)/(b * t1**(b-1)) */
+    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+    /* t2 = t1 - t3, the next approximation */
+    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+  }  while (mp_cmp (&t1, &t2) != MP_EQ);
+
+  /* result can be off by a few so check */
+  for (;;) {
+    if ((res = mp_expt_d_ex (&t1, b, &t2, fast)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    if (mp_cmp (&t2, a) == MP_GT) {
+      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
+         goto LBL_T3;
+      }
+    } else {
+      break;
+    }
+  }
+
+  /* set the result */
+  mp_exch (&t1, c);
+
+  /* set the sign of the result */
+  c->sign = neg;
+
+  res = MP_OKAY;
+
+LBL_T3:
+  /* success and error paths both pass here: give a its original sign back */
+  a->sign = neg;
+  mp_clear (&t3);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+  return res;
+}
+#endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/libtommath/bn_mp_set_long.c b/libtommath/bn_mp_set_long.c
new file mode 100644
index 0000000..281fce7
--- /dev/null
+++ b/libtommath/bn_mp_set_long.c
@@ -0,0 +1,24 @@
+#include <tommath_private.h>
+#ifdef BN_MP_SET_LONG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tstdenis82@gmail.com, http://libtom.org
+ */
+
+/* set a platform dependent unsigned long int; NOTE(review): MP_SET_XLONG is not defined in this file (it arrives via tommath_private.h) and presumably expands to the definition of mp_set_long() - confirm */
+MP_SET_XLONG(mp_set_long, unsigned long)
+#endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/libtommath/bn_mp_set_long_long.c b/libtommath/bn_mp_set_long_long.c
new file mode 100644
index 0000000..3c4b01a
--- /dev/null
+++ b/libtommath/bn_mp_set_long_long.c
@@ -0,0 +1,24 @@
+#include <tommath_private.h>
+#ifdef BN_MP_SET_LONG_LONG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tstdenis82@gmail.com, http://libtom.org
+ */
+
+/* set a platform dependent unsigned long long int; NOTE(review): MP_SET_XLONG is not defined in this file (it arrives via tommath_private.h) and presumably expands to the definition of mp_set_long_long() - confirm */
+MP_SET_XLONG(mp_set_long_long, unsigned long long)
+#endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
diff --git a/libtommath/bn_mp_sqrt.c b/libtommath/bn_mp_sqrt.c
index 178059e..7c9d25d 100644
--- a/libtommath/bn_mp_sqrt.c
+++ b/libtommath/bn_mp_sqrt.c
@@ -1,5 +1,4 @@
 #include <tommath_private.h>
-
 #ifdef BN_MP_SQRT_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
diff --git a/libtommath/bn_mp_sqrtmod_prime.c b/libtommath/bn_mp_sqrtmod_prime.c
new file mode 100644
index 0000000..968729e
--- /dev/null
+++ b/libtommath/bn_mp_sqrtmod_prime.c
@@ -0,0 +1,124 @@
+#include <tommath_private.h>
+#ifdef BN_MP_SQRTMOD_PRIME_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ */
+
+/* Solve ret^2 == n (mod prime) for an odd prime; Tonelli-Shanks algorithm
+ * https://en.wikipedia.org/wiki/Tonelli%E2%80%93Shanks_algorithm
+ * https://gmplib.org/list-archives/gmp-discuss/2013-April/005300.html
+ * Returns MP_OKAY with the root in ret, MP_VAL if prime is 2 or n is a
+ * quadratic non-residue, else the error code of the failing call. */
+
+int mp_sqrtmod_prime(mp_int *n, mp_int *prime, mp_int *ret)
+{
+  int res, legendre;
+  mp_int t1, C, Q, S, Z, M, T, R, two;
+  mp_digit i;
+
+  /* first handle the simple cases: for n == 0 the result is 0 */
+  if (mp_cmp_d(n, 0) == MP_EQ) {
+    mp_zero(ret);
+    return MP_OKAY;
+  }
+  if (mp_cmp_d(prime, 2) == MP_EQ)                              return MP_VAL; /* prime must be odd */
+  if ((res = mp_jacobi(n, prime, &legendre)) != MP_OKAY)        return res;
+  if (legendre == -1)                                           return MP_VAL; /* quadratic non-residue mod prime */
+
+  if ((res = mp_init_multi(&t1, &C, &Q, &S, &Z, &M, &T, &R, &two, NULL)) != MP_OKAY) {
+	return res;
+  }
+
+  /* SPECIAL CASE: if prime mod 4 == 3
+   * compute directly: ret = n^((prime+1)/4) mod prime
+   * Handbook of Applied Cryptography algorithm 3.36
+   */
+  if ((res = mp_mod_d(prime, 4, &i)) != MP_OKAY)                goto cleanup;
+  if (i == 3) {
+    if ((res = mp_add_d(prime, 1, &t1)) != MP_OKAY)             goto cleanup;
+    if ((res = mp_div_2(&t1, &t1)) != MP_OKAY)                  goto cleanup;
+    if ((res = mp_div_2(&t1, &t1)) != MP_OKAY)                  goto cleanup;
+    if ((res = mp_exptmod(n, &t1, prime, ret)) != MP_OKAY)      goto cleanup;
+    res = MP_OKAY;
+    goto cleanup;
+  }
+
+  /* NOW: Tonelli-Shanks algorithm */
+
+  /* factor out powers of 2 from prime-1, defining Q and S as: prime-1 = Q*2^S */
+  if ((res = mp_copy(prime, &Q)) != MP_OKAY)                    goto cleanup;
+  if ((res = mp_sub_d(&Q, 1, &Q)) != MP_OKAY)                   goto cleanup;
+  /* Q = prime - 1 */
+  mp_zero(&S);
+  /* S = 0 */
+  while (mp_iseven(&Q) != MP_NO) {
+    if ((res = mp_div_2(&Q, &Q)) != MP_OKAY)                    goto cleanup;
+    /* Q = Q / 2 */
+    if ((res = mp_add_d(&S, 1, &S)) != MP_OKAY)                 goto cleanup;
+    /* S = S + 1 */
+  }
+
+  /* find a Z such that the Legendre symbol (Z|prime) == -1 */
+  if ((res = mp_set_int(&Z, 2)) != MP_OKAY)                     goto cleanup;
+  /* Z = 2 */
+  while(1) {
+    if ((res = mp_jacobi(&Z, prime, &legendre)) != MP_OKAY)     goto cleanup;
+    if (legendre == -1) break;
+    if ((res = mp_add_d(&Z, 1, &Z)) != MP_OKAY)                 goto cleanup;
+    /* Z = Z + 1 */
+  }
+
+  if ((res = mp_exptmod(&Z, &Q, prime, &C)) != MP_OKAY)         goto cleanup;
+  /* C = Z ^ Q mod prime */
+  if ((res = mp_add_d(&Q, 1, &t1)) != MP_OKAY)                  goto cleanup;
+  if ((res = mp_div_2(&t1, &t1)) != MP_OKAY)                    goto cleanup;
+  /* t1 = (Q + 1) / 2 */
+  if ((res = mp_exptmod(n, &t1, prime, &R)) != MP_OKAY)         goto cleanup;
+  /* R = n ^ ((Q + 1) / 2) mod prime */
+  if ((res = mp_exptmod(n, &Q, prime, &T)) != MP_OKAY)          goto cleanup;
+  /* T = n ^ Q mod prime */
+  if ((res = mp_copy(&S, &M)) != MP_OKAY)                       goto cleanup;
+  /* M = S */
+  if ((res = mp_set_int(&two, 2)) != MP_OKAY)                   goto cleanup;
+  /* two = 2, used below as the exponent for squaring and as the base of 2 ^ (M - i - 1) */
+  res = MP_VAL; /* NOTE(review): dead store, the mp_copy just below overwrites res before it is read */
+  while (1) {
+    if ((res = mp_copy(&T, &t1)) != MP_OKAY)                    goto cleanup;
+    i = 0; /* i counts how often t1 (a copy of T) must be squared mod prime to reach 1 */
+    while (1) { /* NOTE(review): no iteration bound; presumably relies on prime really being prime - confirm */
+      if (mp_cmp_d(&t1, 1) == MP_EQ) break;
+      if ((res = mp_exptmod(&t1, &two, prime, &t1)) != MP_OKAY) goto cleanup;
+      i++;
+    }
+    if (i == 0) { /* T is already 1: R is the result */
+      if ((res = mp_copy(&R, ret)) != MP_OKAY)                  goto cleanup;
+      res = MP_OKAY;
+      goto cleanup;
+    }
+    if ((res = mp_sub_d(&M, i, &t1)) != MP_OKAY)                goto cleanup;
+    if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY)               goto cleanup;
+    if ((res = mp_exptmod(&two, &t1, prime, &t1)) != MP_OKAY)   goto cleanup;
+    /* t1 = 2 ^ (M - i - 1) mod prime */
+    if ((res = mp_exptmod(&C, &t1, prime, &t1)) != MP_OKAY)     goto cleanup;
+    /* t1 = C ^ (2 ^ (M - i - 1)) mod prime */
+    if ((res = mp_sqrmod(&t1, prime, &C)) != MP_OKAY)           goto cleanup;
+    /* C = (t1 * t1) mod prime */
+    if ((res = mp_mulmod(&R, &t1, prime, &R)) != MP_OKAY)       goto cleanup;
+    /* R = (R * t1) mod prime */
+    if ((res = mp_mulmod(&T, &C, prime, &T)) != MP_OKAY)        goto cleanup;
+    /* T = (T * C) mod prime */
+    mp_set(&M, i);
+    /* M = i */
+  }
+
+cleanup:
+  mp_clear_multi(&t1, &C, &Q, &S, &Z, &M, &T, &R, &two, NULL);
+  return res;
+}
+
+#endif
diff --git a/libtommath/booker.pl b/libtommath/booker.pl
deleted file mode 100644
index c2abae6..0000000
--- a/libtommath/booker.pl
+++ /dev/null
@@ -1,267 +0,0 @@
-#!/bin/perl
-#
-#Used to prepare the book "tommath.src" for LaTeX by pre-processing it into a .tex file
-#
-#Essentially you write the "tommath.src" as normal LaTex except where you want code snippets you put
-#
-#EXAM,file
-#
-#This preprocessor will then open "file" and insert it as a verbatim copy.
-#
-#Tom St Denis
-
-#get graphics type
-if (shift =~ /PDF/) {
-   $graph = "";
-} else {
-   $graph = ".ps";
-}
-
-open(IN,"<tommath.src") or die "Can't open source file";
-open(OUT,">tommath.tex") or die "Can't open destination file";
-
-print "Scanning for sections\n";
-$chapter = $section = $subsection = 0;
-$x = 0;
-while (<IN>) {
-   print ".";
-   if (!(++$x % 80)) { print "\n"; }
-   #update the headings
-   if (~($_ =~ /\*/)) {
-      if ($_ =~ /\\chapter\{.+}/) {
-          ++$chapter;
-          $section = $subsection = 0;
-      } elsif ($_ =~ /\\section\{.+}/) {
-          ++$section;
-          $subsection = 0;
-      } elsif ($_ =~ /\\subsection\{.+}/) {
-          ++$subsection;
-      }
-   }
-
-   if ($_ =~ m/MARK/) {
-      @m = split(",",$_);
-      chomp(@m[1]);
-      $index1{@m[1]} = $chapter;
-      $index2{@m[1]} = $section;
-      $index3{@m[1]} = $subsection;
-   }
-}
-close(IN);
-
-open(IN,"<tommath.src") or die "Can't open source file";
-$readline = $wroteline = 0;
-$srcline = 0;
-
-while (<IN>) {
-   ++$readline;
-   ++$srcline;
-
-   if ($_ =~ m/MARK/) {
-   } elsif ($_ =~ m/EXAM/ || $_ =~ m/LIST/) {
-      if ($_ =~ m/EXAM/) {
-         $skipheader = 1;
-      } else {
-         $skipheader = 0;
-      }
-
-      # EXAM,file
-      chomp($_);
-      @m = split(",",$_);
-      open(SRC,"<$m[1]") or die "Error:$srcline:Can't open source file $m[1]";
-
-      print "$srcline:Inserting $m[1]:";
-
-      $line = 0;
-      $tmp = $m[1];
-      $tmp =~ s/_/"\\_"/ge;
-      print OUT "\\vspace{+3mm}\\begin{small}\n\\hspace{-5.1mm}{\\bf File}: $tmp\n\\vspace{-3mm}\n\\begin{alltt}\n";
-      $wroteline += 5;
-
-      if ($skipheader == 1) {
-         # scan till next end of comment, e.g. skip license
-         while (<SRC>) {
-            $text[$line++] = $_;
-            last if ($_ =~ /libtom\.org/);
-         }
-         <SRC>;
-      }
-
-      $inline = 0;
-      while (<SRC>) {
-      next if ($_ =~ /\$Source/);
-      next if ($_ =~ /\$Revision/);
-      next if ($_ =~ /\$Date/);
-         $text[$line++] = $_;
-         ++$inline;
-         chomp($_);
-         $_ =~ s/\t/"    "/ge;
-         $_ =~ s/{/"^{"/ge;
-         $_ =~ s/}/"^}"/ge;
-         $_ =~ s/\\/'\symbol{92}'/ge;
-         $_ =~ s/\^/"\\"/ge;
-
-         printf OUT ("%03d   ", $line);
-         for ($x = 0; $x < length($_); $x++) {
-             print OUT chr(vec($_, $x, 8));
-             if ($x == 75) {
-                 print OUT "\n      ";
-                 ++$wroteline;
-             }
-         }
-         print OUT "\n";
-         ++$wroteline;
-      }
-      $totlines = $line;
-      print OUT "\\end{alltt}\n\\end{small}\n";
-      close(SRC);
-      print "$inline lines\n";
-      $wroteline += 2;
-   } elsif ($_ =~ m/@\d+,.+@/) {
-     # line contains [number,text]
-     # e.g. @14,for (ix = 0)@
-     $txt = $_;
-     while ($txt =~ m/@\d+,.+@/) {
-        @m = split("@",$txt);      # splits into text, one, two
-        @parms = split(",",$m[1]);  # splits one,two into two elements
-
-        # now search from $parms[0] down for $parms[1]
-        $found1 = 0;
-        $found2 = 0;
-        for ($i = $parms[0]; $i < $totlines && $found1 == 0; $i++) {
-           if ($text[$i] =~ m/\Q$parms[1]\E/) {
-              $foundline1 = $i + 1;
-              $found1 = 1;
-           }
-        }
-
-        # now search backwards
-        for ($i = $parms[0] - 1; $i >= 0 && $found2 == 0; $i--) {
-           if ($text[$i] =~ m/\Q$parms[1]\E/) {
-              $foundline2 = $i + 1;
-              $found2 = 1;
-           }
-        }
-
-        # now use the closest match or the first if tied
-        if ($found1 == 1 && $found2 == 0) {
-           $found = 1;
-           $foundline = $foundline1;
-        } elsif ($found1 == 0 && $found2 == 1) {
-           $found = 1;
-           $foundline = $foundline2;
-        } elsif ($found1 == 1 && $found2 == 1) {
-           $found = 1;
-           if (($foundline1 - $parms[0]) <= ($parms[0] - $foundline2)) {
-              $foundline = $foundline1;
-           } else {
-              $foundline = $foundline2;
-           }
-        } else {
-           $found = 0;
-        }
-
-        # if found replace
-        if ($found == 1) {
-           $delta = $parms[0] - $foundline;
-           print "Found replacement tag for \"$parms[1]\" on line $srcline which refers to line $foundline (delta $delta)\n";
-           $_ =~ s/@\Q$m[1]\E@/$foundline/;
-        } else {
-           print "ERROR:  The tag \"$parms[1]\" on line $srcline was not found in the most recently parsed source!\n";
-        }
-
-        # remake the rest of the line
-        $cnt = @m;
-        $txt = "";
-        for ($i = 2; $i < $cnt; $i++) {
-            $txt = $txt . $m[$i] . "@";
-        }
-     }
-     print OUT $_;
-     ++$wroteline;
-   } elsif ($_ =~ /~.+~/) {
-      # line contains a ~text~ pair used to refer to indexing :-)
-      $txt = $_;
-      while ($txt =~ /~.+~/) {
-         @m = split("~", $txt);
-
-         # word is the second position
-         $word = @m[1];
-         $a = $index1{$word};
-         $b = $index2{$word};
-         $c = $index3{$word};
-
-         # if chapter (a) is zero it wasn't found
-         if ($a == 0) {
-            print "ERROR: the tag \"$word\" on line $srcline was not found previously marked.\n";
-         } else {
-            # format the tag as x, x.y or x.y.z depending on the values
-            $str = $a;
-            $str = $str . ".$b" if ($b != 0);
-            $str = $str . ".$c" if ($c != 0);
-
-            if ($b == 0 && $c == 0) {
-               # its a chapter
-               if ($a <= 10) {
-                  if ($a == 1) {
-                     $str = "chapter one";
-                  } elsif ($a == 2) {
-                     $str = "chapter two";
-                  } elsif ($a == 3) {
-                     $str = "chapter three";
-                  } elsif ($a == 4) {
-                     $str = "chapter four";
-                  } elsif ($a == 5) {
-                     $str = "chapter five";
-                  } elsif ($a == 6) {
-                     $str = "chapter six";
-                  } elsif ($a == 7) {
-                     $str = "chapter seven";
-                  } elsif ($a == 8) {
-                     $str = "chapter eight";
-                  } elsif ($a == 9) {
-                     $str = "chapter nine";
-                  } elsif ($a == 10) {
-                     $str = "chapter ten";
-                  }
-               } else {
-                  $str = "chapter " . $str;
-               }
-            } else {
-               $str = "section " . $str     if ($b != 0 && $c == 0);
-               $str = "sub-section " . $str if ($b != 0 && $c != 0);
-            }
-
-            #substitute
-            $_ =~ s/~\Q$word\E~/$str/;
-
-            print "Found replacement tag for marker \"$word\" on line $srcline which refers to $str\n";
-         }
-
-         # remake rest of the line
-         $cnt = @m;
-         $txt = "";
-         for ($i = 2; $i < $cnt; $i++) {
-             $txt = $txt . $m[$i] . "~";
-         }
-      }
-      print OUT $_;
-      ++$wroteline;
-   } elsif ($_ =~ m/FIGU/) {
-      # FIGU,file,caption
-      chomp($_);
-      @m = split(",", $_);
-      print OUT "\\begin{center}\n\\begin{figure}[here]\n\\includegraphics{pics/$m[1]$graph}\n";
-      print OUT "\\caption{$m[2]}\n\\label{pic:$m[1]}\n\\end{figure}\n\\end{center}\n";
-      $wroteline += 4;
-   } else {
-      print OUT $_;
-      ++$wroteline;
-   }
-}
-print "Read $readline lines, wrote $wroteline lines\n";
-
-close (OUT);
-close (IN);
-
-system('perl -pli -e "s/\s*$//" tommath.tex');
diff --git a/libtommath/demo/demo.c b/libtommath/demo/demo.c
deleted file mode 100644
index b46b7f8..0000000
--- a/libtommath/demo/demo.c
+++ /dev/null
@@ -1,986 +0,0 @@
-#include <string.h>
-#include <time.h>
-
-#ifdef IOWNANATHLON
-#include <unistd.h>
-#define SLEEP sleep(4)
-#else
-#define SLEEP
-#endif
-
-/*
- * Configuration
- */
-#ifndef LTM_DEMO_TEST_VS_MTEST
-#define LTM_DEMO_TEST_VS_MTEST 1
-#endif
-
-#ifndef LTM_DEMO_TEST_REDUCE_2K_L
-/* This test takes a moment so we disable it by default, but it can be:
- * 0 to disable testing
- * 1 to make the test with P = 2^1024 - 0x2A434 B9FDEC95 D8F9D550 FFFFFFFF FFFFFFFF
- * 2 to make the test with P = 2^2048 - 0x1 00000000 00000000 00000000 00000000 4945DDBF 8EA2A91D 5776399B B83E188F
- */
-#define LTM_DEMO_TEST_REDUCE_2K_L 0
-#endif
-
-#ifdef LTM_DEMO_REAL_RAND
-#define LTM_DEMO_RAND_SEED  time(NULL)
-#else
-#define LTM_DEMO_RAND_SEED  23
-#endif
-
-#include "tommath.h"
-
-void ndraw(mp_int * a, char *name)
-{
-   char buf[16000];
-
-   printf("%s: ", name);
-   mp_toradix(a, buf, 10);
-   printf("%s\n", buf);
-   mp_toradix(a, buf, 16);
-   printf("0x%s\n", buf);
-}
-
-#if LTM_DEMO_TEST_VS_MTEST
-static void draw(mp_int * a)
-{
-   ndraw(a, "");
-}
-#endif
-
-
-unsigned long lfsr = 0xAAAAAAAAUL;
-
-int lbit(void)
-{
-   if (lfsr & 0x80000000UL) {
-      lfsr = ((lfsr << 1) ^ 0x8000001BUL) & 0xFFFFFFFFUL;
-      return 1;
-   } else {
-      lfsr <<= 1;
-      return 0;
-   }
-}
-
-#if defined(LTM_DEMO_REAL_RAND) && !defined(_WIN32)
-static FILE* fd_urandom;
-#endif
-int myrng(unsigned char *dst, int len, void *dat)
-{
-   int x;
-   (void)dat;
-#if defined(LTM_DEMO_REAL_RAND)
-   if (!fd_urandom) {
-#if !defined(_WIN32)
-      fprintf(stderr, "\nno /dev/urandom\n");
-#endif
-   }
-   else {
-      return fread(dst, 1, len, fd_urandom);
-   }
-#endif
-   for (x = 0; x < len; ) {
-      unsigned int r = (unsigned int)rand();
-      do {
-         dst[x++] = r & 0xFF;
-         r >>= 8;
-      } while((r != 0) && (x < len));
-   }
-   return len;
-}
-
-#if LTM_DEMO_TEST_VS_MTEST != 0
-static void _panic(int l)
-{
-  fprintf(stderr, "\n%d: fgets failed\n", l);
-  exit(EXIT_FAILURE);
-}
-#endif
-
-mp_int a, b, c, d, e, f;
-
-static void _cleanup(void)
-{
-  mp_clear_multi(&a, &b, &c, &d, &e, &f, NULL);
-  printf("\n");
-
-#ifdef LTM_DEMO_REAL_RAND
-  if(fd_urandom)
-     fclose(fd_urandom);
-#endif
-}
-struct mp_sqrtmod_prime_st {
-   unsigned long p;
-   unsigned long n;
-   mp_digit r;
-};
-struct mp_sqrtmod_prime_st sqrtmod_prime[] = {
-      { 5, 14, 3 },
-      { 7, 9, 4 },
-      { 113, 2, 62 }
-};
-struct mp_jacobi_st {
-   unsigned long n;
-   int c[16];
-};
-struct mp_jacobi_st jacobi[] = {
-      { 3, {  1, -1,  0,  1, -1,  0,  1, -1,  0,  1, -1,  0,  1, -1,  0,  1 } },
-      { 5, {  0,  1, -1, -1,  1,  0,  1, -1, -1,  1,  0,  1, -1, -1,  1,  0 } },
-      { 7, {  1, -1,  1, -1, -1,  0,  1,  1, -1,  1, -1, -1,  0,  1,  1, -1 } },
-      { 9, { -1,  1,  0,  1,  1,  0,  1,  1,  0,  1,  1,  0,  1,  1,  0,  1 } },
-};
-
-char cmd[4096], buf[4096];
-int main(void)
-{
-   unsigned rr;
-   int cnt, ix;
-#if LTM_DEMO_TEST_VS_MTEST
-   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n,
-      gcd_n, lcm_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n;
-   char* ret;
-#else
-   unsigned long s, t;
-   unsigned long long q, r;
-   mp_digit mp;
-   int i, n, err, should;
-#endif
-
-   if (mp_init_multi(&a, &b, &c, &d, &e, &f, NULL)!= MP_OKAY)
-     return EXIT_FAILURE;
-
-   atexit(_cleanup);
-
-#if defined(LTM_DEMO_REAL_RAND)
-   if (!fd_urandom) {
-      fd_urandom = fopen("/dev/urandom", "r");
-      if (!fd_urandom) {
-#if !defined(_WIN32)
-         fprintf(stderr, "\ncould not open /dev/urandom\n");
-#endif
-      }
-   }
-#endif
-   srand(LTM_DEMO_RAND_SEED);
-
-#ifdef MP_8BIT
-   printf("Digit size 8 Bit \n");
-#endif
-#ifdef MP_16BIT
-   printf("Digit size 16 Bit \n");
-#endif
-#ifdef MP_32BIT
-   printf("Digit size 32 Bit \n");
-#endif
-#ifdef MP_64BIT
-   printf("Digit size 64 Bit \n");
-#endif
-   printf("Size of mp_digit: %u\n", (unsigned int)sizeof(mp_digit));
-   printf("Size of mp_word: %u\n", (unsigned int)sizeof(mp_word));
-   printf("DIGIT_BIT: %d\n", DIGIT_BIT);
-   printf("MP_PREC: %d\n", MP_PREC);
-
-#if LTM_DEMO_TEST_VS_MTEST == 0
-   // trivial stuff
-   mp_set_int(&a, 5);
-   mp_neg(&a, &b);
-   if (mp_cmp(&a, &b) != MP_GT) {
-      return EXIT_FAILURE;
-   }
-   if (mp_cmp(&b, &a) != MP_LT) {
-      return EXIT_FAILURE;
-   }
-   mp_neg(&a, &a);
-   if (mp_cmp(&b, &a) != MP_EQ) {
-      return EXIT_FAILURE;
-   }
-   mp_abs(&a, &b);
-   if (mp_isneg(&b) != MP_NO) {
-      return EXIT_FAILURE;
-   }
-   mp_add_d(&a, 1, &b);
-   mp_add_d(&a, 6, &b);
-
-
-   mp_set_int(&a, 0);
-   mp_set_int(&b, 1);
-   if ((err = mp_jacobi(&a, &b, &i)) != MP_OKAY) {
-      printf("Failed executing mp_jacobi(0 | 1) %s.\n", mp_error_to_string(err));
-      return EXIT_FAILURE;
-   }
-   if (i != 1) {
-      printf("Failed trivial mp_jacobi(0 | 1) %d != 1\n", i);
-      return EXIT_FAILURE;
-   }
-   for (cnt = 0; cnt < (int)(sizeof(jacobi)/sizeof(jacobi[0])); ++cnt) {
-      mp_set_int(&b, jacobi[cnt].n);
-      /* only test positive values of a */
-      for (n = -5; n <= 10; ++n) {
-         mp_set_int(&a, abs(n));
-         should = MP_OKAY;
-         if (n < 0) {
-            mp_neg(&a, &a);
-            /* Until #44 is fixed the negative a's must fail */
-            should = MP_VAL;
-         }
-         if ((err = mp_jacobi(&a, &b, &i)) != should) {
-            printf("Failed executing mp_jacobi(%d | %lu) %s.\n", n, jacobi[cnt].n, mp_error_to_string(err));
-            return EXIT_FAILURE;
-         }
-         if (err == MP_OKAY && i != jacobi[cnt].c[n + 5]) {
-            printf("Failed trivial mp_jacobi(%d | %lu) %d != %d\n", n, jacobi[cnt].n, i, jacobi[cnt].c[n + 5]);
-            return EXIT_FAILURE;
-         }
-      }
-   }
-
-   // test mp_get_int
-   printf("\n\nTesting: mp_get_int");
-   for (i = 0; i < 1000; ++i) {
-      t = ((unsigned long) rand () * rand () + 1) & 0xFFFFFFFF;
-      mp_set_int (&a, t);
-      if (t != mp_get_int (&a)) {
-         printf ("\nmp_get_int() bad result!");
-         return EXIT_FAILURE;
-      }
-   }
-   mp_set_int(&a, 0);
-   if (mp_get_int(&a) != 0) {
-      printf("\nmp_get_int() bad result!");
-      return EXIT_FAILURE;
-   }
-   mp_set_int(&a, 0xffffffff);
-   if (mp_get_int(&a) != 0xffffffff) {
-      printf("\nmp_get_int() bad result!");
-      return EXIT_FAILURE;
-   }
-
-   printf("\n\nTesting: mp_get_long\n");
-   for (i = 0; i < (int)(sizeof(unsigned long)*CHAR_BIT) - 1; ++i) {
-      t = (1ULL << (i+1)) - 1;
-      if (!t)
-         t = -1;
-      printf(" t = 0x%lx i = %d\r", t, i);
-      do {
-         if (mp_set_long(&a, t) != MP_OKAY) {
-            printf("\nmp_set_long() error!");
-            return EXIT_FAILURE;
-         }
-         s = mp_get_long(&a);
-         if (s != t) {
-            printf("\nmp_get_long() bad result! 0x%lx != 0x%lx", s, t);
-            return EXIT_FAILURE;
-         }
-         t <<= 1;
-      } while(t);
-   }
-
-   printf("\n\nTesting: mp_get_long_long\n");
-   for (i = 0; i < (int)(sizeof(unsigned long long)*CHAR_BIT) - 1; ++i) {
-      r = (1ULL << (i+1)) - 1;
-      if (!r)
-         r = -1;
-      printf(" r = 0x%llx i = %d\r", r, i);
-      do {
-         if (mp_set_long_long(&a, r) != MP_OKAY) {
-            printf("\nmp_set_long_long() error!");
-            return EXIT_FAILURE;
-         }
-         q = mp_get_long_long(&a);
-         if (q != r) {
-            printf("\nmp_get_long_long() bad result! 0x%llx != 0x%llx", q, r);
-            return EXIT_FAILURE;
-         }
-         r <<= 1;
-      } while(r);
-   }
-
-   // test mp_sqrt
-   printf("\n\nTesting: mp_sqrt\n");
-   for (i = 0; i < 1000; ++i) {
-      printf ("%6d\r", i);
-      fflush (stdout);
-      n = (rand () & 15) + 1;
-      mp_rand (&a, n);
-      if (mp_sqrt (&a, &b) != MP_OKAY) {
-         printf ("\nmp_sqrt() error!");
-         return EXIT_FAILURE;
-      }
-      mp_n_root_ex (&a, 2, &c, 0);
-      mp_n_root_ex (&a, 2, &d, 1);
-      if (mp_cmp_mag (&c, &d) != MP_EQ) {
-         printf ("\nmp_n_root_ex() bad result!");
-         return EXIT_FAILURE;
-      }
-      if (mp_cmp_mag (&b, &c) != MP_EQ) {
-         printf ("mp_sqrt() bad result!\n");
-         return EXIT_FAILURE;
-      }
-   }
-
-   printf("\n\nTesting: mp_is_square\n");
-   for (i = 0; i < 1000; ++i) {
-      printf ("%6d\r", i);
-      fflush (stdout);
-
-      /* test mp_is_square false negatives */
-      n = (rand () & 7) + 1;
-      mp_rand (&a, n);
-      mp_sqr (&a, &a);
-      if (mp_is_square (&a, &n) != MP_OKAY) {
-         printf ("\nfn:mp_is_square() error!");
-         return EXIT_FAILURE;
-      }
-      if (n == 0) {
-         printf ("\nfn:mp_is_square() bad result!");
-         return EXIT_FAILURE;
-      }
-
-      /* test for false positives */
-      mp_add_d (&a, 1, &a);
-      if (mp_is_square (&a, &n) != MP_OKAY) {
-         printf ("\nfp:mp_is_square() error!");
-         return EXIT_FAILURE;
-      }
-      if (n == 1) {
-         printf ("\nfp:mp_is_square() bad result!");
-         return EXIT_FAILURE;
-      }
-
-   }
-   printf("\n\n");
-
-   // r^2 = n (mod p)
-   for (i = 0; i < (int)(sizeof(sqrtmod_prime)/sizeof(sqrtmod_prime[0])); ++i) {
-      mp_set_int(&a, sqrtmod_prime[i].p);
-      mp_set_int(&b, sqrtmod_prime[i].n);
-      if (mp_sqrtmod_prime(&b, &a, &c) != MP_OKAY) {
-         printf("Failed executing %d. mp_sqrtmod_prime\n", (i+1));
-         return EXIT_FAILURE;
-      }
-      if (mp_cmp_d(&c, sqrtmod_prime[i].r) != MP_EQ) {
-         printf("Failed %d. trivial mp_sqrtmod_prime\n", (i+1));
-         ndraw(&c, "r");
-         return EXIT_FAILURE;
-      }
-   }
-
-   /* test for size */
-   for (ix = 10; ix < 128; ix++) {
-      printf ("Testing (not safe-prime): %9d bits    \r", ix);
-      fflush (stdout);
-      err = mp_prime_random_ex (&a, 8, ix,
-                                (rand () & 1) ? 0 : LTM_PRIME_2MSB_ON, myrng,
-                                NULL);
-      if (err != MP_OKAY) {
-         printf ("failed with err code %d\n", err);
-         return EXIT_FAILURE;
-      }
-      if (mp_count_bits (&a) != ix) {
-         printf ("Prime is %d not %d bits!!!\n", mp_count_bits (&a), ix);
-         return EXIT_FAILURE;
-      }
-   }
-   printf("\n");
-
-   for (ix = 16; ix < 128; ix++) {
-      printf ("Testing (    safe-prime): %9d bits    \r", ix);
-      fflush (stdout);
-      err = mp_prime_random_ex (
-            &a, 8, ix, ((rand () & 1) ? 0 : LTM_PRIME_2MSB_ON) | LTM_PRIME_SAFE,
-            myrng, NULL);
-      if (err != MP_OKAY) {
-         printf ("failed with err code %d\n", err);
-         return EXIT_FAILURE;
-      }
-      if (mp_count_bits (&a) != ix) {
-         printf ("Prime is %d not %d bits!!!\n", mp_count_bits (&a), ix);
-         return EXIT_FAILURE;
-      }
-      /* let's see if it's really a safe prime */
-      mp_sub_d (&a, 1, &a);
-      mp_div_2 (&a, &a);
-      mp_prime_is_prime (&a, 8, &cnt);
-      if (cnt != MP_YES) {
-         printf ("sub is not prime!\n");
-         return EXIT_FAILURE;
-      }
-   }
-
-   printf("\n\n");
-
-   // test montgomery
-   printf("Testing: montgomery...\n");
-   for (i = 1; i <= 10; i++) {
-      if (i == 10)
-         i = 1000;
-      printf(" digit size: %2d\r", i);
-      fflush(stdout);
-      for (n = 0; n < 1000; n++) {
-         mp_rand(&a, i);
-         a.dp[0] |= 1;
-
-         // let's see if R is right
-         mp_montgomery_calc_normalization(&b, &a);
-         mp_montgomery_setup(&a, &mp);
-
-         // now test a random reduction
-         for (ix = 0; ix < 100; ix++) {
-             mp_rand(&c, 1 + abs(rand()) % (2*i));
-             mp_copy(&c, &d);
-             mp_copy(&c, &e);
-
-             mp_mod(&d, &a, &d);
-             mp_montgomery_reduce(&c, &a, mp);
-             mp_mulmod(&c, &b, &a, &c);
-
-             if (mp_cmp(&c, &d) != MP_EQ) {
-printf("d = e mod a, c = e MOD a\n");
-mp_todecimal(&a, buf); printf("a = %s\n", buf);
-mp_todecimal(&e, buf); printf("e = %s\n", buf);
-mp_todecimal(&d, buf); printf("d = %s\n", buf);
-mp_todecimal(&c, buf); printf("c = %s\n", buf);
-printf("compare no compare!\n"); return EXIT_FAILURE; }
-             /* only one big montgomery reduction */
-             if (i > 10)
-             {
-                n = 1000;
-                ix = 100;
-             }
-         }
-      }
-   }
-
-   printf("\n\n");
-
-   mp_read_radix(&a, "123456", 10);
-   mp_toradix_n(&a, buf, 10, 3);
-   printf("a == %s\n", buf);
-   mp_toradix_n(&a, buf, 10, 4);
-   printf("a == %s\n", buf);
-   mp_toradix_n(&a, buf, 10, 30);
-   printf("a == %s\n", buf);
-
-
-#if 0
-   for (;;) {
-      fgets(buf, sizeof(buf), stdin);
-      mp_read_radix(&a, buf, 10);
-      mp_prime_next_prime(&a, 5, 1);
-      mp_toradix(&a, buf, 10);
-      printf("%s, %lu\n", buf, a.dp[0] & 3);
-   }
-#endif
-
-   /* test mp_cnt_lsb */
-   printf("\n\nTesting: mp_cnt_lsb");
-   mp_set(&a, 1);
-   for (ix = 0; ix < 1024; ix++) {
-      if (mp_cnt_lsb (&a) != ix) {
-         printf ("Failed at %d, %d\n", ix, mp_cnt_lsb (&a));
-         return EXIT_FAILURE;
-      }
-      mp_mul_2 (&a, &a);
-   }
-
-/* test mp_reduce_2k */
-   printf("\n\nTesting: mp_reduce_2k\n");
-   for (cnt = 3; cnt <= 128; ++cnt) {
-      mp_digit tmp;
-
-      mp_2expt (&a, cnt);
-      mp_sub_d (&a, 2, &a); /* a = 2**cnt - 2 */
-
-      printf ("\r %4d bits", cnt);
-      printf ("(%d)", mp_reduce_is_2k (&a));
-      mp_reduce_2k_setup (&a, &tmp);
-      printf ("(%lu)", (unsigned long) tmp);
-      for (ix = 0; ix < 1000; ix++) {
-         if (!(ix & 127)) {
-            printf (".");
-            fflush (stdout);
-         }
-         mp_rand (&b, (cnt / DIGIT_BIT + 1) * 2);
-         mp_copy (&c, &b);
-         mp_mod (&c, &a, &c);
-         mp_reduce_2k (&b, &a, 2);
-         if (mp_cmp (&c, &b)) {
-            printf ("FAILED\n");
-            return EXIT_FAILURE;
-         }
-      }
-   }
-
-/* test mp_div_3  */
-   printf("\n\nTesting: mp_div_3...\n");
-   mp_set(&d, 3);
-   for (cnt = 0; cnt < 10000;) {
-      mp_digit r2;
-
-      if (!(++cnt & 127))
-      {
-        printf("%9d\r", cnt);
-        fflush(stdout);
-      }
-      mp_rand(&a, abs(rand()) % 128 + 1);
-      mp_div(&a, &d, &b, &e);
-      mp_div_3(&a, &c, &r2);
-
-      if (mp_cmp(&b, &c) || mp_cmp_d(&e, r2)) {
-	 printf("\nmp_div_3 => Failure\n");
-      }
-   }
-   printf("\nPassed div_3 testing");
-
-/* test the DR reduction */
-   printf("\n\nTesting: mp_dr_reduce...\n");
-   for (cnt = 2; cnt < 32; cnt++) {
-      printf ("\r%d digit modulus", cnt);
-      mp_grow (&a, cnt);
-      mp_zero (&a);
-      for (ix = 1; ix < cnt; ix++) {
-         a.dp[ix] = MP_MASK;
-      }
-      a.used = cnt;
-      a.dp[0] = 3;
-
-      mp_rand (&b, cnt - 1);
-      mp_copy (&b, &c);
-
-      rr = 0;
-      do {
-         if (!(rr & 127)) {
-            printf (".");
-            fflush (stdout);
-         }
-         mp_sqr (&b, &b);
-         mp_add_d (&b, 1, &b);
-         mp_copy (&b, &c);
-
-         mp_mod (&b, &a, &b);
-         mp_dr_setup(&a, &mp),
-         mp_dr_reduce (&c, &a, mp);
-
-         if (mp_cmp (&b, &c) != MP_EQ) {
-            printf ("Failed on trial %u\n", rr);
-            return EXIT_FAILURE;
-         }
-      } while (++rr < 500);
-      printf (" passed");
-      fflush (stdout);
-   }
-
-#if LTM_DEMO_TEST_REDUCE_2K_L
-/* test the mp_reduce_2k_l code */
-#if LTM_DEMO_TEST_REDUCE_2K_L == 1
-/* first load P with 2^1024 - 0x2A434 B9FDEC95 D8F9D550 FFFFFFFF FFFFFFFF */
-   mp_2expt(&a, 1024);
-   mp_read_radix(&b, "2A434B9FDEC95D8F9D550FFFFFFFFFFFFFFFF", 16);
-   mp_sub(&a, &b, &a);
-#elif LTM_DEMO_TEST_REDUCE_2K_L == 2
-/*  p = 2^2048 - 0x1 00000000 00000000 00000000 00000000 4945DDBF 8EA2A91D 5776399B B83E188F  */
-   mp_2expt(&a, 2048);
-   mp_read_radix(&b,
-		 "1000000000000000000000000000000004945DDBF8EA2A91D5776399BB83E188F",
-		 16);
-   mp_sub(&a, &b, &a);
-#else
-#error oops
-#endif
-
-   mp_todecimal(&a, buf);
-   printf("\n\np==%s\n", buf);
-/* now mp_reduce_is_2k_l() should return */
-   if (mp_reduce_is_2k_l(&a) != 1) {
-      printf("mp_reduce_is_2k_l() return 0, should be 1\n");
-      return EXIT_FAILURE;
-   }
-   mp_reduce_2k_setup_l(&a, &d);
-   /* now do a million square+1 to see if it varies */
-   mp_rand(&b, 64);
-   mp_mod(&b, &a, &b);
-   mp_copy(&b, &c);
-   printf("Testing: mp_reduce_2k_l...");
-   fflush(stdout);
-   for (cnt = 0; cnt < (int)(1UL << 20); cnt++) {
-      mp_sqr(&b, &b);
-      mp_add_d(&b, 1, &b);
-      mp_reduce_2k_l(&b, &a, &d);
-      mp_sqr(&c, &c);
-      mp_add_d(&c, 1, &c);
-      mp_mod(&c, &a, &c);
-      if (mp_cmp(&b, &c) != MP_EQ) {
-	 printf("mp_reduce_2k_l() failed at step %d\n", cnt);
-	 mp_tohex(&b, buf);
-	 printf("b == %s\n", buf);
-	 mp_tohex(&c, buf);
-	 printf("c == %s\n", buf);
-	 return EXIT_FAILURE;
-      }
-   }
-   printf("...Passed\n");
-#endif /* LTM_DEMO_TEST_REDUCE_2K_L */
-
-#else
-
-   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
-      sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n =
-      sub_d_n = 0;
-
-   /* force KARA and TOOM to enable despite cutoffs */
-   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 8;
-   TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 16;
-
-   for (;;) {
-      /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
-      switch (abs(rand()) % 7) {
-      case 0:
-	 mp_clear(&a);
-	 mp_init(&a);
-	 break;
-      case 1:
-	 mp_clear(&b);
-	 mp_init(&b);
-	 break;
-      case 2:
-	 mp_clear(&c);
-	 mp_init(&c);
-	 break;
-      case 3:
-	 mp_clear(&d);
-	 mp_init(&d);
-	 break;
-      case 4:
-	 mp_clear(&e);
-	 mp_init(&e);
-	 break;
-      case 5:
-	 mp_clear(&f);
-	 mp_init(&f);
-	 break;
-      case 6:
-	 break;			/* don't clear any */
-      }
-
-
-      printf
-	 ("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ",
-	  add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n,
-	  expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n);
-      ret=fgets(cmd, 4095, stdin); if(!ret){_panic(__LINE__);}
-      cmd[strlen(cmd) - 1] = 0;
-      printf("%-6s ]\r", cmd);
-      fflush(stdout);
-      if (!strcmp(cmd, "mul2d")) {
-	 ++mul2d_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 sscanf(buf, "%d", &rr);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-
-	 mp_mul_2d(&a, rr, &a);
-	 a.sign = b.sign;
-	 if (mp_cmp(&a, &b) != MP_EQ) {
-	    printf("mul2d failed, rr == %d\n", rr);
-	    draw(&a);
-	    draw(&b);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "div2d")) {
-	 ++div2d_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 sscanf(buf, "%d", &rr);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-
-	 mp_div_2d(&a, rr, &a, &e);
-	 a.sign = b.sign;
-	 if (a.used == b.used && a.used == 0) {
-	    a.sign = b.sign = MP_ZPOS;
-	 }
-	 if (mp_cmp(&a, &b) != MP_EQ) {
-	    printf("div2d failed, rr == %d\n", rr);
-	    draw(&a);
-	    draw(&b);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "add")) {
-	 ++add_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&c, buf, 64);
-	 mp_copy(&a, &d);
-	 mp_add(&d, &b, &d);
-	 if (mp_cmp(&c, &d) != MP_EQ) {
-	    printf("add %lu failure!\n", add_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    draw(&d);
-	    return EXIT_FAILURE;
-	 }
-
-	 /* test the sign/unsigned storage functions */
-
-	 rr = mp_signed_bin_size(&c);
-	 mp_to_signed_bin(&c, (unsigned char *) cmd);
-	 memset(cmd + rr, rand() & 255, sizeof(cmd) - rr);
-	 mp_read_signed_bin(&d, (unsigned char *) cmd, rr);
-	 if (mp_cmp(&c, &d) != MP_EQ) {
-	    printf("mp_signed_bin failure!\n");
-	    draw(&c);
-	    draw(&d);
-	    return EXIT_FAILURE;
-	 }
-
-
-	 rr = mp_unsigned_bin_size(&c);
-	 mp_to_unsigned_bin(&c, (unsigned char *) cmd);
-	 memset(cmd + rr, rand() & 255, sizeof(cmd) - rr);
-	 mp_read_unsigned_bin(&d, (unsigned char *) cmd, rr);
-	 if (mp_cmp_mag(&c, &d) != MP_EQ) {
-	    printf("mp_unsigned_bin failure!\n");
-	    draw(&c);
-	    draw(&d);
-	    return EXIT_FAILURE;
-	 }
-
-      } else if (!strcmp(cmd, "sub")) {
-	 ++sub_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&c, buf, 64);
-	 mp_copy(&a, &d);
-	 mp_sub(&d, &b, &d);
-	 if (mp_cmp(&c, &d) != MP_EQ) {
-	    printf("sub %lu failure!\n", sub_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    draw(&d);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "mul")) {
-	 ++mul_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&c, buf, 64);
-	 mp_copy(&a, &d);
-	 mp_mul(&d, &b, &d);
-	 if (mp_cmp(&c, &d) != MP_EQ) {
-	    printf("mul %lu failure!\n", mul_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    draw(&d);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "div")) {
-	 ++div_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&c, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&d, buf, 64);
-
-	 mp_div(&a, &b, &e, &f);
-	 if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) {
-	    printf("div %lu %d, %d, failure!\n", div_n, mp_cmp(&c, &e),
-		   mp_cmp(&d, &f));
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    draw(&d);
-	    draw(&e);
-	    draw(&f);
-	    return EXIT_FAILURE;
-	 }
-
-      } else if (!strcmp(cmd, "sqr")) {
-	 ++sqr_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 mp_copy(&a, &c);
-	 mp_sqr(&c, &c);
-	 if (mp_cmp(&b, &c) != MP_EQ) {
-	    printf("sqr %lu failure!\n", sqr_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "gcd")) {
-	 ++gcd_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&c, buf, 64);
-	 mp_copy(&a, &d);
-	 mp_gcd(&d, &b, &d);
-	 d.sign = c.sign;
-	 if (mp_cmp(&c, &d) != MP_EQ) {
-	    printf("gcd %lu failure!\n", gcd_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    draw(&d);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "lcm")) {
-	 ++lcm_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&c, buf, 64);
-	 mp_copy(&a, &d);
-	 mp_lcm(&d, &b, &d);
-	 d.sign = c.sign;
-	 if (mp_cmp(&c, &d) != MP_EQ) {
-	    printf("lcm %lu failure!\n", lcm_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    draw(&d);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "expt")) {
-	 ++expt_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&c, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&d, buf, 64);
-	 mp_copy(&a, &e);
-	 mp_exptmod(&e, &b, &c, &e);
-	 if (mp_cmp(&d, &e) != MP_EQ) {
-	    printf("expt %lu failure!\n", expt_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    draw(&d);
-	    draw(&e);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "invmod")) {
-	 ++inv_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&c, buf, 64);
-	 mp_invmod(&a, &b, &d);
-	 mp_mulmod(&d, &a, &b, &e);
-	 if (mp_cmp_d(&e, 1) != MP_EQ) {
-	    printf("inv [wrong value from MPI?!] failure\n");
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    draw(&d);
-	    draw(&e);
-	    mp_gcd(&a, &b, &e);
-	    draw(&e);
-	    return EXIT_FAILURE;
-	 }
-
-      } else if (!strcmp(cmd, "div2")) {
-	 ++div2_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 mp_div_2(&a, &c);
-	 if (mp_cmp(&c, &b) != MP_EQ) {
-	    printf("div_2 %lu failure\n", div2_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "mul2")) {
-	 ++mul2_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 mp_mul_2(&a, &c);
-	 if (mp_cmp(&c, &b) != MP_EQ) {
-	    printf("mul_2 %lu failure\n", mul2_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "add_d")) {
-	 ++add_d_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 sscanf(buf, "%d", &ix);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 mp_add_d(&a, ix, &c);
-	 if (mp_cmp(&b, &c) != MP_EQ) {
-	    printf("add_d %lu failure\n", add_d_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    printf("d == %d\n", ix);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "sub_d")) {
-	 ++sub_d_n;
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&a, buf, 64);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 sscanf(buf, "%d", &ix);
-	 ret=fgets(buf, 4095, stdin); if(!ret){_panic(__LINE__);}
-	 mp_read_radix(&b, buf, 64);
-	 mp_sub_d(&a, ix, &c);
-	 if (mp_cmp(&b, &c) != MP_EQ) {
-	    printf("sub_d %lu failure\n", sub_d_n);
-	    draw(&a);
-	    draw(&b);
-	    draw(&c);
-	    printf("d == %d\n", ix);
-	    return EXIT_FAILURE;
-	 }
-      } else if (!strcmp(cmd, "exit")) {
-         printf("\nokay, exiting now\n");
-         break;
-      }
-   }
-#endif
-   return 0;
-}
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/demo/timing.c b/libtommath/demo/timing.c
deleted file mode 100644
index 1bd8489..0000000
--- a/libtommath/demo/timing.c
+++ /dev/null
@@ -1,339 +0,0 @@
-#include <tommath.h>
-#include <time.h>
-#include <unistd.h>
-
-ulong64 _tt;
-
-#ifdef IOWNANATHLON
-#include <unistd.h>
-#define SLEEP sleep(4)
-#else
-#define SLEEP
-#endif
-
-#ifdef LTM_TIMING_REAL_RAND
-#define LTM_TIMING_RAND_SEED  time(NULL)
-#else
-#define LTM_TIMING_RAND_SEED  23
-#endif
-
-
-void ndraw(mp_int * a, char *name)
-{
-   char buf[4096];
-
-   printf("%s: ", name);
-   mp_toradix(a, buf, 64);
-   printf("%s\n", buf);
-}
-
-static void draw(mp_int * a)
-{
-   ndraw(a, "");
-}
-
-
-unsigned long lfsr = 0xAAAAAAAAUL;
-
-int lbit(void)
-{
-   if (lfsr & 0x80000000UL) {
-      lfsr = ((lfsr << 1) ^ 0x8000001BUL) & 0xFFFFFFFFUL;
-      return 1;
-   } else {
-      lfsr <<= 1;
-      return 0;
-   }
-}
-
-/* RDTSC from Scott Duplichan */
-static ulong64 TIMFUNC(void)
-{
-#if defined __GNUC__
-#if defined(__i386__) || defined(__x86_64__)
-  /* version from http://www.mcs.anl.gov/~kazutomo/rdtsc.html
-   * the old code always got a warning issued by gcc, clang did not complain...
-   */
-  unsigned hi, lo;
-  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
-  return ((ulong64)lo)|( ((ulong64)hi)<<32);
-#else /* gcc-IA64 version */
-   unsigned long result;
-   __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory");
-
-   while (__builtin_expect((int) result == -1, 0))
-      __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory");
-
-   return result;
-#endif
-
-   // Microsoft and Intel Windows compilers
-#elif defined _M_IX86
-   __asm rdtsc
-#elif defined _M_AMD64
-   return __rdtsc();
-#elif defined _M_IA64
-#if defined __INTEL_COMPILER
-#include <ia64intrin.h>
-#endif
-   return __getReg(3116);
-#else
-#error need rdtsc function for this build
-#endif
-}
-
-#define DO(x) x; x;
-//#define DO4(x) DO2(x); DO2(x);
-//#define DO8(x) DO4(x); DO4(x);
-//#define DO(x)  DO8(x); DO8(x);
-
-#ifdef TIMING_NO_LOGS
-#define FOPEN(a, b)     NULL
-#define FPRINTF(a,b,c,d)
-#define FFLUSH(a)
-#define FCLOSE(a)       (void)(a)
-#else
-#define FOPEN(a,b)       fopen(a,b)
-#define FPRINTF(a,b,c,d) fprintf(a,b,c,d)
-#define FFLUSH(a)        fflush(a)
-#define FCLOSE(a)        fclose(a)
-#endif
-
-int main(void)
-{
-   ulong64 tt, gg, CLK_PER_SEC;
-   FILE *log, *logb, *logc, *logd;
-   mp_int a, b, c, d, e, f;
-   int n, cnt, ix, old_kara_m, old_kara_s, old_toom_m, old_toom_s;
-   unsigned rr;
-
-   mp_init(&a);
-   mp_init(&b);
-   mp_init(&c);
-   mp_init(&d);
-   mp_init(&e);
-   mp_init(&f);
-
-   srand(LTM_TIMING_RAND_SEED);
-
-
-   CLK_PER_SEC = TIMFUNC();
-   sleep(1);
-   CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;
-
-   printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
-   log = FOPEN("logs/add.log", "w");
-   for (cnt = 8; cnt <= 128; cnt += 8) {
-      SLEEP;
-      mp_rand(&a, cnt);
-      mp_rand(&b, cnt);
-      rr = 0;
-      tt = -1;
-      do {
-	 gg = TIMFUNC();
-	 DO(mp_add(&a, &b, &c));
-	 gg = (TIMFUNC() - gg) >> 1;
-	 if (tt > gg)
-	    tt = gg;
-      } while (++rr < 100000);
-      printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n",
-	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
-      FPRINTF(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
-      FFLUSH(log);
-   }
-   FCLOSE(log);
-
-   log = FOPEN("logs/sub.log", "w");
-   for (cnt = 8; cnt <= 128; cnt += 8) {
-      SLEEP;
-      mp_rand(&a, cnt);
-      mp_rand(&b, cnt);
-      rr = 0;
-      tt = -1;
-      do {
-	 gg = TIMFUNC();
-	 DO(mp_sub(&a, &b, &c));
-	 gg = (TIMFUNC() - gg) >> 1;
-	 if (tt > gg)
-	    tt = gg;
-      } while (++rr < 100000);
-
-      printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n",
-	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
-      FPRINTF(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
-      FFLUSH(log);
-   }
-   FCLOSE(log);
-
-   /* do mult/square twice, first without karatsuba and second with */
-   old_kara_m = KARATSUBA_MUL_CUTOFF;
-   old_kara_s = KARATSUBA_SQR_CUTOFF;
-   /* currently toom-cook cut-off is too high to kick in, so we just use the karatsuba values */
-   old_toom_m = old_kara_m;
-   old_toom_s = old_kara_m;
-   for (ix = 0; ix < 3; ix++) {
-      printf("With%s Karatsuba, With%s Toom\n", (ix == 0) ? "out" : "", (ix == 1) ? "out" : "");
-
-      KARATSUBA_MUL_CUTOFF = (ix == 1) ? old_kara_m : 9999;
-      KARATSUBA_SQR_CUTOFF = (ix == 1) ? old_kara_s : 9999;
-      TOOM_MUL_CUTOFF = (ix == 2) ? old_toom_m : 9999;
-      TOOM_SQR_CUTOFF = (ix == 2) ? old_toom_s : 9999;
-
-      log = FOPEN((ix == 0) ? "logs/mult.log" : (ix == 1) ? "logs/mult_kara.log" : "logs/mult_toom.log", "w");
-      for (cnt = 4; cnt <= 10240 / DIGIT_BIT; cnt += 2) {
-	 SLEEP;
-	 mp_rand(&a, cnt);
-	 mp_rand(&b, cnt);
-	 rr = 0;
-	 tt = -1;
-	 do {
-	    gg = TIMFUNC();
-	    DO(mp_mul(&a, &b, &c));
-	    gg = (TIMFUNC() - gg) >> 1;
-	    if (tt > gg)
-	       tt = gg;
-	 } while (++rr < 100);
-	 printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n",
-		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
-	 FPRINTF(log, "%d %9llu\n", mp_count_bits(&a), tt);
-	 FFLUSH(log);
-      }
-      FCLOSE(log);
-
-      log = FOPEN((ix == 0) ? "logs/sqr.log" : (ix == 1) ? "logs/sqr_kara.log" : "logs/sqr_toom.log", "w");
-      for (cnt = 4; cnt <= 10240 / DIGIT_BIT; cnt += 2) {
-	 SLEEP;
-	 mp_rand(&a, cnt);
-	 rr = 0;
-	 tt = -1;
-	 do {
-	    gg = TIMFUNC();
-	    DO(mp_sqr(&a, &b));
-	    gg = (TIMFUNC() - gg) >> 1;
-	    if (tt > gg)
-	       tt = gg;
-	 } while (++rr < 100);
-	 printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n",
-		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
-	 FPRINTF(log, "%d %9llu\n", mp_count_bits(&a), tt);
-	 FFLUSH(log);
-      }
-      FCLOSE(log);
-
-   }
-
-   {
-      char *primes[] = {
-	 /* 2K large moduli */
-	 "179769313486231590772930519078902473361797697894230657273430081157732675805500963132708477322407536021120113879871393357658789768814416622492847430639474124377767893424865485276302219601246094119453082952085005768838150682342462881473913110540827237163350510684586239334100047359817950870678242457666208137217",
-	 "32317006071311007300714876688669951960444102669715484032130345427524655138867890893197201411522913463688717960921898019494119559150490921095088152386448283120630877367300996091750197750389652106796057638384067568276792218642619756161838094338476170470581645852036305042887575891541065808607552399123930385521914333389668342420684974786564569494856176035326322058077805659331026192708460314150258592864177116725943603718461857357598351152301645904403697613233287231227125684710820209725157101726931323469678542580656697935045997268352998638099733077152121140120031150424541696791951097529546801429027668869927491725169",
-	 "1044388881413152506691752710716624382579964249047383780384233483283953907971557456848826811934997558340890106714439262837987573438185793607263236087851365277945956976543709998340361590134383718314428070011855946226376318839397712745672334684344586617496807908705803704071284048740118609114467977783598029006686938976881787785946905630190260940599579453432823469303026696443059025015972399867714215541693835559885291486318237914434496734087811872639496475100189041349008417061675093668333850551032972088269550769983616369411933015213796825837188091833656751221318492846368125550225998300412344784862595674492194617023806505913245610825731835380087608622102834270197698202313169017678006675195485079921636419370285375124784014907159135459982790513399611551794271106831134090584272884279791554849782954323534517065223269061394905987693002122963395687782878948440616007412945674919823050571642377154816321380631045902916136926708342856440730447899971901781465763473223850267253059899795996090799469201774624817718449867455659250178329070473119433165550807568221846571746373296884912819520317457002440926616910874148385078411929804522981857338977648103126085902995208257421855249796721729039744118165938433694823325696642096892124547425283",
-	 /* 2K moduli mersenne primes */
-	 "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
-	 "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
-	 "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
-	 "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
-	 "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
-	 "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
-
-	 /* DR moduli */
-	 "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
-	 "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
-	 "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
-	 "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
-	 "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
-	 "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
-	 "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
-
-	 /* generic unrestricted moduli */
-	 "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
-	 "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
-	 "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
-	 "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
-	 "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
-	 "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
-	 "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
-	 NULL
-      };
-      log = FOPEN("logs/expt.log", "w");
-      logb = FOPEN("logs/expt_dr.log", "w");
-      logc = FOPEN("logs/expt_2k.log", "w");
-      logd = FOPEN("logs/expt_2kl.log", "w");
-      for (n = 0; primes[n]; n++) {
-	 SLEEP;
-	 mp_read_radix(&a, primes[n], 10);
-	 mp_zero(&b);
-	 for (rr = 0; rr < (unsigned) mp_count_bits(&a); rr++) {
-	    mp_mul_2(&b, &b);
-	    b.dp[0] |= lbit();
-	    b.used += 1;
-	 }
-	 mp_sub_d(&a, 1, &c);
-	 mp_mod(&b, &c, &b);
-	 mp_set(&c, 3);
-	 rr = 0;
-	 tt = -1;
-	 do {
-	    gg = TIMFUNC();
-	    DO(mp_exptmod(&c, &b, &a, &d));
-	    gg = (TIMFUNC() - gg) >> 1;
-	    if (tt > gg)
-	       tt = gg;
-	 } while (++rr < 10);
-	 mp_sub_d(&a, 1, &e);
-	 mp_sub(&e, &b, &b);
-	 mp_exptmod(&c, &b, &a, &e);	/* c^(p-1-b) mod a */
-	 mp_mulmod(&e, &d, &a, &d);	/* c^b * c^(p-1-b) == c^p-1 == 1 */
-	 if (mp_cmp_d(&d, 1)) {
-	    printf("Different (%d)!!!\n", mp_count_bits(&a));
-	    draw(&d);
-	    exit(0);
-	 }
-	 printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n",
-		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
-	 FPRINTF(n < 4 ? logd : (n < 9) ? logc : (n < 16) ? logb : log,
-		 "%d %9llu\n", mp_count_bits(&a), tt);
-      }
-   }
-   FCLOSE(log);
-   FCLOSE(logb);
-   FCLOSE(logc);
-   FCLOSE(logd);
-
-   log = FOPEN("logs/invmod.log", "w");
-   for (cnt = 4; cnt <= 32; cnt += 4) {
-      SLEEP;
-      mp_rand(&a, cnt);
-      mp_rand(&b, cnt);
-
-      do {
-	 mp_add_d(&b, 1, &b);
-	 mp_gcd(&a, &b, &c);
-      } while (mp_cmp_d(&c, 1) != MP_EQ);
-
-      rr = 0;
-      tt = -1;
-      do {
-	 gg = TIMFUNC();
-	 DO(mp_invmod(&b, &a, &c));
-	 gg = (TIMFUNC() - gg) >> 1;
-	 if (tt > gg)
-	    tt = gg;
-      } while (++rr < 1000);
-      mp_mulmod(&b, &c, &a, &d);
-      if (mp_cmp_d(&d, 1) != MP_EQ) {
-	 printf("Failed to invert\n");
-	 return 0;
-      }
-      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n",
-	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
-      FPRINTF(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
-   }
-   FCLOSE(log);
-
-   return 0;
-}
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/dep.pl b/libtommath/dep.pl
deleted file mode 100644
index 0a5d19a..0000000
--- a/libtommath/dep.pl
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/perl 
-#
-# Walk through source, add labels and make classes
-#
-#use strict;
-
-my %deplist;
-
-#open class file and write preamble 
-open(CLASS, ">tommath_class.h") or die "Couldn't open tommath_class.h for writing\n";
-print CLASS "#if !(defined(LTM1) && defined(LTM2) && defined(LTM3))\n#if defined(LTM2)\n#define LTM3\n#endif\n#if defined(LTM1)\n#define LTM2\n#endif\n#define LTM1\n\n#if defined(LTM_ALL)\n";
-
-foreach my $filename (glob "bn*.c") {
-   my $define = $filename;
-
-print "Processing $filename\n";
-
-   # convert filename to upper case so we can use it as a define 
-   $define =~ tr/[a-z]/[A-Z]/;
-   $define =~ tr/\./_/;
-   print CLASS "#define $define\n";
-
-   # now copy text and apply #ifdef as required 
-   my $apply = 0;
-   open(SRC, "<$filename");
-   open(OUT, ">tmp");
-
-   # first line will be the #ifdef
-   my $line = <SRC>;
-   if ($line =~ /include/) {
-      print OUT $line;
-   } else {
-      print OUT "#include <tommath.h>\n#ifdef $define\n$line";
-      $apply = 1;
-   }
-   while (<SRC>) {
-      if (!($_ =~ /tommath\.h/)) {
-         print OUT $_;
-      }
-   }
-   if ($apply == 1) {
-      print OUT "#endif\n";
-   }
-   close SRC;
-   close OUT;
-
-   unlink($filename);
-   rename("tmp", $filename);
-}
-print CLASS "#endif\n\n";
-
-# now do classes 
-
-foreach my $filename (glob "bn*.c") {
-   open(SRC, "<$filename") or die "Can't open source file!\n"; 
-
-   # convert filename to upper case so we can use it as a define 
-   $filename =~ tr/[a-z]/[A-Z]/;
-   $filename =~ tr/\./_/;
-
-   print CLASS "#if defined($filename)\n";
-   my $list = $filename;
-
-   # scan for mp_* and make classes
-   while (<SRC>) {
-      my $line = $_;
-      while ($line =~ m/(fast_)*(s_)*mp\_[a-z_0-9]*/) {
-          $line = $';
-          # now $& is the match, we want to skip over LTM keywords like
-          # mp_int, mp_word, mp_digit
-          if (!($& eq "mp_digit") && !($& eq "mp_word") && !($& eq "mp_int") && !($& eq "mp_min_u32")) {
-             my $a = $&;
-             $a =~ tr/[a-z]/[A-Z]/;
-             $a = "BN_" . $a . "_C";
-             if (!($list =~ /$a/)) {
-                print CLASS "   #define $a\n";
-             }
-             $list = $list . "," . $a;
-          }
-      }
-   }
-   @deplist{$filename} = $list;
-
-   print CLASS "#endif\n\n";
-   close SRC;
-}
-
-print CLASS "#ifdef LTM3\n#define LTM_LAST\n#endif\n#include <tommath_superclass.h>\n#include <tommath_class.h>\n#else\n#define LTM_LAST\n#endif\n";
-close CLASS;
-
-#now let's make a cool call graph... 
-
-open(OUT,">callgraph.txt");
-$indent = 0;
-foreach (keys %deplist) {
-   $list = "";
-   draw_func(@deplist{$_});
-   print OUT "\n\n";
-}
-close(OUT);
-
-sub draw_func()
-{
-   my @funcs = split(",", $_[0]);
-   if ($list =~ /@funcs[0]/) {
-      return;
-   } else {
-      $list = $list . @funcs[0];
-   }
-   if ($indent == 0) { }
-   elsif ($indent >= 1) { print OUT "|   " x ($indent - 1) . "+--->"; }
-   print OUT @funcs[0] . "\n";   
-   shift @funcs;
-      my $temp = $list;
-   foreach my $i (@funcs) {
-      ++$indent;
-      draw_func(@deplist{$i});
-      --$indent;
-   }
-      $list = $temp;
-}
-
-
diff --git a/libtommath/etc/2kprime.1 b/libtommath/etc/2kprime.1
deleted file mode 100644
index c41ded1..0000000
--- a/libtommath/etc/2kprime.1
+++ /dev/null
@@ -1,2 +0,0 @@
-256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
-512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
diff --git a/libtommath/etc/2kprime.c b/libtommath/etc/2kprime.c
deleted file mode 100644
index 9450283..0000000
--- a/libtommath/etc/2kprime.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Makes safe primes of a 2k nature */
-#include <tommath.h>
-#include <time.h>
-
-int sizes[] = {256, 512, 768, 1024, 1536, 2048, 3072, 4096};
-
-int main(void)
-{
-   char buf[2000];
-   int x, y;
-   mp_int q, p;
-   FILE *out;
-   clock_t t1;
-   mp_digit z;
-   
-   mp_init_multi(&q, &p, NULL);
-   
-   out = fopen("2kprime.1", "w");
-   for (x = 0; x < (int)(sizeof(sizes) / sizeof(sizes[0])); x++) {
-   top:
-       mp_2expt(&q, sizes[x]);
-       mp_add_d(&q, 3, &q);
-       z = -3;
-       
-       t1 = clock();
-       for(;;) {
-         mp_sub_d(&q, 4, &q);
-         z += 4;
-
-         if (z > MP_MASK) {
-            printf("No primes of size %d found\n", sizes[x]);
-            break;
-         }
-         
-         if (clock() - t1 > CLOCKS_PER_SEC) { 
-            printf("."); fflush(stdout);
-//            sleep((clock() - t1 + CLOCKS_PER_SEC/2)/CLOCKS_PER_SEC);
-            t1 = clock();
-         }
-         
-         /* quick test on q */
-         mp_prime_is_prime(&q, 1, &y);
-         if (y == 0) {
-            continue;
-         }
-
-         /* find (q-1)/2 */
-         mp_sub_d(&q, 1, &p);
-         mp_div_2(&p, &p);
-         mp_prime_is_prime(&p, 3, &y);
-         if (y == 0) {
-            continue;
-         }
-
-         /* test on q */
-         mp_prime_is_prime(&q, 3, &y);
-         if (y == 0) {
-            continue;
-         }
-
-         break;
-       }
-       
-       if (y == 0) {
-          ++sizes[x];
-          goto top;
-       }
-       
-       mp_toradix(&q, buf, 10);
-       printf("\n\n%d-bits (k = %lu) = %s\n", sizes[x], z, buf);
-       fprintf(out, "%d-bits (k = %lu) = %s\n", sizes[x], z, buf); fflush(out);
-   }
-   
-   return 0;
-}   
-       
-         
-            
-            
-          
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/etc/drprime.c b/libtommath/etc/drprime.c
deleted file mode 100644
index c7d253f..0000000
--- a/libtommath/etc/drprime.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Makes safe primes of a DR nature */
-#include <tommath.h>
-
-int sizes[] = { 1+256/DIGIT_BIT, 1+512/DIGIT_BIT, 1+768/DIGIT_BIT, 1+1024/DIGIT_BIT, 1+2048/DIGIT_BIT, 1+4096/DIGIT_BIT };
-int main(void)
-{
-   int res, x, y;
-   char buf[4096];
-   FILE *out;
-   mp_int a, b;
-   
-   mp_init(&a);
-   mp_init(&b);
-   
-   out = fopen("drprimes.txt", "w");
-   for (x = 0; x < (int)(sizeof(sizes)/sizeof(sizes[0])); x++) {
-   top:
-       printf("Seeking a %d-bit safe prime\n", sizes[x] * DIGIT_BIT);
-       mp_grow(&a, sizes[x]);
-       mp_zero(&a);
-       for (y = 1; y < sizes[x]; y++) {
-           a.dp[y] = MP_MASK;
-       }
-       
-       /* make a DR modulus */
-       a.dp[0] = -1;
-       a.used = sizes[x];
-       
-       /* now loop */
-       res = 0;
-       for (;;) { 
-          a.dp[0] += 4;
-          if (a.dp[0] >= MP_MASK) break;
-          mp_prime_is_prime(&a, 1, &res);
-          if (res == 0) continue;
-          printf("."); fflush(stdout);
-          mp_sub_d(&a, 1, &b);
-          mp_div_2(&b, &b);
-          mp_prime_is_prime(&b, 3, &res);  
-          if (res == 0) continue;
-          mp_prime_is_prime(&a, 3, &res);
-          if (res == 1) break;
-	}
-        
-        if (res != 1) {
-           printf("Error not DR modulus\n"); sizes[x] += 1; goto top;
-        } else {
-           mp_toradix(&a, buf, 10);
-           printf("\n\np == %s\n\n", buf);
-           fprintf(out, "%d-bit prime:\np == %s\n\n", mp_count_bits(&a), buf); fflush(out);
-        }           
-   }
-   fclose(out);
-   
-   mp_clear(&a);
-   mp_clear(&b);
-   
-   return 0;
-}
-
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/etc/drprimes.28 b/libtommath/etc/drprimes.28
deleted file mode 100644
index 9d438ad..0000000
--- a/libtommath/etc/drprimes.28
+++ /dev/null
@@ -1,25 +0,0 @@
-DR safe primes for 28-bit digits.
-
-224-bit prime:
-p == 26959946667150639794667015087019630673637144422540572481103341844143
-
-532-bit prime:
-p == 14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368691747
-
-784-bit prime:
-p == 101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039
-
-1036-bit prime:
-p == 736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821798437127
-
-1540-bit prime:
-p == 38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783
-
-2072-bit prime:
-p == 542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147
-
-3080-bit prime:
-p == 1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503
-
-4116-bit prime:
-p == 1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679
diff --git a/libtommath/etc/drprimes.txt b/libtommath/etc/drprimes.txt
deleted file mode 100644
index 7c97f67..0000000
--- a/libtommath/etc/drprimes.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-300-bit prime:
-p == 2037035976334486086268445688409378161051468393665936250636140449354381298610415201576637819
-
-540-bit prime:
-p == 3599131035634557106248430806148785487095757694641533306480604458089470064537190296255232548883112685719936728506816716098566612844395439751206810991770626477344739
-
-780-bit prime:
-p == 6359114106063703798370219984742410466332205126109989319225557147754704702203399726411277962562135973685197744935448875852478791860694279747355800678568677946181447581781401213133886609947027230004277244697462656003655947791725966271167
-
diff --git a/libtommath/etc/makefile b/libtommath/etc/makefile
deleted file mode 100644
index 99154d8..0000000
--- a/libtommath/etc/makefile
+++ /dev/null
@@ -1,50 +0,0 @@
-CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../
-
-# default lib name (requires install with root)
-# LIBNAME=-ltommath
-
-# libname when you can't install the lib with install
-LIBNAME=../libtommath.a
-
-#provable primes
-pprime: pprime.o
-	$(CC) pprime.o $(LIBNAME) -o pprime
-
-# portable [well requires clock()] tuning app
-tune: tune.o
-	$(CC) tune.o $(LIBNAME) -o tune
-	
-# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
-tune86: tune.c
-	nasm -f coff timer.asm
-	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
-	
-# for cygwin
-tune86c: tune.c
-	nasm -f gnuwin32 timer.asm
-	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
-
-#make tune86 for linux or any ELF format
-tune86l: tune.c
-	nasm -f elf -DUSE_ELF timer.asm
-	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
-        
-# spits out mersenne primes
-mersenne: mersenne.o
-	$(CC) mersenne.o $(LIBNAME) -o mersenne
-
-# fines DR safe primes for the given config
-drprime: drprime.o
-	$(CC) drprime.o $(LIBNAME) -o drprime
-	
-# fines 2k safe primes for the given config
-2kprime: 2kprime.o
-	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
-
-mont: mont.o
-	$(CC) mont.o $(LIBNAME) -o mont
-
-        
-clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat \
-         *.da *.dyn *.dpi *~
diff --git a/libtommath/etc/makefile.icc b/libtommath/etc/makefile.icc
deleted file mode 100644
index 8a1ffff..0000000
--- a/libtommath/etc/makefile.icc
+++ /dev/null
@@ -1,67 +0,0 @@
-CC = icc
-
-CFLAGS += -I../
-
-# optimize for SPEED
-#
-# -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
-# -ax?   specifies make code specifically for ? but compatible with IA-32
-# -x?    specifies compile solely for ? [not specifically IA-32 compatible]
-#
-# where ? is 
-#   K - PIII
-#   W - first P4 [Williamette]
-#   N - P4 Northwood
-#   P - P4 Prescott
-#   B - Blend of P4 and PM [mobile]
-#
-# Default to just generic max opts
-CFLAGS += -O3 -xP -ip
-
-# default lib name (requires install with root)
-# LIBNAME=-ltommath
-
-# libname when you can't install the lib with install
-LIBNAME=../libtommath.a
-
-#provable primes
-pprime: pprime.o
-	$(CC) pprime.o $(LIBNAME) -o pprime
-
-# portable [well requires clock()] tuning app
-tune: tune.o
-	$(CC) tune.o $(LIBNAME) -o tune
-	
-# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
-tune86: tune.c
-	nasm -f coff timer.asm
-	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
-	
-# for cygwin
-tune86c: tune.c
-	nasm -f gnuwin32 timer.asm
-	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
-
-#make tune86 for linux or any ELF format
-tune86l: tune.c
-	nasm -f elf -DUSE_ELF timer.asm
-	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
-        
-# spits out mersenne primes
-mersenne: mersenne.o
-	$(CC) mersenne.o $(LIBNAME) -o mersenne
-
-# fines DR safe primes for the given config
-drprime: drprime.o
-	$(CC) drprime.o $(LIBNAME) -o drprime
-	
-# fines 2k safe primes for the given config
-2kprime: 2kprime.o
-	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
-
-mont: mont.o
-	$(CC) mont.o $(LIBNAME) -o mont
-
-        
-clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il
diff --git a/libtommath/etc/makefile.msvc b/libtommath/etc/makefile.msvc
deleted file mode 100644
index 2833372..0000000
--- a/libtommath/etc/makefile.msvc
+++ /dev/null
@@ -1,23 +0,0 @@
-#MSVC Makefile
-#
-#Tom St Denis
-
-CFLAGS = /I../ /Ox /DWIN32 /W3
-
-pprime: pprime.obj
-	cl pprime.obj ../tommath.lib 
-
-mersenne: mersenne.obj
-	cl mersenne.obj ../tommath.lib
-	
-tune: tune.obj
-	cl tune.obj ../tommath.lib
-
-mont: mont.obj
-	cl mont.obj ../tommath.lib
-	
-drprime: drprime.obj
-	cl drprime.obj ../tommath.lib
-
-2kprime: 2kprime.obj
-	cl 2kprime.obj ../tommath.lib
diff --git a/libtommath/etc/mersenne.c b/libtommath/etc/mersenne.c
deleted file mode 100644
index ae6725a..0000000
--- a/libtommath/etc/mersenne.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Finds Mersenne primes using the Lucas-Lehmer test 
- *
- * Tom St Denis, tomstdenis@gmail.com
- */
-#include <time.h>
-#include <tommath.h>
-
-int
-is_mersenne (long s, int *pp)
-{
-  mp_int  n, u;
-  int     res, k;
-  
-  *pp = 0;
-
-  if ((res = mp_init (&n)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&u)) != MP_OKAY) {
-    goto LBL_N;
-  }
-
-  /* n = 2^s - 1 */
-  if ((res = mp_2expt(&n, s)) != MP_OKAY) {
-     goto LBL_MU;
-  }
-  if ((res = mp_sub_d (&n, 1, &n)) != MP_OKAY) {
-    goto LBL_MU;
-  }
-
-  /* set u=4 */
-  mp_set (&u, 4);
-
-  /* for k=1 to s-2 do */
-  for (k = 1; k <= s - 2; k++) {
-    /* u = u^2 - 2 mod n */
-    if ((res = mp_sqr (&u, &u)) != MP_OKAY) {
-      goto LBL_MU;
-    }
-    if ((res = mp_sub_d (&u, 2, &u)) != MP_OKAY) {
-      goto LBL_MU;
-    }
-
-    /* make sure u is positive */
-    while (u.sign == MP_NEG) {
-      if ((res = mp_add (&u, &n, &u)) != MP_OKAY) {
-         goto LBL_MU;
-      }
-    }
-
-    /* reduce */
-    if ((res = mp_reduce_2k (&u, &n, 1)) != MP_OKAY) {
-      goto LBL_MU;
-    }
-  }
-
-  /* if u == 0 then its prime */
-  if (mp_iszero (&u) == 1) {
-    mp_prime_is_prime(&n, 8, pp);
-  if (*pp != 1) printf("FAILURE\n");
-  }
-
-  res = MP_OKAY;
-LBL_MU:mp_clear (&u);
-LBL_N:mp_clear (&n);
-  return res;
-}
-
-/* square root of a long < 65536 */
-long
-i_sqrt (long x)
-{
-  long    x1, x2;
-
-  x2 = 16;
-  do {
-    x1 = x2;
-    x2 = x1 - ((x1 * x1) - x) / (2 * x1);
-  } while (x1 != x2);
-
-  if (x1 * x1 > x) {
-    --x1;
-  }
-
-  return x1;
-}
-
-/* is the long prime by brute force */
-int
-isprime (long k)
-{
-  long    y, z;
-
-  y = i_sqrt (k);
-  for (z = 2; z <= y; z++) {
-    if ((k % z) == 0)
-      return 0;
-  }
-  return 1;
-}
-
-
-int
-main (void)
-{
-  int     pp;
-  long    k;
-  clock_t tt;
-
-  k = 3;
-
-  for (;;) {
-    /* start time */
-    tt = clock ();
-
-    /* test if 2^k - 1 is prime */
-    if (is_mersenne (k, &pp) != MP_OKAY) {
-      printf ("Whoa error\n");
-      return -1;
-    }
-
-    if (pp == 1) {
-      /* count time */
-      tt = clock () - tt;
-
-      /* display if prime */
-      printf ("2^%-5ld - 1 is prime, test took %ld ticks\n", k, tt);
-    }
-
-    /* goto next odd exponent */
-    k += 2;
-
-    /* but make sure its prime */
-    while (isprime (k) == 0) {
-      k += 2;
-    }
-  }
-  return 0;
-}
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/etc/mont.c b/libtommath/etc/mont.c
deleted file mode 100644
index 45cf3fd..0000000
--- a/libtommath/etc/mont.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* tests the montgomery routines */
-#include <tommath.h>
-
-int main(void)
-{
-   mp_int modulus, R, p, pp;
-   mp_digit mp;
-   long x, y;
-
-   srand(time(NULL));
-   mp_init_multi(&modulus, &R, &p, &pp, NULL);
-
-   /* loop through various sizes */
-   for (x = 4; x < 256; x++) {
-       printf("DIGITS == %3ld...", x); fflush(stdout);
-       
-       /* make up the odd modulus */
-       mp_rand(&modulus, x);
-       modulus.dp[0] |= 1;
-       
-       /* now find the R value */
-       mp_montgomery_calc_normalization(&R, &modulus);
-       mp_montgomery_setup(&modulus, &mp);
-       
-       /* now run through a bunch tests */
-       for (y = 0; y < 1000; y++) {
-           mp_rand(&p, x/2);        /* p = random */
-           mp_mul(&p, &R, &pp);     /* pp = R * p */
-           mp_montgomery_reduce(&pp, &modulus, mp);
-           
-           /* should be equal to p */
-           if (mp_cmp(&pp, &p) != MP_EQ) {
-              printf("FAILURE!\n");
-              exit(-1);
-           }
-       }
-       printf("PASSED\n");
-    }
-    
-    return 0;
-}
-
-
-
-
-
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/etc/pprime.c b/libtommath/etc/pprime.c
deleted file mode 100644
index 9f94423..0000000
--- a/libtommath/etc/pprime.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/* Generates provable primes
- *
- * See http://gmail.com:8080/papers/pp.pdf for more info.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://tom.gmail.com
- */
-#include <time.h>
-#include "tommath.h"
-
-int   n_prime;
-FILE *primes;
-
-/* fast square root */
-static  mp_digit
-i_sqrt (mp_word x)
-{
-  mp_word x1, x2;
-
-  x2 = x;
-  do {
-    x1 = x2;
-    x2 = x1 - ((x1 * x1) - x) / (2 * x1);
-  } while (x1 != x2);
-
-  if (x1 * x1 > x) {
-    --x1;
-  }
-
-  return x1;
-}
-
-
-/* generates a prime digit */
-static void gen_prime (void)
-{
-  mp_digit r, x, y, next;
-  FILE *out;
-
-  out = fopen("pprime.dat", "wb");
-
-  /* write first set of primes */
-  r = 3; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 5; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 7; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 11; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 13; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 17; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 19; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 23; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 29; fwrite(&r, 1, sizeof(mp_digit), out);
-  r = 31; fwrite(&r, 1, sizeof(mp_digit), out);
-
-  /* get square root, since if 'r' is composite its factors must be < than this */
-  y = i_sqrt (r);
-  next = (y + 1) * (y + 1);
-
-  for (;;) {
-  do {
-    r += 2;			/* next candidate */
-    r &= MP_MASK;
-    if (r < 31) break;
-
-    /* update sqrt ? */
-    if (next <= r) {
-      ++y;
-      next = (y + 1) * (y + 1);
-    }
-
-    /* loop if divisible by 3,5,7,11,13,17,19,23,29  */
-    if ((r % 3) == 0) {
-      x = 0;
-      continue;
-    }
-    if ((r % 5) == 0) {
-      x = 0;
-      continue;
-    }
-    if ((r % 7) == 0) {
-      x = 0;
-      continue;
-    }
-    if ((r % 11) == 0) {
-      x = 0;
-      continue;
-    }
-    if ((r % 13) == 0) {
-      x = 0;
-      continue;
-    }
-    if ((r % 17) == 0) {
-      x = 0;
-      continue;
-    }
-    if ((r % 19) == 0) {
-      x = 0;
-      continue;
-    }
-    if ((r % 23) == 0) {
-      x = 0;
-      continue;
-    }
-    if ((r % 29) == 0) {
-      x = 0;
-      continue;
-    }
-
-    /* now check if r is divisible by x + k={1,7,11,13,17,19,23,29} */
-    for (x = 30; x <= y; x += 30) {
-      if ((r % (x + 1)) == 0) {
-	x = 0;
-	break;
-      }
-      if ((r % (x + 7)) == 0) {
-	x = 0;
-	break;
-      }
-      if ((r % (x + 11)) == 0) {
-	x = 0;
-	break;
-      }
-      if ((r % (x + 13)) == 0) {
-	x = 0;
-	break;
-      }
-      if ((r % (x + 17)) == 0) {
-	x = 0;
-	break;
-      }
-      if ((r % (x + 19)) == 0) {
-	x = 0;
-	break;
-      }
-      if ((r % (x + 23)) == 0) {
-	x = 0;
-	break;
-      }
-      if ((r % (x + 29)) == 0) {
-	x = 0;
-	break;
-      }
-    }
-  } while (x == 0);
-  if (r > 31) { fwrite(&r, 1, sizeof(mp_digit), out); printf("%9d\r", r); fflush(stdout); }
-  if (r < 31) break;
-  }
-
-  fclose(out);
-}
-
-void load_tab(void)
-{
-   primes = fopen("pprime.dat", "rb");
-   if (primes == NULL) {
-      gen_prime();
-      primes = fopen("pprime.dat", "rb");
-   }
-   fseek(primes, 0, SEEK_END);
-   n_prime = ftell(primes) / sizeof(mp_digit);
-}
-
-mp_digit prime_digit(void)
-{
-   int n;
-   mp_digit d;
-
-   n = abs(rand()) % n_prime;
-   fseek(primes, n * sizeof(mp_digit), SEEK_SET);
-   fread(&d, 1, sizeof(mp_digit), primes);
-   return d;
-}
-
-
-/* makes a prime of at least k bits */
-int
-pprime (int k, int li, mp_int * p, mp_int * q)
-{
-  mp_int  a, b, c, n, x, y, z, v;
-  int     res, ii;
-  static const mp_digit bases[] = { 2, 3, 5, 7, 11, 13, 17, 19 };
-
-  /* single digit ? */
-  if (k <= (int) DIGIT_BIT) {
-    mp_set (p, prime_digit ());
-    return MP_OKAY;
-  }
-
-  if ((res = mp_init (&c)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&v)) != MP_OKAY) {
-    goto LBL_C;
-  }
-
-  /* product of first 50 primes */
-  if ((res =
-       mp_read_radix (&v,
-		      "19078266889580195013601891820992757757219839668357012055907516904309700014933909014729740190",
-		      10)) != MP_OKAY) {
-    goto LBL_V;
-  }
-
-  if ((res = mp_init (&a)) != MP_OKAY) {
-    goto LBL_V;
-  }
-
-  /* set the prime */
-  mp_set (&a, prime_digit ());
-
-  if ((res = mp_init (&b)) != MP_OKAY) {
-    goto LBL_A;
-  }
-
-  if ((res = mp_init (&n)) != MP_OKAY) {
-    goto LBL_B;
-  }
-
-  if ((res = mp_init (&x)) != MP_OKAY) {
-    goto LBL_N;
-  }
-
-  if ((res = mp_init (&y)) != MP_OKAY) {
-    goto LBL_X;
-  }
-
-  if ((res = mp_init (&z)) != MP_OKAY) {
-    goto LBL_Y;
-  }
-
-  /* now loop making the single digit */
-  while (mp_count_bits (&a) < k) {
-    fprintf (stderr, "prime has %4d bits left\r", k - mp_count_bits (&a));
-    fflush (stderr);
-  top:
-    mp_set (&b, prime_digit ());
-
-    /* now compute z = a * b * 2 */
-    if ((res = mp_mul (&a, &b, &z)) != MP_OKAY) {	/* z = a * b */
-      goto LBL_Z;
-    }
-
-    if ((res = mp_copy (&z, &c)) != MP_OKAY) {	/* c = a * b */
-      goto LBL_Z;
-    }
-
-    if ((res = mp_mul_2 (&z, &z)) != MP_OKAY) {	/* z = 2 * a * b */
-      goto LBL_Z;
-    }
-
-    /* n = z + 1 */
-    if ((res = mp_add_d (&z, 1, &n)) != MP_OKAY) {	/* n = z + 1 */
-      goto LBL_Z;
-    }
-
-    /* check (n, v) == 1 */
-    if ((res = mp_gcd (&n, &v, &y)) != MP_OKAY) {	/* y = (n, v) */
-      goto LBL_Z;
-    }
-
-    if (mp_cmp_d (&y, 1) != MP_EQ)
-      goto top;
-
-    /* now try base x=bases[ii]  */
-    for (ii = 0; ii < li; ii++) {
-      mp_set (&x, bases[ii]);
-
-      /* compute x^a mod n */
-      if ((res = mp_exptmod (&x, &a, &n, &y)) != MP_OKAY) {	/* y = x^a mod n */
-	goto LBL_Z;
-      }
-
-      /* if y == 1 loop */
-      if (mp_cmp_d (&y, 1) == MP_EQ)
-	continue;
-
-      /* now x^2a mod n */
-      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2a mod n */
-	goto LBL_Z;
-      }
-
-      if (mp_cmp_d (&y, 1) == MP_EQ)
-	continue;
-
-      /* compute x^b mod n */
-      if ((res = mp_exptmod (&x, &b, &n, &y)) != MP_OKAY) {	/* y = x^b mod n */
-	goto LBL_Z;
-      }
-
-      /* if y == 1 loop */
-      if (mp_cmp_d (&y, 1) == MP_EQ)
-	continue;
-
-      /* now x^2b mod n */
-      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2b mod n */
-	goto LBL_Z;
-      }
-
-      if (mp_cmp_d (&y, 1) == MP_EQ)
-	continue;
-
-      /* compute x^c mod n == x^ab mod n */
-      if ((res = mp_exptmod (&x, &c, &n, &y)) != MP_OKAY) {	/* y = x^ab mod n */
-	goto LBL_Z;
-      }
-
-      /* if y == 1 loop */
-      if (mp_cmp_d (&y, 1) == MP_EQ)
-	continue;
-
-      /* now compute (x^c mod n)^2 */
-      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2ab mod n */
-	goto LBL_Z;
-      }
-
-      /* y should be 1 */
-      if (mp_cmp_d (&y, 1) != MP_EQ)
-	continue;
-      break;
-    }
-
-    /* no bases worked? */
-    if (ii == li)
-      goto top;
-
-{
-   char buf[4096];
-
-   mp_toradix(&n, buf, 10);
-   printf("Certificate of primality for:\n%s\n\n", buf);
-   mp_toradix(&a, buf, 10);
-   printf("A == \n%s\n\n", buf);
-   mp_toradix(&b, buf, 10);
-   printf("B == \n%s\n\nG == %d\n", buf, bases[ii]);
-   printf("----------------------------------------------------------------\n");
-}
-
-    /* a = n */
-    mp_copy (&n, &a);
-  }
-
-  /* get q to be the order of the large prime subgroup */
-  mp_sub_d (&n, 1, q);
-  mp_div_2 (q, q);
-  mp_div (q, &b, q, NULL);
-
-  mp_exch (&n, p);
-
-  res = MP_OKAY;
-LBL_Z:mp_clear (&z);
-LBL_Y:mp_clear (&y);
-LBL_X:mp_clear (&x);
-LBL_N:mp_clear (&n);
-LBL_B:mp_clear (&b);
-LBL_A:mp_clear (&a);
-LBL_V:mp_clear (&v);
-LBL_C:mp_clear (&c);
-  return res;
-}
-
-
-int
-main (void)
-{
-  mp_int  p, q;
-  char    buf[4096];
-  int     k, li;
-  clock_t t1;
-
-  srand (time (NULL));
-  load_tab();
-
-  printf ("Enter # of bits: \n");
-  fgets (buf, sizeof (buf), stdin);
-  sscanf (buf, "%d", &k);
-
-  printf ("Enter number of bases to try (1 to 8):\n");
-  fgets (buf, sizeof (buf), stdin);
-  sscanf (buf, "%d", &li);
-
-
-  mp_init (&p);
-  mp_init (&q);
-
-  t1 = clock ();
-  pprime (k, li, &p, &q);
-  t1 = clock () - t1;
-
-  printf ("\n\nTook %ld ticks, %d bits\n", t1, mp_count_bits (&p));
-
-  mp_toradix (&p, buf, 10);
-  printf ("P == %s\n", buf);
-  mp_toradix (&q, buf, 10);
-  printf ("Q == %s\n", buf);
-
-  return 0;
-}
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/etc/prime.1024 b/libtommath/etc/prime.1024
deleted file mode 100644
index 5636e2d..0000000
--- a/libtommath/etc/prime.1024
+++ /dev/null
@@ -1,414 +0,0 @@
-Enter # of bits: 
-Enter number of bases to try (1 to 8):
-Certificate of primality for:
-36360080703173363
-
-A == 
-89963569
-
-B == 
-202082249
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-4851595597739856136987139
-
-A == 
-36360080703173363
-
-B == 
-66715963
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-19550639734462621430325731591027
-
-A == 
-4851595597739856136987139
-
-B == 
-2014867
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-10409036141344317165691858509923818734539
-
-A == 
-19550639734462621430325731591027
-
-B == 
-266207047
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-1049829549988285012736475602118094726647504414203
-
-A == 
-10409036141344317165691858509923818734539
-
-B == 
-50428759
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-77194737385528288387712399596835459931920358844586615003
-
-A == 
-1049829549988285012736475602118094726647504414203
-
-B == 
-36765367
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-35663756695365208574443215955488689578374232732893628896541201763
-
-A == 
-77194737385528288387712399596835459931920358844586615003
-
-B == 
-230998627
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-16711831463502165169495622246023119698415848120292671294127567620396469803
-
-A == 
-35663756695365208574443215955488689578374232732893628896541201763
-
-B == 
-234297127
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-6163534781560285962890718925972249753147470953579266394395432475622345597103528739
-
-A == 
-16711831463502165169495622246023119698415848120292671294127567620396469803
-
-B == 
-184406323
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-814258256205243497704094951432575867360065658372158511036259934640748088306764553488803787
-
-A == 
-6163534781560285962890718925972249753147470953579266394395432475622345597103528739
-
-B == 
-66054487
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-176469695533271657902814176811660357049007467856432383037590673407330246967781451723764079581998187
-
-A == 
-814258256205243497704094951432575867360065658372158511036259934640748088306764553488803787
-
-B == 
-108362239
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-44924492859445516541759485198544012102424796403707253610035148063863073596051272171194806669756971406400419
-
-A == 
-176469695533271657902814176811660357049007467856432383037590673407330246967781451723764079581998187
-
-B == 
-127286707
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-20600996927219343383225424320134474929609459588323857796871086845924186191561749519858600696159932468024710985371059
-
-A == 
-44924492859445516541759485198544012102424796403707253610035148063863073596051272171194806669756971406400419
-
-B == 
-229284691
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-6295696427695493110141186605837397185848992307978456138112526915330347715236378041486547994708748840844217371233735072572979
-
-A == 
-20600996927219343383225424320134474929609459588323857796871086845924186191561749519858600696159932468024710985371059
-
-B == 
-152800771
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-3104984078042317488749073016454213579257792635142218294052134804187631661145261015102617582090263808696699966840735333252107678792123
-
-A == 
-6295696427695493110141186605837397185848992307978456138112526915330347715236378041486547994708748840844217371233735072572979
-
-B == 
-246595759
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-26405175827665701256325699315126705508919255051121452292124404943796947287968603975320562847910946802396632302209435206627913466015741799499
-
-A == 
-3104984078042317488749073016454213579257792635142218294052134804187631661145261015102617582090263808696699966840735333252107678792123
-
-B == 
-4252063
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-11122146237908413610034600609460545703591095894418599759742741406628055069007082998134905595800236452010905900391505454890446585211975124558601770163
-
-A == 
-26405175827665701256325699315126705508919255051121452292124404943796947287968603975320562847910946802396632302209435206627913466015741799499
-
-B == 
-210605419
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-1649861642047798890580354082088712649911849362201343649289384923147797960364736011515757482030049342943790127685185806092659832129486307035500638595572396187
-
-A == 
-11122146237908413610034600609460545703591095894418599759742741406628055069007082998134905595800236452010905900391505454890446585211975124558601770163
-
-B == 
-74170111
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-857983367126266717607389719637086684134462613006415859877666235955788392464081914127715967940968197765042399904117392707518175220864852816390004264107201177394565363
-
-A == 
-1649861642047798890580354082088712649911849362201343649289384923147797960364736011515757482030049342943790127685185806092659832129486307035500638595572396187
-
-B == 
-260016763
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-175995909353623703257072120479340610010337144085688850745292031336724691277374210929188442230237711063783727092685448718515661641054886101716698390145283196296702450566161283
-
-A == 
-857983367126266717607389719637086684134462613006415859877666235955788392464081914127715967940968197765042399904117392707518175220864852816390004264107201177394565363
-
-B == 
-102563707
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-48486002551155667224487059713350447239190772068092630563272168418880661006593537218144160068395218642353495339720640699721703003648144463556291315694787862009052641640656933232794283
-
-A == 
-175995909353623703257072120479340610010337144085688850745292031336724691277374210929188442230237711063783727092685448718515661641054886101716698390145283196296702450566161283
-
-B == 
-137747527
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-13156468011529105025061495011938518171328604045212410096476697450506055664012861932372156505805788068791146986282263016790631108386790291275939575123375304599622623328517354163964228279867403
-
-A == 
-48486002551155667224487059713350447239190772068092630563272168418880661006593537218144160068395218642353495339720640699721703003648144463556291315694787862009052641640656933232794283
-
-B == 
-135672847
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-6355194692790533601105154341731997464407930009404822926832136060319955058388106456084549316415200519472481147942263916585428906582726749131479465958107142228236909665306781538860053107680830113869123
-
-A == 
-13156468011529105025061495011938518171328604045212410096476697450506055664012861932372156505805788068791146986282263016790631108386790291275939575123375304599622623328517354163964228279867403
-
-B == 
-241523587
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-3157116676535430302794438027544146642863331358530722860333745617571010460905857862561870488000265751138954271040017454405707755458702044884023184574412221802502351503929935224995314581932097706874819348858083
-
-A == 
-6355194692790533601105154341731997464407930009404822926832136060319955058388106456084549316415200519472481147942263916585428906582726749131479465958107142228236909665306781538860053107680830113869123
-
-B == 
-248388667
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-390533129219992506725320633489467713907837370444962163378727819939092929448752905310115311180032249230394348337568973177802874166228132778126338883671958897238722734394783244237133367055422297736215754829839364158067
-
-A == 
-3157116676535430302794438027544146642863331358530722860333745617571010460905857862561870488000265751138954271040017454405707755458702044884023184574412221802502351503929935224995314581932097706874819348858083
-
-B == 
-61849651
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-48583654555070224891047847050732516652910250240135992225139515777200432486685999462997073444468380434359929499498804723793106565291183220444221080449740542884172281158126259373095216435009661050109711341419005972852770440739
-
-A == 
-390533129219992506725320633489467713907837370444962163378727819939092929448752905310115311180032249230394348337568973177802874166228132778126338883671958897238722734394783244237133367055422297736215754829839364158067
-
-B == 
-62201707
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-25733035251905120039135866524384525138869748427727001128764704499071378939227862068500633813538831598776578372709963673670934388213622433800015759585470542686333039614931682098922935087822950084908715298627996115185849260703525317419
-
-A == 
-48583654555070224891047847050732516652910250240135992225139515777200432486685999462997073444468380434359929499498804723793106565291183220444221080449740542884172281158126259373095216435009661050109711341419005972852770440739
-
-B == 
-264832231
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-2804594464939948901906623499531073917980499195397462605359913717827014360538186518540781517129548650937632008683280555602633122170458773895504894807182664540529077836857897972175530148107545939211339044386106111633510166695386323426241809387
-
-A == 
-25733035251905120039135866524384525138869748427727001128764704499071378939227862068500633813538831598776578372709963673670934388213622433800015759585470542686333039614931682098922935087822950084908715298627996115185849260703525317419
-
-B == 
-54494047
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-738136612083433720096707308165797114449914259256979340471077690416567237592465306112484843530074782721390528773594351482384711900456440808251196845265132086486672447136822046628407467459921823150600138073268385534588238548865012638209515923513516547
-
-A == 
-2804594464939948901906623499531073917980499195397462605359913717827014360538186518540781517129548650937632008683280555602633122170458773895504894807182664540529077836857897972175530148107545939211339044386106111633510166695386323426241809387
-
-B == 
-131594179
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-392847529056126766528615419937165193421166694172790666626558750047057558168124866940509180171236517681470100877687445134633784815352076138790217228749332398026714192707447855731679485746120589851992221508292976900578299504461333767437280988393026452846013683
-
-A == 
-738136612083433720096707308165797114449914259256979340471077690416567237592465306112484843530074782721390528773594351482384711900456440808251196845265132086486672447136822046628407467459921823150600138073268385534588238548865012638209515923513516547
-
-B == 
-266107603
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-168459393231883505975876919268398655632763956627405508859662408056221544310200546265681845397346956580604208064328814319465940958080244889692368602591598503944015835190587740756859842792554282496742843600573336023639256008687581291233481455395123454655488735304365627
-
-A == 
-392847529056126766528615419937165193421166694172790666626558750047057558168124866940509180171236517681470100877687445134633784815352076138790217228749332398026714192707447855731679485746120589851992221508292976900578299504461333767437280988393026452846013683
-
-B == 
-214408111
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-14865774288636941404884923981945833072113667565310054952177860608355263252462409554658728941191929400198053290113492910272458441655458514080123870132092365833472436407455910185221474386718838138135065780840839893113912689594815485706154461164071775481134379794909690501684643
-
-A == 
-168459393231883505975876919268398655632763956627405508859662408056221544310200546265681845397346956580604208064328814319465940958080244889692368602591598503944015835190587740756859842792554282496742843600573336023639256008687581291233481455395123454655488735304365627
-
-B == 
-44122723
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-1213301773203241614897109856134894783021668292000023984098824423682568173639394290886185366993108292039068940333907505157813934962357206131450244004178619265868614859794316361031904412926604138893775068853175215502104744339658944443630407632290152772487455298652998368296998719996019
-
-A == 
-14865774288636941404884923981945833072113667565310054952177860608355263252462409554658728941191929400198053290113492910272458441655458514080123870132092365833472436407455910185221474386718838138135065780840839893113912689594815485706154461164071775481134379794909690501684643
-
-B == 
-40808563
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-186935245989515158127969129347464851990429060640910951266513740972248428651109062997368144722015290092846666943896556191257222521203647606911446635194198213436423080005867489516421559330500722264446765608763224572386410155413161172707802334865729654109050873820610813855041667633843601286843
-
-A == 
-1213301773203241614897109856134894783021668292000023984098824423682568173639394290886185366993108292039068940333907505157813934962357206131450244004178619265868614859794316361031904412926604138893775068853175215502104744339658944443630407632290152772487455298652998368296998719996019
-
-B == 
-77035759
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-83142661079751490510739960019112406284111408348732592580459037404394946037094409915127399165633756159385609671956087845517678367844901424617866988187132480585966721962585586730693443536100138246516868613250009028187662080828012497191775172228832247706080044971423654632146928165751885302331924491683
-
-A == 
-186935245989515158127969129347464851990429060640910951266513740972248428651109062997368144722015290092846666943896556191257222521203647606911446635194198213436423080005867489516421559330500722264446765608763224572386410155413161172707802334865729654109050873820610813855041667633843601286843
-
-B == 
-222383587
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-3892354773803809855317742245039794448230625839512638747643814927766738642436392673485997449586432241626440927010641564064764336402368634186618250134234189066179771240232458249806850838490410473462391401438160528157981942499581634732706904411807195259620779379274017704050790865030808501633772117217899534443
-
-A == 
-83142661079751490510739960019112406284111408348732592580459037404394946037094409915127399165633756159385609671956087845517678367844901424617866988187132480585966721962585586730693443536100138246516868613250009028187662080828012497191775172228832247706080044971423654632146928165751885302331924491683
-
-B == 
-23407687
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-1663606652988091811284014366560171522582683318514519379924950390627250155440313691226744227787921928894551755219495501365555370027257568506349958010457682898612082048959464465369892842603765280317696116552850664773291371490339084156052244256635115997453399761029567033971998617303988376172539172702246575225837054723
-
-A == 
-3892354773803809855317742245039794448230625839512638747643814927766738642436392673485997449586432241626440927010641564064764336402368634186618250134234189066179771240232458249806850838490410473462391401438160528157981942499581634732706904411807195259620779379274017704050790865030808501633772117217899534443
-
-B == 
-213701827
-
-G == 2
-----------------------------------------------------------------
-
-
-Took 33057 ticks, 1048 bits
-P == 1663606652988091811284014366560171522582683318514519379924950390627250155440313691226744227787921928894551755219495501365555370027257568506349958010457682898612082048959464465369892842603765280317696116552850664773291371490339084156052244256635115997453399761029567033971998617303988376172539172702246575225837054723
-Q == 3892354773803809855317742245039794448230625839512638747643814927766738642436392673485997449586432241626440927010641564064764336402368634186618250134234189066179771240232458249806850838490410473462391401438160528157981942499581634732706904411807195259620779379274017704050790865030808501633772117217899534443
diff --git a/libtommath/etc/prime.512 b/libtommath/etc/prime.512
deleted file mode 100644
index cb6ec30..0000000
--- a/libtommath/etc/prime.512
+++ /dev/null
@@ -1,205 +0,0 @@
-Enter # of bits: 
-Enter number of bases to try (1 to 8):
-Certificate of primality for:
-85933926807634727
-
-A == 
-253758023
-
-B == 
-169322581
-
-G == 5
-----------------------------------------------------------------
-Certificate of primality for:
-23930198825086241462113799
-
-A == 
-85933926807634727
-
-B == 
-139236037
-
-G == 11
-----------------------------------------------------------------
-Certificate of primality for:
-6401844647261612602378676572510019
-
-A == 
-23930198825086241462113799
-
-B == 
-133760791
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-269731366027728777712034888684015329354259
-
-A == 
-6401844647261612602378676572510019
-
-B == 
-21066691
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-37942338209025571690075025099189467992329684223707
-
-A == 
-269731366027728777712034888684015329354259
-
-B == 
-70333567
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-15306904714258982484473490774101705363308327436988160248323
-
-A == 
-37942338209025571690075025099189467992329684223707
-
-B == 
-201712723
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-1616744757018513392810355191503853040357155275733333124624513530099
-
-A == 
-15306904714258982484473490774101705363308327436988160248323
-
-B == 
-52810963
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-464222094814208047161771036072622485188658077940154689939306386289983787983
-
-A == 
-1616744757018513392810355191503853040357155275733333124624513530099
-
-B == 
-143566909
-
-G == 5
-----------------------------------------------------------------
-Certificate of primality for:
-187429931674053784626487560729643601208757374994177258429930699354770049369025096447
-
-A == 
-464222094814208047161771036072622485188658077940154689939306386289983787983
-
-B == 
-201875281
-
-G == 5
-----------------------------------------------------------------
-Certificate of primality for:
-100579220846502621074093727119851331775052664444339632682598589456666938521976625305832917563
-
-A == 
-187429931674053784626487560729643601208757374994177258429930699354770049369025096447
-
-B == 
-268311523
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-1173616081309758475197022137833792133815753368965945885089720153370737965497134878651384030219765163
-
-A == 
-100579220846502621074093727119851331775052664444339632682598589456666938521976625305832917563
-
-B == 
-5834287
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-191456913489905913185935197655672585713573070349044195411728114905691721186574907738081340754373032735283623
-
-A == 
-1173616081309758475197022137833792133815753368965945885089720153370737965497134878651384030219765163
-
-B == 
-81567097
-
-G == 5
-----------------------------------------------------------------
-Certificate of primality for:
-57856530489201750164178576399448868489243874083056587683743345599898489554401618943240901541005080049321706789987519
-
-A == 
-191456913489905913185935197655672585713573070349044195411728114905691721186574907738081340754373032735283623
-
-B == 
-151095433
-
-G == 7
-----------------------------------------------------------------
-Certificate of primality for:
-13790529750452576698109671710773784949185621244122040804792403407272729038377767162233653248852099545134831722512085881814803
-
-A == 
-57856530489201750164178576399448868489243874083056587683743345599898489554401618943240901541005080049321706789987519
-
-B == 
-119178679
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-7075985989000817742677547821106534174334812111605018857703825637170140040509067704269696198231266351631132464035671858077052876058979
-
-A == 
-13790529750452576698109671710773784949185621244122040804792403407272729038377767162233653248852099545134831722512085881814803
-
-B == 
-256552363
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-1227273006232588072907488910282307435921226646895131225407452056677899411162892829564455154080310937471747140942360789623819327234258162420463
-
-A == 
-7075985989000817742677547821106534174334812111605018857703825637170140040509067704269696198231266351631132464035671858077052876058979
-
-B == 
-86720989
-
-G == 5
-----------------------------------------------------------------
-Certificate of primality for:
-446764896913554613686067036908702877942872355053329937790398156069936255759889884246832779737114032666318220500106499161852193765380831330106375235763
-
-A == 
-1227273006232588072907488910282307435921226646895131225407452056677899411162892829564455154080310937471747140942360789623819327234258162420463
-
-B == 
-182015287
-
-G == 2
-----------------------------------------------------------------
-Certificate of primality for:
-5290203010849586596974953717018896543907195901082056939587768479377028575911127944611236020459652034082251335583308070846379514569838984811187823420951275243
-
-A == 
-446764896913554613686067036908702877942872355053329937790398156069936255759889884246832779737114032666318220500106499161852193765380831330106375235763
-
-B == 
-5920567
-
-G == 2
-----------------------------------------------------------------
-
-
-Took 3454 ticks, 521 bits
-P == 5290203010849586596974953717018896543907195901082056939587768479377028575911127944611236020459652034082251335583308070846379514569838984811187823420951275243
-Q == 446764896913554613686067036908702877942872355053329937790398156069936255759889884246832779737114032666318220500106499161852193765380831330106375235763
diff --git a/libtommath/etc/timer.asm b/libtommath/etc/timer.asm
deleted file mode 100644
index 326a947..0000000
--- a/libtommath/etc/timer.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-; x86 timer in NASM
-;
-; Tom St Denis, tomstdenis@iahu.ca
-[bits 32]
-[section .data]
-time dd 0, 0
-
-[section .text]
-
-%ifdef USE_ELF
-[global t_start]
-t_start:
-%else
-[global _t_start]
-_t_start:
-%endif
-   push edx
-   push eax
-   rdtsc
-   mov [time+0],edx
-   mov [time+4],eax
-   pop eax
-   pop edx
-   ret
-   
-%ifdef USE_ELF
-[global t_read]
-t_read:
-%else
-[global _t_read]
-_t_read:
-%endif
-   rdtsc
-   sub eax,[time+4]
-   sbb edx,[time+0]
-   ret
-   
-\ No newline at end of file
diff --git a/libtommath/etc/tune.c b/libtommath/etc/tune.c
deleted file mode 100644
index c2ac998..0000000
--- a/libtommath/etc/tune.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Tune the Karatsuba parameters
- *
- * Tom St Denis, tomstdenis@gmail.com
- */
-#include <tommath.h>
-#include <time.h>
-
-/* how many times todo each size mult.  Depends on your computer.  For slow computers
- * this can be low like 5 or 10.  For fast [re: Athlon] should be 25 - 50 or so
- */
-#define TIMES (1UL<<14UL)
-
-#ifndef X86_TIMER
-
-/* RDTSC from Scott Duplichan */
-static ulong64 TIMFUNC (void)
-   {
-   #if defined __GNUC__
-      #if defined(__i386__) || defined(__x86_64__)
-        /* version from http://www.mcs.anl.gov/~kazutomo/rdtsc.html
-         * the old code always got a warning issued by gcc, clang did not complain...
-         */
-        unsigned hi, lo;
-        __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
-        return ((ulong64)lo)|( ((ulong64)hi)<<32);
-      #else /* gcc-IA64 version */
-         unsigned long result;
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         while (__builtin_expect ((int) result == -1, 0))
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         return result;
-      #endif
-
-   // Microsoft and Intel Windows compilers
-   #elif defined _M_IX86
-     __asm rdtsc
-   #elif defined _M_AMD64
-     return __rdtsc ();
-   #elif defined _M_IA64
-     #if defined __INTEL_COMPILER
-       #include <ia64intrin.h>
-     #endif
-      return __getReg (3116);
-   #else
-     #error need rdtsc function for this build
-   #endif
-   }
-
-
-/* generic ISO C timer */
-ulong64 LBL_T;
-void t_start(void) { LBL_T = TIMFUNC(); }
-ulong64 t_read(void) { return TIMFUNC() - LBL_T; }
-
-#else
-extern void t_start(void);
-extern ulong64 t_read(void);
-#endif
-
-ulong64 time_mult(int size, int s)
-{
-  unsigned long     x;
-  mp_int  a, b, c;
-  ulong64 t1;
-
-  mp_init (&a);
-  mp_init (&b);
-  mp_init (&c);
-
-  mp_rand (&a, size);
-  mp_rand (&b, size);
-
-  if (s == 1) {
-      KARATSUBA_MUL_CUTOFF = size;
-  } else {
-      KARATSUBA_MUL_CUTOFF = 100000;
-  }
-
-  t_start();
-  for (x = 0; x < TIMES; x++) {
-      mp_mul(&a,&b,&c);
-  }
-  t1 = t_read();
-  mp_clear (&a);
-  mp_clear (&b);
-  mp_clear (&c);
-  return t1;
-}
-
-ulong64 time_sqr(int size, int s)
-{
-  unsigned long     x;
-  mp_int  a, b;
-  ulong64 t1;
-
-  mp_init (&a);
-  mp_init (&b);
-
-  mp_rand (&a, size);
-
-  if (s == 1) {
-      KARATSUBA_SQR_CUTOFF = size;
-  } else {
-      KARATSUBA_SQR_CUTOFF = 100000;
-  }
-
-  t_start();
-  for (x = 0; x < TIMES; x++) {
-      mp_sqr(&a,&b);
-  }
-  t1 = t_read();
-  mp_clear (&a);
-  mp_clear (&b);
-  return t1;
-}
-
-int
-main (void)
-{
-  ulong64 t1, t2;
-  int x, y;
-
-  for (x = 8; ; x += 2) {
-     t1 = time_mult(x, 0);
-     t2 = time_mult(x, 1);
-     printf("%d: %9llu %9llu, %9llu\n", x, t1, t2, t2 - t1);
-     if (t2 < t1) break;
-  }
-  y = x;
-
-  for (x = 8; ; x += 2) {
-     t1 = time_sqr(x, 0);
-     t2 = time_sqr(x, 1);
-     printf("%d: %9llu %9llu, %9llu\n", x, t1, t2, t2 - t1);
-     if (t2 < t1) break;
-  }
-  printf("KARATSUBA_MUL_CUTOFF = %d\n", y);
-  printf("KARATSUBA_SQR_CUTOFF = %d\n", x);
-
-  return 0;
-}
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/gen.pl b/libtommath/gen.pl
deleted file mode 100644
index 57f65ac..0000000
--- a/libtommath/gen.pl
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/perl -w
-#
-# Generates a "single file" you can use to quickly
-# add the whole source without any makefile troubles
-#
-use strict;
-
-open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
-foreach my $filename (glob "bn*.c") {
-   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
-   print OUT "/* Start: $filename */\n";
-   print OUT while <SRC>;
-   print OUT "\n/* End: $filename */\n\n";
-   close SRC or die "Error closing $filename after reading: $!";
-}
-print OUT "\n/* EOF */\n";
-close OUT or die "Error closing mpi.c after writing: $!";
-
-system('perl -pli -e "s/\s*$//" mpi.c');
diff --git a/libtommath/logs/README b/libtommath/logs/README
deleted file mode 100644
index 965e7c8..0000000
--- a/libtommath/logs/README
+++ /dev/null
@@ -1,13 +0,0 @@
-To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
-Todo this type 
-
-make timing ; ltmtest
-
-in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
-
-After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
-them all :-)
-
-Have fun
-
-Tom
-\ No newline at end of file
diff --git a/libtommath/logs/add.log b/libtommath/logs/add.log
deleted file mode 100644
index 43503ac..0000000
--- a/libtommath/logs/add.log
+++ /dev/null
@@ -1,16 +0,0 @@
-480        87
-960       111
-1440       135
-1920       159
-2400       200
-2880       224
-3360       248
-3840       272
-4320       296
-4800       320
-5280       344
-5760       368
-6240       392
-6720       416
-7200       440
-7680       464
diff --git a/libtommath/logs/addsub.png b/libtommath/logs/addsub.png
deleted file mode 100644
index 441c7b2..0000000
--- a/libtommath/logs/addsub.png
+++ /dev/null
diff --git a/libtommath/logs/expt.log b/libtommath/logs/expt.log
deleted file mode 100644
index 70932ab..0000000
--- a/libtommath/logs/expt.log
+++ /dev/null
@@ -1,7 +0,0 @@
-513   1435869
-769   3544970
-1025   7791638
-2049  46902238
-2561  85334899
-3073 141451412
-4097 308770310
diff --git a/libtommath/logs/expt.png b/libtommath/logs/expt.png
deleted file mode 100644
index d779cc5..0000000
--- a/libtommath/logs/expt.png
+++ /dev/null
diff --git a/libtommath/logs/expt_2k.log b/libtommath/logs/expt_2k.log
deleted file mode 100644
index 97d325f..0000000
--- a/libtommath/logs/expt_2k.log
+++ /dev/null
@@ -1,5 +0,0 @@
-607   2109225
-1279  10148314
-2203  34126877
-3217  82716424
-4253 161569606
diff --git a/libtommath/logs/expt_2kl.log b/libtommath/logs/expt_2kl.log
deleted file mode 100644
index d9ad4be..0000000
--- a/libtommath/logs/expt_2kl.log
+++ /dev/null
@@ -1,4 +0,0 @@
-1024   7705271
-2048  34286851
-4096 165207491
-521   1618631
diff --git a/libtommath/logs/expt_dr.log b/libtommath/logs/expt_dr.log
deleted file mode 100644
index c6bbe07..0000000
--- a/libtommath/logs/expt_dr.log
+++ /dev/null
@@ -1,7 +0,0 @@
-532   1928550
-784   3763908
-1036   7564221
-1540  16566059
-2072  32283784
-3080  79851565
-4116 157843530
diff --git a/libtommath/logs/graphs.dem b/libtommath/logs/graphs.dem
deleted file mode 100644
index dfaf613..0000000
--- a/libtommath/logs/graphs.dem
+++ /dev/null
@@ -1,17 +0,0 @@
-set terminal png
-set size 1.75
-set ylabel "Cycles per Operation"
-set xlabel "Operand size (bits)"
-
-set output "addsub.png"
-plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
-
-set output "mult.png"
-plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
-
-set output "expt.png"
-plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
-
-set output "invmod.png"
-plot 'invmod.log' smooth bezier title "Modular Inverse"
-
diff --git a/libtommath/logs/index.html b/libtommath/logs/index.html
deleted file mode 100644
index 8c1ed9d..0000000
--- a/libtommath/logs/index.html
+++ /dev/null
@@ -1,24 +0,0 @@
-<html>
-<head>
-<title>LibTomMath Log Plots</title>
-</head>
-<body>
-
-<h1>Addition and Subtraction</h1>
-<center><img src=addsub.png></center>
-<hr>
-
-<h1>Multipliers</h1>
-<center><img src=mult.png></center>
-<hr>
-
-<h1>Exptmod</h1>
-<center><img src=expt.png></center>
-<hr>
-
-<h1>Modular Inverse</h1>
-<center><img src=invmod.png></center>
-<hr>
-
-</body>
-</html>
diff --git a/libtommath/logs/invmod.log b/libtommath/logs/invmod.log
deleted file mode 100644
index e69de29..0000000
--- a/libtommath/logs/invmod.log
+++ /dev/null
diff --git a/libtommath/logs/invmod.png b/libtommath/logs/invmod.png
deleted file mode 100644
index 9dcd7d8..0000000
--- a/libtommath/logs/invmod.png
+++ /dev/null
diff --git a/libtommath/logs/mult.log b/libtommath/logs/mult.log
deleted file mode 100644
index 33563fc..0000000
--- a/libtommath/logs/mult.log
+++ /dev/null
@@ -1,84 +0,0 @@
-271       555
-390       855
-508      1161
-631      1605
-749      2117
-871      2687
-991      3329
-1108      4084
-1231      4786
-1351      5624
-1470      6392
-1586      7364
-1710      8218
-1830      9255
-1951     10217
-2067     11461
-2191     12463
-2308     13677
-2430     14800
-2551     16232
-2671     17460
-2791     18899
-2902     20247
-3028     21902
-3151     23240
-3267     24927
-3391     26441
-3511     28277
-3631     29838
-3749     31751
-3869     33673
-3989     35431
-4111     37518
-4231     39426
-4349     41504
-4471     43567
-4591     45786
-4711     47876
-4831     50299
-4951     52427
-5071     54785
-5189     57241
-5307     59730
-5431     62194
-5551     64761
-5670     67322
-5789     70073
-5907     72663
-6030     75437
-6151     78242
-6268     81202
-6389     83948
-6509     86985
-6631     89903
-6747     93184
-6869     96044
-6991     99286
-7109    102395
-7229    105917
-7351    108940
-7470    112490
-7589    115702
-7711    119508
-7831    122632
-7951    126410
-8071    129808
-8190    133895
-8311    137146
-8431    141218
-8549    144732
-8667    149131
-8790    152462
-8911    156754
-9030    160479
-9149    165138
-9271    168601
-9391    173185
-9511    176988
-9627    181976
-9751    185539
-9870    190388
-9991    194335
-10110    199605
-10228    203298
diff --git a/libtommath/logs/mult.png b/libtommath/logs/mult.png
deleted file mode 100644
index d22e8c8..0000000
--- a/libtommath/logs/mult.png
+++ /dev/null
diff --git a/libtommath/logs/mult_kara.log b/libtommath/logs/mult_kara.log
deleted file mode 100644
index 7136c79..0000000
--- a/libtommath/logs/mult_kara.log
+++ /dev/null
@@ -1,84 +0,0 @@
-271       560
-391       870
-511      1159
-631      1605
-750      2111
-871      2737
-991      3361
-1111      4054
-1231      4778
-1351      5600
-1471      6404
-1591      7323
-1710      8255
-1831      9239
-1948     10257
-2070     11397
-2190     12531
-2308     13665
-2429     14870
-2550     16175
-2671     17539
-2787     18879
-2911     20350
-3031     21807
-3150     23415
-3270     24897
-3388     26567
-3511     28205
-3627     30076
-3751     31744
-3869     33657
-3991     35425
-4111     37522
-4229     39363
-4351     41503
-4470     43491
-4590     45827
-4711     47795
-4828     50166
-4951     52318
-5070     54911
-5191     57036
-5308     58237
-5431     60248
-5551     62678
-5671     64786
-5791     67294
-5908     69343
-6031     71607
-6151     74166
-6271     76590
-6391     78734
-6511     81175
-6631     83742
-6750     86403
-6868     88873
-6990     91150
-7110     94211
-7228     96922
-7351     99445
-7469    102216
-7589    104968
-7711    108113
-7827    110758
-7950    113714
-8071    116511
-8186    119643
-8310    122679
-8425    125581
-8551    128715
-8669    131778
-8788    135116
-8910    138138
-9031    141628
-9148    144754
-9268    148367
-9391    151551
-9511    155033
-9631    158652
-9751    162125
-9871    165248
-9988    168627
-10111    172427
-10231    176412
diff --git a/libtommath/logs/sqr.log b/libtommath/logs/sqr.log
deleted file mode 100644
index cd29fc5..0000000
--- a/libtommath/logs/sqr.log
+++ /dev/null
@@ -1,84 +0,0 @@
-265       562
-389       882
-509      1207
-631      1572
-750      1990
-859      2433
-991      2894
-1109      3555
-1230      4228
-1350      5018
-1471      5805
-1591      6579
-1709      7415
-1829      8329
-1949      9225
-2071     10139
-2188     11239
-2309     12178
-2431     13212
-2551     14294
-2671     15551
-2791     16512
-2911     17718
-3030     18876
-3150     20259
-3270     21374
-3391     22650
-3511     23948
-3631     25493
-3750     26756
-3870     28225
-3989     29705
-4110     31409
-4230     32834
-4351     34327
-4471     35818
-4591     37636
-4711     39228
-4830     40868
-4949     42393
-5070     44541
-5191     46269
-5310     48162
-5429     49728
-5548     51985
-5671     53948
-5791     55885
-5910     57584
-6031     60082
-6150     62239
-6270     64309
-6390     66014
-6511     68766
-6631     71012
-6750     73172
-6871     74952
-6991     77909
-7111     80371
-7231     82666
-7351     84531
-7469     87698
-7589     90318
-7711    225384
-7830    232428
-7950    240009
-8070    246522
-8190    253662
-8310    260961
-8431    269253
-8549    275743
-8671    283769
-8789    290811
-8911    300034
-9030    306873
-9149    315085
-9270    323944
-9390    332390
-9508    337519
-9631    348986
-9749    356904
-9871    367013
-9989    373831
-10108    381033
-10230    393475
diff --git a/libtommath/logs/sqr_kara.log b/libtommath/logs/sqr_kara.log
deleted file mode 100644
index 06355a7..0000000
--- a/libtommath/logs/sqr_kara.log
+++ /dev/null
@@ -1,84 +0,0 @@
-271       560
-388       878
-511      1179
-629      1625
-751      1988
-871      2423
-989      2896
-1111      3561
-1231      4209
-1350      5015
-1470      5804
-1591      6556
-1709      7420
-1831      8263
-1951      9173
-2070     10153
-2191     11229
-2310     12167
-2431     13211
-2550     14309
-2671     15524
-2788     16525
-2910     17712
-3028     18822
-3148     20220
-3271     21343
-3391     22652
-3511     23944
-3630     25485
-3750     26778
-3868     28201
-3990     29653
-4111     31393
-4225     32841
-4350     34328
-4471     35786
-4590     37652
-4711     39245
-4830     40876
-4951     42433
-5068     44547
-5191     46321
-5311     48140
-5430     49727
-5550     52034
-5671     53954
-5791     55921
-5908     57597
-6031     60084
-6148     62226
-6270     64295
-6390     66045
-6511     68779
-6629     71003
-6751     73169
-6871     74992
-6991     77895
-7110     80376
-7231     82628
-7351     84468
-7470     87664
-7591     90284
-7711     91352
-7828     93995
-7950     96276
-8071     98691
-8190    101256
-8308    103631
-8431    105222
-8550    108343
-8671    110281
-8787    112764
-8911    115397
-9031    117690
-9151    120266
-9271    122715
-9391    124624
-9510    127937
-9630    130313
-9750    132914
-9871    136129
-9991    138517
-10108    141525
-10231    144225
diff --git a/libtommath/logs/sub.log b/libtommath/logs/sub.log
deleted file mode 100644
index 9f84fa2..0000000
--- a/libtommath/logs/sub.log
+++ /dev/null
@@ -1,16 +0,0 @@
-480        94
-960       116
-1440       140
-1920       164
-2400       205
-2880       229
-3360       253
-3840       277
-4320       299
-4800       321
-5280       345
-5760       371
-6240       395
-6720       419
-7200       441
-7680       465
diff --git a/libtommath/mess.sh b/libtommath/mess.sh
deleted file mode 100644
index bf639ce..0000000
--- a/libtommath/mess.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-if cvs log $1 >/dev/null 2>/dev/null; then exit 0; else echo "$1 shouldn't be here" ; exit 1; fi
-
-
diff --git a/libtommath/mtest/logtab.h b/libtommath/mtest/logtab.h
deleted file mode 100644
index 751111e..0000000
--- a/libtommath/mtest/logtab.h
+++ /dev/null
@@ -1,24 +0,0 @@
-const float s_logv_2[] = {
-   0.000000000, 0.000000000, 1.000000000, 0.630929754, 	/*  0  1  2  3 */
-   0.500000000, 0.430676558, 0.386852807, 0.356207187, 	/*  4  5  6  7 */
-   0.333333333, 0.315464877, 0.301029996, 0.289064826, 	/*  8  9 10 11 */
-   0.278942946, 0.270238154, 0.262649535, 0.255958025, 	/* 12 13 14 15 */
-   0.250000000, 0.244650542, 0.239812467, 0.235408913, 	/* 16 17 18 19 */
-   0.231378213, 0.227670249, 0.224243824, 0.221064729, 	/* 20 21 22 23 */
-   0.218104292, 0.215338279, 0.212746054, 0.210309918, 	/* 24 25 26 27 */
-   0.208014598, 0.205846832, 0.203795047, 0.201849087, 	/* 28 29 30 31 */
-   0.200000000, 0.198239863, 0.196561632, 0.194959022, 	/* 32 33 34 35 */
-   0.193426404, 0.191958720, 0.190551412, 0.189200360, 	/* 36 37 38 39 */
-   0.187901825, 0.186652411, 0.185449023, 0.184288833, 	/* 40 41 42 43 */
-   0.183169251, 0.182087900, 0.181042597, 0.180031327, 	/* 44 45 46 47 */
-   0.179052232, 0.178103594, 0.177183820, 0.176291434, 	/* 48 49 50 51 */
-   0.175425064, 0.174583430, 0.173765343, 0.172969690, 	/* 52 53 54 55 */
-   0.172195434, 0.171441601, 0.170707280, 0.169991616, 	/* 56 57 58 59 */
-   0.169293808, 0.168613099, 0.167948779, 0.167300179, 	/* 60 61 62 63 */
-   0.166666667
-};
-
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/mtest/mpi-config.h b/libtommath/mtest/mpi-config.h
deleted file mode 100644
index fc2a885..0000000
--- a/libtommath/mtest/mpi-config.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Default configuration for MPI library */
-/* $Id$ */
-
-#ifndef MPI_CONFIG_H_
-#define MPI_CONFIG_H_
-
-/*
-  For boolean options, 
-  0 = no
-  1 = yes
-
-  Other options are documented individually.
-
- */
-
-#ifndef MP_IOFUNC
-#define MP_IOFUNC     0  /* include mp_print() ?                */
-#endif
-
-#ifndef MP_MODARITH
-#define MP_MODARITH   1  /* include modular arithmetic ?        */
-#endif
-
-#ifndef MP_NUMTH
-#define MP_NUMTH      1  /* include number theoretic functions? */
-#endif
-
-#ifndef MP_LOGTAB
-#define MP_LOGTAB     1  /* use table of logs instead of log()? */
-#endif
-
-#ifndef MP_MEMSET
-#define MP_MEMSET     1  /* use memset() to zero buffers?       */
-#endif
-
-#ifndef MP_MEMCPY
-#define MP_MEMCPY     1  /* use memcpy() to copy buffers?       */
-#endif
-
-#ifndef MP_CRYPTO
-#define MP_CRYPTO     1  /* erase memory on free?               */
-#endif
-
-#ifndef MP_ARGCHK
-/*
-  0 = no parameter checks
-  1 = runtime checks, continue execution and return an error to caller
-  2 = assertions; dump core on parameter errors
- */
-#define MP_ARGCHK     2  /* how to check input arguments        */
-#endif
-
-#ifndef MP_DEBUG
-#define MP_DEBUG      0  /* print diagnostic output?            */
-#endif
-
-#ifndef MP_DEFPREC
-#define MP_DEFPREC    64 /* default precision, in digits        */
-#endif
-
-#ifndef MP_MACRO
-#define MP_MACRO      1  /* use macros for frequent calls?      */
-#endif
-
-#ifndef MP_SQUARE
-#define MP_SQUARE     1  /* use separate squaring code?         */
-#endif
-
-#ifndef MP_PTAB_SIZE
-/*
-  When building mpprime.c, we build in a table of small prime
-  values to use for primality testing.  The more you include,
-  the more space they take up.  See primes.c for the possible
-  values (currently 16, 32, 64, 128, 256, and 6542)
- */
-#define MP_PTAB_SIZE  128  /* how many built-in primes?         */
-#endif
-
-#ifndef MP_COMPAT_MACROS
-#define MP_COMPAT_MACROS 1   /* define compatibility macros?    */
-#endif
-
-#endif /* ifndef MPI_CONFIG_H_ */
-
-
-/* crc==3287762869, version==2, Sat Feb 02 06:43:53 2002 */
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/mtest/mpi-types.h b/libtommath/mtest/mpi-types.h
deleted file mode 100644
index f99d7ee..0000000
--- a/libtommath/mtest/mpi-types.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Type definitions generated by 'types.pl' */
-typedef char               mp_sign;
-typedef unsigned short     mp_digit;  /* 2 byte type */
-typedef unsigned int       mp_word;   /* 4 byte type */
-typedef unsigned int       mp_size;
-typedef int                mp_err;
-
-#define MP_DIGIT_BIT       (CHAR_BIT*sizeof(mp_digit))
-#define MP_DIGIT_MAX       USHRT_MAX
-#define MP_WORD_BIT        (CHAR_BIT*sizeof(mp_word))
-#define MP_WORD_MAX        UINT_MAX
-
-#define MP_DIGIT_SIZE      2
-#define DIGIT_FMT          "%04X"
-#define RADIX              (MP_DIGIT_MAX+1)
-
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/mtest/mpi.c b/libtommath/mtest/mpi.c
deleted file mode 100644
index 567b12d..0000000
--- a/libtommath/mtest/mpi.c
+++ /dev/null
@@ -1,3985 +0,0 @@
-/*
-    mpi.c
-
-    by Michael J. Fromberger <sting@linguist.dartmouth.edu>
-    Copyright (C) 1998 Michael J. Fromberger, All Rights Reserved
-
-    Arbitrary precision integer arithmetic library
-
-    $Id$
- */
-
-#include "mpi.h"
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-
-#if MP_DEBUG
-#include <stdio.h>
-
-#define DIAG(T,V) {fprintf(stderr,T);mp_print(V,stderr);fputc('\n',stderr);}
-#else
-#define DIAG(T,V)
-#endif
-
-/*
-   If MP_LOGTAB is not defined, use the math library to compute the
-   logarithms on the fly.  Otherwise, use the static table below.
-   Pick which works best for your system.
- */
-#if MP_LOGTAB
-
-/* {{{ s_logv_2[] - log table for 2 in various bases */
-
-/*
-  A table of the logs of 2 for various bases (the 0 and 1 entries of
-  this table are meaningless and should not be referenced).
-
-  This table is used to compute output lengths for the mp_toradix()
-  function.  Since a number n in radix r takes up about log_r(n)
-  digits, we estimate the output size by taking the least integer
-  greater than log_r(n), where:
-
-  log_r(n) = log_2(n) * log_r(2)
-
-  This table, therefore, is a table of log_r(2) for 2 <= r <= 36,
-  which are the output bases supported.
- */
-
-#include "logtab.h"
-
-/* }}} */
-#define LOG_V_2(R)  s_logv_2[(R)]
-
-#else
-
-#include <math.h>
-#define LOG_V_2(R)  (log(2.0)/log(R))
-
-#endif
-
-/* Default precision for newly created mp_int's      */
-static unsigned int s_mp_defprec = MP_DEFPREC;
-
-/* {{{ Digit arithmetic macros */
-
-/*
-  When adding and multiplying digits, the results can be larger than
-  can be contained in an mp_digit.  Thus, an mp_word is used.  These
-  macros mask off the upper and lower digits of the mp_word (the
-  mp_word may be more than 2 mp_digits wide, but we only concern
-  ourselves with the low-order 2 mp_digits)
-
-  If your mp_word DOES have more than 2 mp_digits, you need to
-  uncomment the first line, and comment out the second.
- */
-
-/* #define  CARRYOUT(W)  (((W)>>DIGIT_BIT)&MP_DIGIT_MAX) */
-#define  CARRYOUT(W)  ((W)>>DIGIT_BIT)
-#define  ACCUM(W)     ((W)&MP_DIGIT_MAX)
-
-/* }}} */
-
-/* {{{ Comparison constants */
-
-#define  MP_LT       -1
-#define  MP_EQ        0
-#define  MP_GT        1
-
-/* }}} */
-
-/* {{{ Constant strings */
-
-/* Constant strings returned by mp_strerror() */
-static const char *const mp_err_string[] = {
-  "unknown result code",     /* say what?            */
-  "boolean true",            /* MP_OKAY, MP_YES      */
-  "boolean false",           /* MP_NO                */
-  "out of memory",           /* MP_MEM               */
-  "argument out of range",   /* MP_RANGE             */
-  "invalid input parameter", /* MP_BADARG            */
-  "result is undefined"      /* MP_UNDEF             */
-};
-
-/* Value to digit maps for radix conversion   */
-
-/* s_dmap_1 - standard digits and letters */
-static const char *s_dmap_1 =
-  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
-
-#if 0
-/* s_dmap_2 - base64 ordering for digits  */
-static const char *s_dmap_2 =
-  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-#endif
-
-/* }}} */
-
-/* {{{ Static function declarations */
-
-/*
-   If MP_MACRO is false, these will be defined as actual functions;
-   otherwise, suitable macro definitions will be used.  This works
-   around the fact that ANSI C89 doesn't support an 'inline' keyword
-   (although I hear C9x will ... about bloody time).  At present, the
-   macro definitions are identical to the function bodies, but they'll
-   expand in place, instead of generating a function call.
-
-   I chose these particular functions to be made into macros because
-   some profiling showed they are called a lot on a typical workload,
-   and yet they are primarily housekeeping.
- */
-#if MP_MACRO == 0
- void     s_mp_setz(mp_digit *dp, mp_size count); /* zero digits           */
- void     s_mp_copy(mp_digit *sp, mp_digit *dp, mp_size count); /* copy    */
- void    *s_mp_alloc(size_t nb, size_t ni);       /* general allocator     */
- void     s_mp_free(void *ptr);                   /* general free function */
-#else
-
- /* Even if these are defined as macros, we need to respect the settings
-    of the MP_MEMSET and MP_MEMCPY configuration options...
-  */
- #if MP_MEMSET == 0
-  #define  s_mp_setz(dp, count) \
-       {int ix;for(ix=0;ix<(count);ix++)(dp)[ix]=0;}
- #else
-  #define  s_mp_setz(dp, count) memset(dp, 0, (count) * sizeof(mp_digit))
- #endif /* MP_MEMSET */
-
- #if MP_MEMCPY == 0
-  #define  s_mp_copy(sp, dp, count) \
-       {int ix;for(ix=0;ix<(count);ix++)(dp)[ix]=(sp)[ix];}
- #else
-  #define  s_mp_copy(sp, dp, count) memcpy(dp, sp, (count) * sizeof(mp_digit))
- #endif /* MP_MEMCPY */
-
- #define  s_mp_alloc(nb, ni)  calloc(nb, ni)
- #define  s_mp_free(ptr) {if(ptr) free(ptr);}
-#endif /* MP_MACRO */
-
-mp_err   s_mp_grow(mp_int *mp, mp_size min);   /* increase allocated size */
-mp_err   s_mp_pad(mp_int *mp, mp_size min);    /* left pad with zeroes    */
-
-void     s_mp_clamp(mp_int *mp);               /* clip leading zeroes     */
-
-void     s_mp_exch(mp_int *a, mp_int *b);      /* swap a and b in place   */
-
-mp_err   s_mp_lshd(mp_int *mp, mp_size p);     /* left-shift by p digits  */
-void     s_mp_rshd(mp_int *mp, mp_size p);     /* right-shift by p digits */
-void     s_mp_div_2d(mp_int *mp, mp_digit d);  /* divide by 2^d in place  */
-void     s_mp_mod_2d(mp_int *mp, mp_digit d);  /* modulo 2^d in place     */
-mp_err   s_mp_mul_2d(mp_int *mp, mp_digit d);  /* multiply by 2^d in place*/
-void     s_mp_div_2(mp_int *mp);               /* divide by 2 in place    */
-mp_err   s_mp_mul_2(mp_int *mp);               /* multiply by 2 in place  */
-mp_digit s_mp_norm(mp_int *a, mp_int *b);      /* normalize for division  */
-mp_err   s_mp_add_d(mp_int *mp, mp_digit d);   /* unsigned digit addition */
-mp_err   s_mp_sub_d(mp_int *mp, mp_digit d);   /* unsigned digit subtract */
-mp_err   s_mp_mul_d(mp_int *mp, mp_digit d);   /* unsigned digit multiply */
-mp_err   s_mp_div_d(mp_int *mp, mp_digit d, mp_digit *r);
-		                               /* unsigned digit divide   */
-mp_err   s_mp_reduce(mp_int *x, mp_int *m, mp_int *mu);
-                                               /* Barrett reduction       */
-mp_err   s_mp_add(mp_int *a, mp_int *b);       /* magnitude addition      */
-mp_err   s_mp_sub(mp_int *a, mp_int *b);       /* magnitude subtract      */
-mp_err   s_mp_mul(mp_int *a, mp_int *b);       /* magnitude multiply      */
-#if 0
-void     s_mp_kmul(mp_digit *a, mp_digit *b, mp_digit *out, mp_size len);
-                                               /* multiply buffers in place */
-#endif
-#if MP_SQUARE
-mp_err   s_mp_sqr(mp_int *a);                  /* magnitude square        */
-#else
-#define  s_mp_sqr(a) s_mp_mul(a, a)
-#endif
-mp_err   s_mp_div(mp_int *a, mp_int *b);       /* magnitude divide        */
-mp_err   s_mp_2expt(mp_int *a, mp_digit k);    /* a = 2^k                 */
-int      s_mp_cmp(mp_int *a, mp_int *b);       /* magnitude comparison    */
-int      s_mp_cmp_d(mp_int *a, mp_digit d);    /* magnitude digit compare */
-int      s_mp_ispow2(mp_int *v);               /* is v a power of 2?      */
-int      s_mp_ispow2d(mp_digit d);             /* is d a power of 2?      */
-
-int      s_mp_tovalue(char ch, int r);          /* convert ch to value    */
-char     s_mp_todigit(int val, int r, int low); /* convert val to digit   */
-int      s_mp_outlen(int bits, int r);          /* output length in bytes */
-
-/* }}} */
-
-/* {{{ Default precision manipulation */
-
-unsigned int mp_get_prec(void)
-{
-  return s_mp_defprec;
-
-} /* end mp_get_prec() */
-
-void         mp_set_prec(unsigned int prec)
-{
-  if(prec == 0)
-    s_mp_defprec = MP_DEFPREC;
-  else
-    s_mp_defprec = prec;
-
-} /* end mp_set_prec() */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* {{{ mp_init(mp) */
-
-/*
-  mp_init(mp)
-
-  Initialize a new zero-valued mp_int.  Returns MP_OKAY if successful,
-  MP_MEM if memory could not be allocated for the structure.
- */
-
-mp_err mp_init(mp_int *mp)
-{
-  return mp_init_size(mp, s_mp_defprec);
-
-} /* end mp_init() */
-
-/* }}} */
-
-/* {{{ mp_init_array(mp[], count) */
-
-mp_err mp_init_array(mp_int mp[], int count)
-{
-  mp_err  res;
-  int     pos;
-
-  ARGCHK(mp !=NULL && count > 0, MP_BADARG);
-
-  for(pos = 0; pos < count; ++pos) {
-    if((res = mp_init(&mp[pos])) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  return MP_OKAY;
-
- CLEANUP:
-  while(--pos >= 0)
-    mp_clear(&mp[pos]);
-
-  return res;
-
-} /* end mp_init_array() */
-
-/* }}} */
-
-/* {{{ mp_init_size(mp, prec) */
-
-/*
-  mp_init_size(mp, prec)
-
-  Initialize a new zero-valued mp_int with at least the given
-  precision; returns MP_OKAY if successful, or MP_MEM if memory could
-  not be allocated for the structure.
- */
-
-mp_err mp_init_size(mp_int *mp, mp_size prec)
-{
-  ARGCHK(mp != NULL && prec > 0, MP_BADARG);
-
-  if((DIGITS(mp) = s_mp_alloc(prec, sizeof(mp_digit))) == NULL)
-    return MP_MEM;
-
-  SIGN(mp) = MP_ZPOS;
-  USED(mp) = 1;
-  ALLOC(mp) = prec;
-
-  return MP_OKAY;
-
-} /* end mp_init_size() */
-
-/* }}} */
-
-/* {{{ mp_init_copy(mp, from) */
-
-/*
-  mp_init_copy(mp, from)
-
-  Initialize mp as an exact copy of from.  Returns MP_OKAY if
-  successful, MP_MEM if memory could not be allocated for the new
-  structure.
- */
-
-mp_err mp_init_copy(mp_int *mp, mp_int *from)
-{
-  ARGCHK(mp != NULL && from != NULL, MP_BADARG);
-
-  if(mp == from)
-    return MP_OKAY;
-
-  if((DIGITS(mp) = s_mp_alloc(USED(from), sizeof(mp_digit))) == NULL)
-    return MP_MEM;
-
-  s_mp_copy(DIGITS(from), DIGITS(mp), USED(from));
-  USED(mp) = USED(from);
-  ALLOC(mp) = USED(from);
-  SIGN(mp) = SIGN(from);
-
-  return MP_OKAY;
-
-} /* end mp_init_copy() */
-
-/* }}} */
-
-/* {{{ mp_copy(from, to) */
-
-/*
-  mp_copy(from, to)
-
-  Copies the mp_int 'from' to the mp_int 'to'.  It is presumed that
-  'to' has already been initialized (if not, use mp_init_copy()
-  instead). If 'from' and 'to' are identical, nothing happens.
- */
-
-mp_err mp_copy(mp_int *from, mp_int *to)
-{
-  ARGCHK(from != NULL && to != NULL, MP_BADARG);
-
-  if(from == to)
-    return MP_OKAY;
-
-  { /* copy */
-    mp_digit   *tmp;
-
-    /*
-      If the allocated buffer in 'to' already has enough space to hold
-      all the used digits of 'from', we'll re-use it to avoid hitting
-      the memory allocater more than necessary; otherwise, we'd have
-      to grow anyway, so we just allocate a hunk and make the copy as
-      usual
-     */
-    if(ALLOC(to) >= USED(from)) {
-      s_mp_setz(DIGITS(to) + USED(from), ALLOC(to) - USED(from));
-      s_mp_copy(DIGITS(from), DIGITS(to), USED(from));
-
-    } else {
-      if((tmp = s_mp_alloc(USED(from), sizeof(mp_digit))) == NULL)
-	return MP_MEM;
-
-      s_mp_copy(DIGITS(from), tmp, USED(from));
-
-      if(DIGITS(to) != NULL) {
-#if MP_CRYPTO
-	s_mp_setz(DIGITS(to), ALLOC(to));
-#endif
-	s_mp_free(DIGITS(to));
-      }
-
-      DIGITS(to) = tmp;
-      ALLOC(to) = USED(from);
-    }
-
-    /* Copy the precision and sign from the original */
-    USED(to) = USED(from);
-    SIGN(to) = SIGN(from);
-  } /* end copy */
-
-  return MP_OKAY;
-
-} /* end mp_copy() */
-
-/* }}} */
-
-/* {{{ mp_exch(mp1, mp2) */
-
-/*
-  mp_exch(mp1, mp2)
-
-  Exchange mp1 and mp2 without allocating any intermediate memory
-  (well, unless you count the stack space needed for this call and the
-  locals it creates...).  This cannot fail.
- */
-
-void mp_exch(mp_int *mp1, mp_int *mp2)
-{
-#if MP_ARGCHK == 2
-  assert(mp1 != NULL && mp2 != NULL);
-#else
-  if(mp1 == NULL || mp2 == NULL)
-    return;
-#endif
-
-  s_mp_exch(mp1, mp2);
-
-} /* end mp_exch() */
-
-/* }}} */
-
-/* {{{ mp_clear(mp) */
-
-/*
-  mp_clear(mp)
-
-  Release the storage used by an mp_int, and void its fields so that
-  if someone calls mp_clear() again for the same int later, we won't
-  get tollchocked.
- */
-
-void   mp_clear(mp_int *mp)
-{
-  if(mp == NULL)
-    return;
-
-  if(DIGITS(mp) != NULL) {
-#if MP_CRYPTO
-    s_mp_setz(DIGITS(mp), ALLOC(mp));
-#endif
-    s_mp_free(DIGITS(mp));
-    DIGITS(mp) = NULL;
-  }
-
-  USED(mp) = 0;
-  ALLOC(mp) = 0;
-
-} /* end mp_clear() */
-
-/* }}} */
-
-/* {{{ mp_clear_array(mp[], count) */
-
-void   mp_clear_array(mp_int mp[], int count)
-{
-  ARGCHK(mp != NULL && count > 0, MP_BADARG);
-
-  while(--count >= 0)
-    mp_clear(&mp[count]);
-
-} /* end mp_clear_array() */
-
-/* }}} */
-
-/* {{{ mp_zero(mp) */
-
-/*
-  mp_zero(mp)
-
-  Set mp to zero.  Does not change the allocated size of the structure,
-  and therefore cannot fail (except on a bad argument, which we ignore)
- */
-void   mp_zero(mp_int *mp)
-{
-  if(mp == NULL)
-    return;
-
-  s_mp_setz(DIGITS(mp), ALLOC(mp));
-  USED(mp) = 1;
-  SIGN(mp) = MP_ZPOS;
-
-} /* end mp_zero() */
-
-/* }}} */
-
-/* {{{ mp_set(mp, d) */
-
-void   mp_set(mp_int *mp, mp_digit d)
-{
-  if(mp == NULL)
-    return;
-
-  mp_zero(mp);
-  DIGIT(mp, 0) = d;
-
-} /* end mp_set() */
-
-/* }}} */
-
-/* {{{ mp_set_int(mp, z) */
-
-mp_err mp_set_int(mp_int *mp, long z)
-{
-  int            ix;
-  unsigned long  v = abs(z);
-  mp_err         res;
-
-  ARGCHK(mp != NULL, MP_BADARG);
-
-  mp_zero(mp);
-  if(z == 0)
-    return MP_OKAY;  /* shortcut for zero */
-
-  for(ix = sizeof(long) - 1; ix >= 0; ix--) {
-
-    if((res = s_mp_mul_2d(mp, CHAR_BIT)) != MP_OKAY)
-      return res;
-
-    res = s_mp_add_d(mp,
-		     (mp_digit)((v >> (ix * CHAR_BIT)) & UCHAR_MAX));
-    if(res != MP_OKAY)
-      return res;
-
-  }
-
-  if(z < 0)
-    SIGN(mp) = MP_NEG;
-
-  return MP_OKAY;
-
-} /* end mp_set_int() */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* {{{ Digit arithmetic */
-
-/* {{{ mp_add_d(a, d, b) */
-
-/*
-  mp_add_d(a, d, b)
-
-  Compute the sum b = a + d, for a single digit d.  Respects the sign of
-  its primary addend (single digits are unsigned anyway).
- */
-
-mp_err mp_add_d(mp_int *a, mp_digit d, mp_int *b)
-{
-  mp_err   res = MP_OKAY;
-
-  ARGCHK(a != NULL && b != NULL, MP_BADARG);
-
-  if((res = mp_copy(a, b)) != MP_OKAY)
-    return res;
-
-  if(SIGN(b) == MP_ZPOS) {
-    res = s_mp_add_d(b, d);
-  } else if(s_mp_cmp_d(b, d) >= 0) {
-    res = s_mp_sub_d(b, d);
-  } else {
-    SIGN(b) = MP_ZPOS;
-
-    DIGIT(b, 0) = d - DIGIT(b, 0);
-  }
-
-  return res;
-
-} /* end mp_add_d() */
-
-/* }}} */
-
-/* {{{ mp_sub_d(a, d, b) */
-
-/*
-  mp_sub_d(a, d, b)
-
-  Compute the difference b = a - d, for a single digit d.  Respects the
-  sign of its subtrahend (single digits are unsigned anyway).
- */
-
-mp_err mp_sub_d(mp_int *a, mp_digit d, mp_int *b)
-{
-  mp_err   res;
-
-  ARGCHK(a != NULL && b != NULL, MP_BADARG);
-
-  if((res = mp_copy(a, b)) != MP_OKAY)
-    return res;
-
-  if(SIGN(b) == MP_NEG) {
-    if((res = s_mp_add_d(b, d)) != MP_OKAY)
-      return res;
-
-  } else if(s_mp_cmp_d(b, d) >= 0) {
-    if((res = s_mp_sub_d(b, d)) != MP_OKAY)
-      return res;
-
-  } else {
-    mp_neg(b, b);
-
-    DIGIT(b, 0) = d - DIGIT(b, 0);
-    SIGN(b) = MP_NEG;
-  }
-
-  if(s_mp_cmp_d(b, 0) == 0)
-    SIGN(b) = MP_ZPOS;
-
-  return MP_OKAY;
-
-} /* end mp_sub_d() */
-
-/* }}} */
-
-/* {{{ mp_mul_d(a, d, b) */
-
-/*
-  mp_mul_d(a, d, b)
-
-  Compute the product b = a * d, for a single digit d.  Respects the sign
-  of its multiplicand (single digits are unsigned anyway)
- */
-
-mp_err mp_mul_d(mp_int *a, mp_digit d, mp_int *b)
-{
-  mp_err  res;
-
-  ARGCHK(a != NULL && b != NULL, MP_BADARG);
-
-  if(d == 0) {
-    mp_zero(b);
-    return MP_OKAY;
-  }
-
-  if((res = mp_copy(a, b)) != MP_OKAY)
-    return res;
-
-  res = s_mp_mul_d(b, d);
-
-  return res;
-
-} /* end mp_mul_d() */
-
-/* }}} */
-
-/* {{{ mp_mul_2(a, c) */
-
-mp_err mp_mul_2(mp_int *a, mp_int *c)
-{
-  mp_err  res;
-
-  ARGCHK(a != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_copy(a, c)) != MP_OKAY)
-    return res;
-
-  return s_mp_mul_2(c);
-
-} /* end mp_mul_2() */
-
-/* }}} */
-
-/* {{{ mp_div_d(a, d, q, r) */
-
-/*
-  mp_div_d(a, d, q, r)
-
-  Compute the quotient q = a / d and remainder r = a mod d, for a
-  single digit d.  Respects the sign of its divisor (single digits are
-  unsigned anyway).
- */
-
-mp_err mp_div_d(mp_int *a, mp_digit d, mp_int *q, mp_digit *r)
-{
-  mp_err   res;
-  mp_digit rem;
-  int      pow;
-
-  ARGCHK(a != NULL, MP_BADARG);
-
-  if(d == 0)
-    return MP_RANGE;
-
-  /* Shortcut for powers of two ... */
-  if((pow = s_mp_ispow2d(d)) >= 0) {
-    mp_digit  mask;
-
-    mask = (1 << pow) - 1;
-    rem = DIGIT(a, 0) & mask;
-
-    if(q) {
-      mp_copy(a, q);
-      s_mp_div_2d(q, pow);
-    }
-
-    if(r)
-      *r = rem;
-
-    return MP_OKAY;
-  }
-
-  /*
-    If the quotient is actually going to be returned, we'll try to
-    avoid hitting the memory allocator by copying the dividend into it
-    and doing the division there.  This can't be any _worse_ than
-    always copying, and will sometimes be better (since it won't make
-    another copy)
-
-    If it's not going to be returned, we need to allocate a temporary
-    to hold the quotient, which will just be discarded.
-   */
-  if(q) {
-    if((res = mp_copy(a, q)) != MP_OKAY)
-      return res;
-
-    res = s_mp_div_d(q, d, &rem);
-    if(s_mp_cmp_d(q, 0) == MP_EQ)
-      SIGN(q) = MP_ZPOS;
-
-  } else {
-    mp_int  qp;
-
-    if((res = mp_init_copy(&qp, a)) != MP_OKAY)
-      return res;
-
-    res = s_mp_div_d(&qp, d, &rem);
-    if(s_mp_cmp_d(&qp, 0) == 0)
-      SIGN(&qp) = MP_ZPOS;
-
-    mp_clear(&qp);
-  }
-
-  if(r)
-    *r = rem;
-
-  return res;
-
-} /* end mp_div_d() */
-
-/* }}} */
-
-/* {{{ mp_div_2(a, c) */
-
-/*
-  mp_div_2(a, c)
-
-  Compute c = a / 2, disregarding the remainder.
- */
-
-mp_err mp_div_2(mp_int *a, mp_int *c)
-{
-  mp_err  res;
-
-  ARGCHK(a != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_copy(a, c)) != MP_OKAY)
-    return res;
-
-  s_mp_div_2(c);
-
-  return MP_OKAY;
-
-} /* end mp_div_2() */
-
-/* }}} */
-
-/* {{{ mp_expt_d(a, d, b) */
-
-mp_err mp_expt_d(mp_int *a, mp_digit d, mp_int *c)
-{
-  mp_int   s, x;
-  mp_err   res;
-
-  ARGCHK(a != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_init(&s)) != MP_OKAY)
-    return res;
-  if((res = mp_init_copy(&x, a)) != MP_OKAY)
-    goto X;
-
-  DIGIT(&s, 0) = 1;
-
-  while(d != 0) {
-    if(d & 1) {
-      if((res = s_mp_mul(&s, &x)) != MP_OKAY)
-	goto CLEANUP;
-    }
-
-    d >>= 1;
-
-    if((res = s_mp_sqr(&x)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  s_mp_exch(&s, c);
-
-CLEANUP:
-  mp_clear(&x);
-X:
-  mp_clear(&s);
-
-  return res;
-
-} /* end mp_expt_d() */
-
-/* }}} */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* {{{ Full arithmetic */
-
-/* {{{ mp_abs(a, b) */
-
-/*
-  mp_abs(a, b)
-
-  Compute b = |a|.  'a' and 'b' may be identical.
- */
-
-mp_err mp_abs(mp_int *a, mp_int *b)
-{
-  mp_err   res;
-
-  ARGCHK(a != NULL && b != NULL, MP_BADARG);
-
-  if((res = mp_copy(a, b)) != MP_OKAY)
-    return res;
-
-  SIGN(b) = MP_ZPOS;
-
-  return MP_OKAY;
-
-} /* end mp_abs() */
-
-/* }}} */
-
-/* {{{ mp_neg(a, b) */
-
-/*
-  mp_neg(a, b)
-
-  Compute b = -a.  'a' and 'b' may be identical.
- */
-
-mp_err mp_neg(mp_int *a, mp_int *b)
-{
-  mp_err   res;
-
-  ARGCHK(a != NULL && b != NULL, MP_BADARG);
-
-  if((res = mp_copy(a, b)) != MP_OKAY)
-    return res;
-
-  if(s_mp_cmp_d(b, 0) == MP_EQ)
-    SIGN(b) = MP_ZPOS;
-  else
-    SIGN(b) = (SIGN(b) == MP_NEG) ? MP_ZPOS : MP_NEG;
-
-  return MP_OKAY;
-
-} /* end mp_neg() */
-
-/* }}} */
-
-/* {{{ mp_add(a, b, c) */
-
-/*
-  mp_add(a, b, c)
-
-  Compute c = a + b.  All parameters may be identical.
- */
-
-mp_err mp_add(mp_int *a, mp_int *b, mp_int *c)
-{
-  mp_err  res;
-  int     cmp;
-
-  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
-
-  if(SIGN(a) == SIGN(b)) { /* same sign:  add values, keep sign */
-
-    /* Commutativity of addition lets us do this in either order,
-       so we avoid having to use a temporary even if the result
-       is supposed to replace the output
-     */
-    if(c == b) {
-      if((res = s_mp_add(c, a)) != MP_OKAY)
-	return res;
-    } else {
-      if(c != a && (res = mp_copy(a, c)) != MP_OKAY)
-	return res;
-
-      if((res = s_mp_add(c, b)) != MP_OKAY)
-	return res;
-    }
-
-  } else if((cmp = s_mp_cmp(a, b)) > 0) {  /* different sign: a > b   */
-
-    /* If the output is going to be clobbered, we will use a temporary
-       variable; otherwise, we'll do it without touching the memory
-       allocator at all, if possible
-     */
-    if(c == b) {
-      mp_int  tmp;
-
-      if((res = mp_init_copy(&tmp, a)) != MP_OKAY)
-	return res;
-      if((res = s_mp_sub(&tmp, b)) != MP_OKAY) {
-	mp_clear(&tmp);
-	return res;
-      }
-
-      s_mp_exch(&tmp, c);
-      mp_clear(&tmp);
-
-    } else {
-
-      if(c != a && (res = mp_copy(a, c)) != MP_OKAY)
-	return res;
-      if((res = s_mp_sub(c, b)) != MP_OKAY)
-	return res;
-
-    }
-
-  } else if(cmp == 0) {             /* different sign, a == b   */
-
-    mp_zero(c);
-    return MP_OKAY;
-
-  } else {                          /* different sign: a < b    */
-
-    /* See above... */
-    if(c == a) {
-      mp_int  tmp;
-
-      if((res = mp_init_copy(&tmp, b)) != MP_OKAY)
-	return res;
-      if((res = s_mp_sub(&tmp, a)) != MP_OKAY) {
-	mp_clear(&tmp);
-	return res;
-      }
-
-      s_mp_exch(&tmp, c);
-      mp_clear(&tmp);
-
-    } else {
-
-      if(c != b && (res = mp_copy(b, c)) != MP_OKAY)
-	return res;
-      if((res = s_mp_sub(c, a)) != MP_OKAY)
-	return res;
-
-    }
-  }
-
-  if(USED(c) == 1 && DIGIT(c, 0) == 0)
-    SIGN(c) = MP_ZPOS;
-
-  return MP_OKAY;
-
-} /* end mp_add() */
-
-/* }}} */
-
-/* {{{ mp_sub(a, b, c) */
-
-/*
-  mp_sub(a, b, c)
-
-  Compute c = a - b.  All parameters may be identical.
- */
-
-mp_err mp_sub(mp_int *a, mp_int *b, mp_int *c)
-{
-  mp_err  res;
-  int     cmp;
-
-  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
-
-  if(SIGN(a) != SIGN(b)) {
-    if(c == a) {
-      if((res = s_mp_add(c, b)) != MP_OKAY)
-	return res;
-    } else {
-      if(c != b && ((res = mp_copy(b, c)) != MP_OKAY))
-	return res;
-      if((res = s_mp_add(c, a)) != MP_OKAY)
-	return res;
-      SIGN(c) = SIGN(a);
-    }
-
-  } else if((cmp = s_mp_cmp(a, b)) > 0) { /* Same sign, a > b */
-    if(c == b) {
-      mp_int  tmp;
-
-      if((res = mp_init_copy(&tmp, a)) != MP_OKAY)
-	return res;
-      if((res = s_mp_sub(&tmp, b)) != MP_OKAY) {
-	mp_clear(&tmp);
-	return res;
-      }
-      s_mp_exch(&tmp, c);
-      mp_clear(&tmp);
-
-    } else {
-      if(c != a && ((res = mp_copy(a, c)) != MP_OKAY))
-	return res;
-
-      if((res = s_mp_sub(c, b)) != MP_OKAY)
-	return res;
-    }
-
-  } else if(cmp == 0) {  /* Same sign, equal magnitude */
-    mp_zero(c);
-    return MP_OKAY;
-
-  } else {               /* Same sign, b > a */
-    if(c == a) {
-      mp_int  tmp;
-
-      if((res = mp_init_copy(&tmp, b)) != MP_OKAY)
-	return res;
-
-      if((res = s_mp_sub(&tmp, a)) != MP_OKAY) {
-	mp_clear(&tmp);
-	return res;
-      }
-      s_mp_exch(&tmp, c);
-      mp_clear(&tmp);
-
-    } else {
-      if(c != b && ((res = mp_copy(b, c)) != MP_OKAY))
-	return res;
-
-      if((res = s_mp_sub(c, a)) != MP_OKAY)
-	return res;
-    }
-
-    SIGN(c) = !SIGN(b);
-  }
-
-  if(USED(c) == 1 && DIGIT(c, 0) == 0)
-    SIGN(c) = MP_ZPOS;
-
-  return MP_OKAY;
-
-} /* end mp_sub() */
-
-/* }}} */
-
-/* {{{ mp_mul(a, b, c) */
-
-/*
-  mp_mul(a, b, c)
-
-  Compute c = a * b.  All parameters may be identical.
- */
-
-mp_err mp_mul(mp_int *a, mp_int *b, mp_int *c)
-{
-  mp_err   res;
-  mp_sign  sgn;
-
-  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
-
-  sgn = (SIGN(a) == SIGN(b)) ? MP_ZPOS : MP_NEG;
-
-  if(c == b) {
-    if((res = s_mp_mul(c, a)) != MP_OKAY)
-      return res;
-
-  } else {
-    if((res = mp_copy(a, c)) != MP_OKAY)
-      return res;
-
-    if((res = s_mp_mul(c, b)) != MP_OKAY)
-      return res;
-  }
-
-  if(sgn == MP_ZPOS || s_mp_cmp_d(c, 0) == MP_EQ)
-    SIGN(c) = MP_ZPOS;
-  else
-    SIGN(c) = sgn;
-
-  return MP_OKAY;
-
-} /* end mp_mul() */
-
-/* }}} */
-
-/* {{{ mp_mul_2d(a, d, c) */
-
-/*
-  mp_mul_2d(a, d, c)
-
-  Compute c = a * 2^d.  a may be the same as c.
- */
-
-mp_err mp_mul_2d(mp_int *a, mp_digit d, mp_int *c)
-{
-  mp_err   res;
-
-  ARGCHK(a != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_copy(a, c)) != MP_OKAY)
-    return res;
-
-  if(d == 0)
-    return MP_OKAY;
-
-  return s_mp_mul_2d(c, d);
-
-} /* end mp_mul() */
-
-/* }}} */
-
-/* {{{ mp_sqr(a, b) */
-
-#if MP_SQUARE
-mp_err mp_sqr(mp_int *a, mp_int *b)
-{
-  mp_err   res;
-
-  ARGCHK(a != NULL && b != NULL, MP_BADARG);
-
-  if((res = mp_copy(a, b)) != MP_OKAY)
-    return res;
-
-  if((res = s_mp_sqr(b)) != MP_OKAY)
-    return res;
-
-  SIGN(b) = MP_ZPOS;
-
-  return MP_OKAY;
-
-} /* end mp_sqr() */
-#endif
-
-/* }}} */
-
-/* {{{ mp_div(a, b, q, r) */
-
-/*
-  mp_div(a, b, q, r)
-
-  Compute q = a / b and r = a mod b.  Input parameters may be re-used
-  as output parameters.  If q or r is NULL, that portion of the
-  computation will be discarded (although it will still be computed)
-
-  Pay no attention to the hacker behind the curtain.
- */
-
-mp_err mp_div(mp_int *a, mp_int *b, mp_int *q, mp_int *r)
-{
-  mp_err   res;
-  mp_int   qtmp, rtmp;
-  int      cmp;
-
-  ARGCHK(a != NULL && b != NULL, MP_BADARG);
-
-  if(mp_cmp_z(b) == MP_EQ)
-    return MP_RANGE;
-
-  /* If a <= b, we can compute the solution without division, and
-     avoid any memory allocation
-   */
-  if((cmp = s_mp_cmp(a, b)) < 0) {
-    if(r) {
-      if((res = mp_copy(a, r)) != MP_OKAY)
-	return res;
-    }
-
-    if(q)
-      mp_zero(q);
-
-    return MP_OKAY;
-
-  } else if(cmp == 0) {
-
-    /* Set quotient to 1, with appropriate sign */
-    if(q) {
-      int qneg = (SIGN(a) != SIGN(b));
-
-      mp_set(q, 1);
-      if(qneg)
-	SIGN(q) = MP_NEG;
-    }
-
-    if(r)
-      mp_zero(r);
-
-    return MP_OKAY;
-  }
-
-  /* If we get here, it means we actually have to do some division */
-
-  /* Set up some temporaries... */
-  if((res = mp_init_copy(&qtmp, a)) != MP_OKAY)
-    return res;
-  if((res = mp_init_copy(&rtmp, b)) != MP_OKAY)
-    goto CLEANUP;
-
-  if((res = s_mp_div(&qtmp, &rtmp)) != MP_OKAY)
-    goto CLEANUP;
-
-  /* Compute the signs for the output  */
-  SIGN(&rtmp) = SIGN(a); /* Sr = Sa              */
-  if(SIGN(a) == SIGN(b))
-    SIGN(&qtmp) = MP_ZPOS;  /* Sq = MP_ZPOS if Sa = Sb */
-  else
-    SIGN(&qtmp) = MP_NEG;   /* Sq = MP_NEG if Sa != Sb */
-
-  if(s_mp_cmp_d(&qtmp, 0) == MP_EQ)
-    SIGN(&qtmp) = MP_ZPOS;
-  if(s_mp_cmp_d(&rtmp, 0) == MP_EQ)
-    SIGN(&rtmp) = MP_ZPOS;
-
-  /* Copy output, if it is needed      */
-  if(q)
-    s_mp_exch(&qtmp, q);
-
-  if(r)
-    s_mp_exch(&rtmp, r);
-
-CLEANUP:
-  mp_clear(&rtmp);
-  mp_clear(&qtmp);
-
-  return res;
-
-} /* end mp_div() */
-
-/* }}} */
-
-/* {{{ mp_div_2d(a, d, q, r) */
-
-mp_err mp_div_2d(mp_int *a, mp_digit d, mp_int *q, mp_int *r)
-{
-  mp_err  res;
-
-  ARGCHK(a != NULL, MP_BADARG);
-
-  if(q) {
-    if((res = mp_copy(a, q)) != MP_OKAY)
-      return res;
-
-    s_mp_div_2d(q, d);
-  }
-
-  if(r) {
-    if((res = mp_copy(a, r)) != MP_OKAY)
-      return res;
-
-    s_mp_mod_2d(r, d);
-  }
-
-  return MP_OKAY;
-
-} /* end mp_div_2d() */
-
-/* }}} */
-
-/* {{{ mp_expt(a, b, c) */
-
-/*
-  mp_expt(a, b, c)
-
-  Compute c = a ** b, that is, raise a to the b power.  Uses a
-  standard iterative square-and-multiply technique.
- */
-
-mp_err mp_expt(mp_int *a, mp_int *b, mp_int *c)
-{
-  mp_int   s, x;
-  mp_err   res;
-  mp_digit d;
-  unsigned int bit, dig;
-
-  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
-
-  if(mp_cmp_z(b) < 0)
-    return MP_RANGE;
-
-  if((res = mp_init(&s)) != MP_OKAY)
-    return res;
-
-  mp_set(&s, 1);
-
-  if((res = mp_init_copy(&x, a)) != MP_OKAY)
-    goto X;
-
-  /* Loop over low-order digits in ascending order */
-  for(dig = 0; dig < (USED(b) - 1); dig++) {
-    d = DIGIT(b, dig);
-
-    /* Loop over bits of each non-maximal digit */
-    for(bit = 0; bit < DIGIT_BIT; bit++) {
-      if(d & 1) {
-	if((res = s_mp_mul(&s, &x)) != MP_OKAY)
-	  goto CLEANUP;
-      }
-
-      d >>= 1;
-
-      if((res = s_mp_sqr(&x)) != MP_OKAY)
-	goto CLEANUP;
-    }
-  }
-
-  /* Consider now the last digit... */
-  d = DIGIT(b, dig);
-
-  while(d) {
-    if(d & 1) {
-      if((res = s_mp_mul(&s, &x)) != MP_OKAY)
-	goto CLEANUP;
-    }
-
-    d >>= 1;
-
-    if((res = s_mp_sqr(&x)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  if(mp_iseven(b))
-    SIGN(&s) = SIGN(a);
-
-  res = mp_copy(&s, c);
-
-CLEANUP:
-  mp_clear(&x);
-X:
-  mp_clear(&s);
-
-  return res;
-
-} /* end mp_expt() */
-
-/* }}} */
-
-/* {{{ mp_2expt(a, k) */
-
-/* Compute a = 2^k */
-
-mp_err mp_2expt(mp_int *a, mp_digit k)
-{
-  ARGCHK(a != NULL, MP_BADARG);
-
-  return s_mp_2expt(a, k);
-
-} /* end mp_2expt() */
-
-/* }}} */
-
-/* {{{ mp_mod(a, m, c) */
-
-/*
-  mp_mod(a, m, c)
-
-  Compute c = a (mod m).  Result will always be 0 <= c < m.
- */
-
-mp_err mp_mod(mp_int *a, mp_int *m, mp_int *c)
-{
-  mp_err  res;
-  int     mag;
-
-  ARGCHK(a != NULL && m != NULL && c != NULL, MP_BADARG);
-
-  if(SIGN(m) == MP_NEG)
-    return MP_RANGE;
-
-  /*
-     If |a| > m, we need to divide to get the remainder and take the
-     absolute value.
-
-     If |a| < m, we don't need to do any division, just copy and adjust
-     the sign (if a is negative).
-
-     If |a| == m, we can simply set the result to zero.
-
-     This order is intended to minimize the average path length of the
-     comparison chain on common workloads -- the most frequent cases are
-     that |a| != m, so we do those first.
-   */
-  if((mag = s_mp_cmp(a, m)) > 0) {
-    if((res = mp_div(a, m, NULL, c)) != MP_OKAY)
-      return res;
-
-    if(SIGN(c) == MP_NEG) {
-      if((res = mp_add(c, m, c)) != MP_OKAY)
-	return res;
-    }
-
-  } else if(mag < 0) {
-    if((res = mp_copy(a, c)) != MP_OKAY)
-      return res;
-
-    if(mp_cmp_z(a) < 0) {
-      if((res = mp_add(c, m, c)) != MP_OKAY)
-	return res;
-
-    }
-
-  } else {
-    mp_zero(c);
-
-  }
-
-  return MP_OKAY;
-
-} /* end mp_mod() */
-
-/* }}} */
-
-/* {{{ mp_mod_d(a, d, c) */
-
-/*
-  mp_mod_d(a, d, c)
-
-  Compute c = a (mod d).  Result will always be 0 <= c < d
- */
-mp_err mp_mod_d(mp_int *a, mp_digit d, mp_digit *c)
-{
-  mp_err   res;
-  mp_digit rem;
-
-  ARGCHK(a != NULL && c != NULL, MP_BADARG);
-
-  if(s_mp_cmp_d(a, d) > 0) {
-    if((res = mp_div_d(a, d, NULL, &rem)) != MP_OKAY)
-      return res;
-
-  } else {
-    if(SIGN(a) == MP_NEG)
-      rem = d - DIGIT(a, 0);
-    else
-      rem = DIGIT(a, 0);
-  }
-
-  if(c)
-    *c = rem;
-
-  return MP_OKAY;
-
-} /* end mp_mod_d() */
-
-/* }}} */
-
-/* {{{ mp_sqrt(a, b) */
-
-/*
-  mp_sqrt(a, b)
-
-  Compute the integer square root of a, and store the result in b.
-  Uses an integer-arithmetic version of Newton's iterative linear
-  approximation technique to determine this value; the result has the
-  following two properties:
-
-     b^2 <= a
-     (b+1)^2 >= a
-
-  It is a range error to pass a negative value.
- */
-mp_err mp_sqrt(mp_int *a, mp_int *b)
-{
-  mp_int   x, t;
-  mp_err   res;
-
-  ARGCHK(a != NULL && b != NULL, MP_BADARG);
-
-  /* Cannot take square root of a negative value */
-  if(SIGN(a) == MP_NEG)
-    return MP_RANGE;
-
-  /* Special cases for zero and one, trivial     */
-  if(mp_cmp_d(a, 0) == MP_EQ || mp_cmp_d(a, 1) == MP_EQ)
-    return mp_copy(a, b);
-
-  /* Initialize the temporaries we'll use below  */
-  if((res = mp_init_size(&t, USED(a))) != MP_OKAY)
-    return res;
-
-  /* Compute an initial guess for the iteration as a itself */
-  if((res = mp_init_copy(&x, a)) != MP_OKAY)
-    goto X;
-
-s_mp_rshd(&x, (USED(&x)/2)+1);
-mp_add_d(&x, 1, &x);
-
-  for(;;) {
-    /* t = (x * x) - a */
-    mp_copy(&x, &t);      /* can't fail, t is big enough for original x */
-    if((res = mp_sqr(&t, &t)) != MP_OKAY ||
-       (res = mp_sub(&t, a, &t)) != MP_OKAY)
-      goto CLEANUP;
-
-    /* t = t / 2x       */
-    s_mp_mul_2(&x);
-    if((res = mp_div(&t, &x, &t, NULL)) != MP_OKAY)
-      goto CLEANUP;
-    s_mp_div_2(&x);
-
-    /* Terminate the loop, if the quotient is zero */
-    if(mp_cmp_z(&t) == MP_EQ)
-      break;
-
-    /* x = x - t       */
-    if((res = mp_sub(&x, &t, &x)) != MP_OKAY)
-      goto CLEANUP;
-
-  }
-
-  /* Copy result to output parameter */
-  mp_sub_d(&x, 1, &x);
-  s_mp_exch(&x, b);
-
- CLEANUP:
-  mp_clear(&x);
- X:
-  mp_clear(&t);
-
-  return res;
-
-} /* end mp_sqrt() */
-
-/* }}} */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* {{{ Modular arithmetic */
-
-#if MP_MODARITH
-/* {{{ mp_addmod(a, b, m, c) */
-
-/*
-  mp_addmod(a, b, m, c)
-
-  Compute c = (a + b) mod m
- */
-
-mp_err mp_addmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c)
-{
-  mp_err  res;
-
-  ARGCHK(a != NULL && b != NULL && m != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_add(a, b, c)) != MP_OKAY)
-    return res;
-  if((res = mp_mod(c, m, c)) != MP_OKAY)
-    return res;
-
-  return MP_OKAY;
-
-}
-
-/* }}} */
-
-/* {{{ mp_submod(a, b, m, c) */
-
-/*
-  mp_submod(a, b, m, c)
-
-  Compute c = (a - b) mod m
- */
-
-mp_err mp_submod(mp_int *a, mp_int *b, mp_int *m, mp_int *c)
-{
-  mp_err  res;
-
-  ARGCHK(a != NULL && b != NULL && m != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_sub(a, b, c)) != MP_OKAY)
-    return res;
-  if((res = mp_mod(c, m, c)) != MP_OKAY)
-    return res;
-
-  return MP_OKAY;
-
-}
-
-/* }}} */
-
-/* {{{ mp_mulmod(a, b, m, c) */
-
-/*
-  mp_mulmod(a, b, m, c)
-
-  Compute c = (a * b) mod m
- */
-
-mp_err mp_mulmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c)
-{
-  mp_err  res;
-
-  ARGCHK(a != NULL && b != NULL && m != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_mul(a, b, c)) != MP_OKAY)
-    return res;
-  if((res = mp_mod(c, m, c)) != MP_OKAY)
-    return res;
-
-  return MP_OKAY;
-
-}
-
-/* }}} */
-
-/* {{{ mp_sqrmod(a, m, c) */
-
-#if MP_SQUARE
-mp_err mp_sqrmod(mp_int *a, mp_int *m, mp_int *c)
-{
-  mp_err  res;
-
-  ARGCHK(a != NULL && m != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_sqr(a, c)) != MP_OKAY)
-    return res;
-  if((res = mp_mod(c, m, c)) != MP_OKAY)
-    return res;
-
-  return MP_OKAY;
-
-} /* end mp_sqrmod() */
-#endif
-
-/* }}} */
-
-/* {{{ mp_exptmod(a, b, m, c) */
-
-/*
-  mp_exptmod(a, b, m, c)
-
-  Compute c = (a ** b) mod m.  Uses a standard square-and-multiply
-  method with modular reductions at each step. (This is basically the
-  same code as mp_expt(), except for the addition of the reductions)
-
-  The modular reductions are done using Barrett's algorithm (see
-  s_mp_reduce() below for details)
- */
-
-mp_err mp_exptmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c)
-{
-  mp_int   s, x, mu;
-  mp_err   res;
-  mp_digit d, *db = DIGITS(b);
-  mp_size  ub = USED(b);
-  unsigned int bit, dig;
-
-  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
-
-  if(mp_cmp_z(b) < 0 || mp_cmp_z(m) <= 0)
-    return MP_RANGE;
-
-  if((res = mp_init(&s)) != MP_OKAY)
-    return res;
-  if((res = mp_init_copy(&x, a)) != MP_OKAY)
-    goto X;
-  if((res = mp_mod(&x, m, &x)) != MP_OKAY ||
-     (res = mp_init(&mu)) != MP_OKAY)
-    goto MU;
-
-  mp_set(&s, 1);
-
-  /* mu = b^2k / m */
-  s_mp_add_d(&mu, 1);
-  s_mp_lshd(&mu, 2 * USED(m));
-  if((res = mp_div(&mu, m, &mu, NULL)) != MP_OKAY)
-    goto CLEANUP;
-
-  /* Loop over digits of b in ascending order, except highest order */
-  for(dig = 0; dig < (ub - 1); dig++) {
-    d = *db++;
-
-    /* Loop over the bits of the lower-order digits */
-    for(bit = 0; bit < DIGIT_BIT; bit++) {
-      if(d & 1) {
-	if((res = s_mp_mul(&s, &x)) != MP_OKAY)
-	  goto CLEANUP;
-	if((res = s_mp_reduce(&s, m, &mu)) != MP_OKAY)
-	  goto CLEANUP;
-      }
-
-      d >>= 1;
-
-      if((res = s_mp_sqr(&x)) != MP_OKAY)
-	goto CLEANUP;
-      if((res = s_mp_reduce(&x, m, &mu)) != MP_OKAY)
-	goto CLEANUP;
-    }
-  }
-
-  /* Now do the last digit... */
-  d = *db;
-
-  while(d) {
-    if(d & 1) {
-      if((res = s_mp_mul(&s, &x)) != MP_OKAY)
-	goto CLEANUP;
-      if((res = s_mp_reduce(&s, m, &mu)) != MP_OKAY)
-	goto CLEANUP;
-    }
-
-    d >>= 1;
-
-    if((res = s_mp_sqr(&x)) != MP_OKAY)
-      goto CLEANUP;
-    if((res = s_mp_reduce(&x, m, &mu)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  s_mp_exch(&s, c);
-
- CLEANUP:
-  mp_clear(&mu);
- MU:
-  mp_clear(&x);
- X:
-  mp_clear(&s);
-
-  return res;
-
-} /* end mp_exptmod() */
-
-/* }}} */
-
-/* {{{ mp_exptmod_d(a, d, m, c) */
-
-mp_err mp_exptmod_d(mp_int *a, mp_digit d, mp_int *m, mp_int *c)
-{
-  mp_int   s, x;
-  mp_err   res;
-
-  ARGCHK(a != NULL && c != NULL, MP_BADARG);
-
-  if((res = mp_init(&s)) != MP_OKAY)
-    return res;
-  if((res = mp_init_copy(&x, a)) != MP_OKAY)
-    goto X;
-
-  mp_set(&s, 1);
-
-  while(d != 0) {
-    if(d & 1) {
-      if((res = s_mp_mul(&s, &x)) != MP_OKAY ||
-	 (res = mp_mod(&s, m, &s)) != MP_OKAY)
-	goto CLEANUP;
-    }
-
-    d /= 2;
-
-    if((res = s_mp_sqr(&x)) != MP_OKAY ||
-       (res = mp_mod(&x, m, &x)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  s_mp_exch(&s, c);
-
-CLEANUP:
-  mp_clear(&x);
-X:
-  mp_clear(&s);
-
-  return res;
-
-} /* end mp_exptmod_d() */
-
-/* }}} */
-#endif /* if MP_MODARITH */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* {{{ Comparison functions */
-
-/* {{{ mp_cmp_z(a) */
-
-/*
-  mp_cmp_z(a)
-
-  Compare a <=> 0.  Returns <0 if a<0, 0 if a=0, >0 if a>0.
- */
-
-int    mp_cmp_z(mp_int *a)
-{
-  if(SIGN(a) == MP_NEG)
-    return MP_LT;
-  else if(USED(a) == 1 && DIGIT(a, 0) == 0)
-    return MP_EQ;
-  else
-    return MP_GT;
-
-} /* end mp_cmp_z() */
-
-/* }}} */
-
-/* {{{ mp_cmp_d(a, d) */
-
-/*
-  mp_cmp_d(a, d)
-
-  Compare a <=> d.  Returns <0 if a<d, 0 if a=d, >0 if a>d
- */
-
-int    mp_cmp_d(mp_int *a, mp_digit d)
-{
-  ARGCHK(a != NULL, MP_EQ);
-
-  if(SIGN(a) == MP_NEG)
-    return MP_LT;
-
-  return s_mp_cmp_d(a, d);
-
-} /* end mp_cmp_d() */
-
-/* }}} */
-
-/* {{{ mp_cmp(a, b) */
-
-int    mp_cmp(mp_int *a, mp_int *b)
-{
-  ARGCHK(a != NULL && b != NULL, MP_EQ);
-
-  if(SIGN(a) == SIGN(b)) {
-    int  mag;
-
-    if((mag = s_mp_cmp(a, b)) == MP_EQ)
-      return MP_EQ;
-
-    if(SIGN(a) == MP_ZPOS)
-      return mag;
-    else
-      return -mag;
-
-  } else if(SIGN(a) == MP_ZPOS) {
-    return MP_GT;
-  } else {
-    return MP_LT;
-  }
-
-} /* end mp_cmp() */
-
-/* }}} */
-
-/* {{{ mp_cmp_mag(a, b) */
-
-/*
-  mp_cmp_mag(a, b)
-
-  Compares |a| <=> |b|, and returns an appropriate comparison result
- */
-
-int    mp_cmp_mag(mp_int *a, mp_int *b)
-{
-  ARGCHK(a != NULL && b != NULL, MP_EQ);
-
-  return s_mp_cmp(a, b);
-
-} /* end mp_cmp_mag() */
-
-/* }}} */
-
-/* {{{ mp_cmp_int(a, z) */
-
-/*
-  This just converts z to an mp_int, and uses the existing comparison
-  routines.  This is sort of inefficient, but it's not clear to me how
-  frequently this wil get used anyway.  For small positive constants,
-  you can always use mp_cmp_d(), and for zero, there is mp_cmp_z().
- */
-int    mp_cmp_int(mp_int *a, long z)
-{
-  mp_int  tmp;
-  int     out;
-
-  ARGCHK(a != NULL, MP_EQ);
-
-  mp_init(&tmp); mp_set_int(&tmp, z);
-  out = mp_cmp(a, &tmp);
-  mp_clear(&tmp);
-
-  return out;
-
-} /* end mp_cmp_int() */
-
-/* }}} */
-
-/* {{{ mp_isodd(a) */
-
-/*
-  mp_isodd(a)
-
-  Returns a true (non-zero) value if a is odd, false (zero) otherwise.
- */
-int    mp_isodd(mp_int *a)
-{
-  ARGCHK(a != NULL, 0);
-
-  return (DIGIT(a, 0) & 1);
-
-} /* end mp_isodd() */
-
-/* }}} */
-
-/* {{{ mp_iseven(a) */
-
-int    mp_iseven(mp_int *a)
-{
-  return !mp_isodd(a);
-
-} /* end mp_iseven() */
-
-/* }}} */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* {{{ Number theoretic functions */
-
-#if MP_NUMTH
-/* {{{ mp_gcd(a, b, c) */
-
-/*
-  Like the old mp_gcd() function, except computes the GCD using the
-  binary algorithm due to Josef Stein in 1961 (via Knuth).
- */
-mp_err mp_gcd(mp_int *a, mp_int *b, mp_int *c)
-{
-  mp_err   res;
-  mp_int   u, v, t;
-  mp_size  k = 0;
-
-  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
-
-  if(mp_cmp_z(a) == MP_EQ && mp_cmp_z(b) == MP_EQ)
-      return MP_RANGE;
-  if(mp_cmp_z(a) == MP_EQ) {
-    return mp_copy(b, c);
-  } else if(mp_cmp_z(b) == MP_EQ) {
-    return mp_copy(a, c);
-  }
-
-  if((res = mp_init(&t)) != MP_OKAY)
-    return res;
-  if((res = mp_init_copy(&u, a)) != MP_OKAY)
-    goto U;
-  if((res = mp_init_copy(&v, b)) != MP_OKAY)
-    goto V;
-
-  SIGN(&u) = MP_ZPOS;
-  SIGN(&v) = MP_ZPOS;
-
-  /* Divide out common factors of 2 until at least 1 of a, b is even */
-  while(mp_iseven(&u) && mp_iseven(&v)) {
-    s_mp_div_2(&u);
-    s_mp_div_2(&v);
-    ++k;
-  }
-
-  /* Initialize t */
-  if(mp_isodd(&u)) {
-    if((res = mp_copy(&v, &t)) != MP_OKAY)
-      goto CLEANUP;
-
-    /* t = -v */
-    if(SIGN(&v) == MP_ZPOS)
-      SIGN(&t) = MP_NEG;
-    else
-      SIGN(&t) = MP_ZPOS;
-
-  } else {
-    if((res = mp_copy(&u, &t)) != MP_OKAY)
-      goto CLEANUP;
-
-  }
-
-  for(;;) {
-    while(mp_iseven(&t)) {
-      s_mp_div_2(&t);
-    }
-
-    if(mp_cmp_z(&t) == MP_GT) {
-      if((res = mp_copy(&t, &u)) != MP_OKAY)
-	goto CLEANUP;
-
-    } else {
-      if((res = mp_copy(&t, &v)) != MP_OKAY)
-	goto CLEANUP;
-
-      /* v = -t */
-      if(SIGN(&t) == MP_ZPOS)
-	SIGN(&v) = MP_NEG;
-      else
-	SIGN(&v) = MP_ZPOS;
-    }
-
-    if((res = mp_sub(&u, &v, &t)) != MP_OKAY)
-      goto CLEANUP;
-
-    if(s_mp_cmp_d(&t, 0) == MP_EQ)
-      break;
-  }
-
-  s_mp_2expt(&v, k);       /* v = 2^k   */
-  res = mp_mul(&u, &v, c); /* c = u * v */
-
- CLEANUP:
-  mp_clear(&v);
- V:
-  mp_clear(&u);
- U:
-  mp_clear(&t);
-
-  return res;
-
-} /* end mp_bgcd() */
-
-/* }}} */
-
-/* {{{ mp_lcm(a, b, c) */
-
-/* We compute the least common multiple using the rule:
-
-   ab = [a, b](a, b)
-
-   ... by computing the product, and dividing out the gcd.
- */
-
-mp_err mp_lcm(mp_int *a, mp_int *b, mp_int *c)
-{
-  mp_int  gcd, prod;
-  mp_err  res;
-
-  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
-
-  /* Set up temporaries */
-  if((res = mp_init(&gcd)) != MP_OKAY)
-    return res;
-  if((res = mp_init(&prod)) != MP_OKAY)
-    goto GCD;
-
-  if((res = mp_mul(a, b, &prod)) != MP_OKAY)
-    goto CLEANUP;
-  if((res = mp_gcd(a, b, &gcd)) != MP_OKAY)
-    goto CLEANUP;
-
-  res = mp_div(&prod, &gcd, c, NULL);
-
- CLEANUP:
-  mp_clear(&prod);
- GCD:
-  mp_clear(&gcd);
-
-  return res;
-
-} /* end mp_lcm() */
-
-/* }}} */
-
-/* {{{ mp_xgcd(a, b, g, x, y) */
-
-/*
-  mp_xgcd(a, b, g, x, y)
-
-  Compute g = (a, b) and values x and y satisfying Bezout's identity
-  (that is, ax + by = g).  This uses the extended binary GCD algorithm
-  based on the Stein algorithm used for mp_gcd()
- */
-
-mp_err mp_xgcd(mp_int *a, mp_int *b, mp_int *g, mp_int *x, mp_int *y)
-{
-  mp_int   gx, xc, yc, u, v, A, B, C, D;
-  mp_int  *clean[9];
-  mp_err   res;
-  int      last = -1;
-
-  if(mp_cmp_z(b) == 0)
-    return MP_RANGE;
-
-  /* Initialize all these variables we need */
-  if((res = mp_init(&u)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &u;
-  if((res = mp_init(&v)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &v;
-  if((res = mp_init(&gx)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &gx;
-  if((res = mp_init(&A)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &A;
-  if((res = mp_init(&B)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &B;
-  if((res = mp_init(&C)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &C;
-  if((res = mp_init(&D)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &D;
-  if((res = mp_init_copy(&xc, a)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &xc;
-  mp_abs(&xc, &xc);
-  if((res = mp_init_copy(&yc, b)) != MP_OKAY) goto CLEANUP;
-  clean[++last] = &yc;
-  mp_abs(&yc, &yc);
-
-  mp_set(&gx, 1);
-
-  /* Divide by two until at least one of them is even */
-  while(mp_iseven(&xc) && mp_iseven(&yc)) {
-    s_mp_div_2(&xc);
-    s_mp_div_2(&yc);
-    if((res = s_mp_mul_2(&gx)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  mp_copy(&xc, &u);
-  mp_copy(&yc, &v);
-  mp_set(&A, 1); mp_set(&D, 1);
-
-  /* Loop through binary GCD algorithm */
-  for(;;) {
-    while(mp_iseven(&u)) {
-      s_mp_div_2(&u);
-
-      if(mp_iseven(&A) && mp_iseven(&B)) {
-	s_mp_div_2(&A); s_mp_div_2(&B);
-      } else {
-	if((res = mp_add(&A, &yc, &A)) != MP_OKAY) goto CLEANUP;
-	s_mp_div_2(&A);
-	if((res = mp_sub(&B, &xc, &B)) != MP_OKAY) goto CLEANUP;
-	s_mp_div_2(&B);
-      }
-    }
-
-    while(mp_iseven(&v)) {
-      s_mp_div_2(&v);
-
-      if(mp_iseven(&C) && mp_iseven(&D)) {
-	s_mp_div_2(&C); s_mp_div_2(&D);
-      } else {
-	if((res = mp_add(&C, &yc, &C)) != MP_OKAY) goto CLEANUP;
-	s_mp_div_2(&C);
-	if((res = mp_sub(&D, &xc, &D)) != MP_OKAY) goto CLEANUP;
-	s_mp_div_2(&D);
-      }
-    }
-
-    if(mp_cmp(&u, &v) >= 0) {
-      if((res = mp_sub(&u, &v, &u)) != MP_OKAY) goto CLEANUP;
-      if((res = mp_sub(&A, &C, &A)) != MP_OKAY) goto CLEANUP;
-      if((res = mp_sub(&B, &D, &B)) != MP_OKAY) goto CLEANUP;
-
-    } else {
-      if((res = mp_sub(&v, &u, &v)) != MP_OKAY) goto CLEANUP;
-      if((res = mp_sub(&C, &A, &C)) != MP_OKAY) goto CLEANUP;
-      if((res = mp_sub(&D, &B, &D)) != MP_OKAY) goto CLEANUP;
-
-    }
-
-    /* If we're done, copy results to output */
-    if(mp_cmp_z(&u) == 0) {
-      if(x)
-	if((res = mp_copy(&C, x)) != MP_OKAY) goto CLEANUP;
-
-      if(y)
-	if((res = mp_copy(&D, y)) != MP_OKAY) goto CLEANUP;
-
-      if(g)
-	if((res = mp_mul(&gx, &v, g)) != MP_OKAY) goto CLEANUP;
-
-      break;
-    }
-  }
-
- CLEANUP:
-  while(last >= 0)
-    mp_clear(clean[last--]);
-
-  return res;
-
-} /* end mp_xgcd() */
-
-/* }}} */
-
-/* {{{ mp_invmod(a, m, c) */
-
-/*
-  mp_invmod(a, m, c)
-
-  Compute c = a^-1 (mod m), if there is an inverse for a (mod m).
-  This is equivalent to the question of whether (a, m) = 1.  If not,
-  MP_UNDEF is returned, and there is no inverse.
- */
-
-mp_err mp_invmod(mp_int *a, mp_int *m, mp_int *c)
-{
-  mp_int  g, x;
-  mp_err  res;
-
-  ARGCHK(a && m && c, MP_BADARG);
-
-  if(mp_cmp_z(a) == 0 || mp_cmp_z(m) == 0)
-    return MP_RANGE;
-
-  if((res = mp_init(&g)) != MP_OKAY)
-    return res;
-  if((res = mp_init(&x)) != MP_OKAY)
-    goto X;
-
-  if((res = mp_xgcd(a, m, &g, &x, NULL)) != MP_OKAY)
-    goto CLEANUP;
-
-  if(mp_cmp_d(&g, 1) != MP_EQ) {
-    res = MP_UNDEF;
-    goto CLEANUP;
-  }
-
-  res = mp_mod(&x, m, c);
-  SIGN(c) = SIGN(a);
-
-CLEANUP:
-  mp_clear(&x);
-X:
-  mp_clear(&g);
-
-  return res;
-
-} /* end mp_invmod() */
-
-/* }}} */
-#endif /* if MP_NUMTH */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* {{{ mp_print(mp, ofp) */
-
-#if MP_IOFUNC
-/*
-  mp_print(mp, ofp)
-
-  Print a textual representation of the given mp_int on the output
-  stream 'ofp'.  Output is generated using the internal radix.
- */
-
-void   mp_print(mp_int *mp, FILE *ofp)
-{
-  int   ix;
-
-  if(mp == NULL || ofp == NULL)
-    return;
-
-  fputc((SIGN(mp) == MP_NEG) ? '-' : '+', ofp);
-
-  for(ix = USED(mp) - 1; ix >= 0; ix--) {
-    fprintf(ofp, DIGIT_FMT, DIGIT(mp, ix));
-  }
-
-} /* end mp_print() */
-
-#endif /* if MP_IOFUNC */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* {{{ More I/O Functions */
-
-/* {{{ mp_read_signed_bin(mp, str, len) */
-
-/*
-   mp_read_signed_bin(mp, str, len)
-
-   Read in a raw value (base 256) into the given mp_int
- */
-
-mp_err  mp_read_signed_bin(mp_int *mp, unsigned char *str, int len)
-{
-  mp_err         res;
-
-  ARGCHK(mp != NULL && str != NULL && len > 0, MP_BADARG);
-
-  if((res = mp_read_unsigned_bin(mp, str + 1, len - 1)) == MP_OKAY) {
-    /* Get sign from first byte */
-    if(str[0])
-      SIGN(mp) = MP_NEG;
-    else
-      SIGN(mp) = MP_ZPOS;
-  }
-
-  return res;
-
-} /* end mp_read_signed_bin() */
-
-/* }}} */
-
-/* {{{ mp_signed_bin_size(mp) */
-
-int    mp_signed_bin_size(mp_int *mp)
-{
-  ARGCHK(mp != NULL, 0);
-
-  return mp_unsigned_bin_size(mp) + 1;
-
-} /* end mp_signed_bin_size() */
-
-/* }}} */
-
-/* {{{ mp_to_signed_bin(mp, str) */
-
-mp_err mp_to_signed_bin(mp_int *mp, unsigned char *str)
-{
-  ARGCHK(mp != NULL && str != NULL, MP_BADARG);
-
-  /* Caller responsible for allocating enough memory (use mp_raw_size(mp)) */
-  str[0] = (char)SIGN(mp);
-
-  return mp_to_unsigned_bin(mp, str + 1);
-
-} /* end mp_to_signed_bin() */
-
-/* }}} */
-
-/* {{{ mp_read_unsigned_bin(mp, str, len) */
-
-/*
-  mp_read_unsigned_bin(mp, str, len)
-
-  Read in an unsigned value (base 256) into the given mp_int
- */
-
-mp_err  mp_read_unsigned_bin(mp_int *mp, unsigned char *str, int len)
-{
-  int     ix;
-  mp_err  res;
-
-  ARGCHK(mp != NULL && str != NULL && len > 0, MP_BADARG);
-
-  mp_zero(mp);
-
-  for(ix = 0; ix < len; ix++) {
-    if((res = s_mp_mul_2d(mp, CHAR_BIT)) != MP_OKAY)
-      return res;
-
-    if((res = mp_add_d(mp, str[ix], mp)) != MP_OKAY)
-      return res;
-  }
-
-  return MP_OKAY;
-
-} /* end mp_read_unsigned_bin() */
-
-/* }}} */
-
-/* {{{ mp_unsigned_bin_size(mp) */
-
-int     mp_unsigned_bin_size(mp_int *mp)
-{
-  mp_digit   topdig;
-  int        count;
-
-  ARGCHK(mp != NULL, 0);
-
-  /* Special case for the value zero */
-  if(USED(mp) == 1 && DIGIT(mp, 0) == 0)
-    return 1;
-
-  count = (USED(mp) - 1) * sizeof(mp_digit);
-  topdig = DIGIT(mp, USED(mp) - 1);
-
-  while(topdig != 0) {
-    ++count;
-    topdig >>= CHAR_BIT;
-  }
-
-  return count;
-
-} /* end mp_unsigned_bin_size() */
-
-/* }}} */
-
-/* {{{ mp_to_unsigned_bin(mp, str) */
-
-mp_err mp_to_unsigned_bin(mp_int *mp, unsigned char *str)
-{
-  mp_digit      *dp, *end, d;
-  unsigned char *spos;
-
-  ARGCHK(mp != NULL && str != NULL, MP_BADARG);
-
-  dp = DIGITS(mp);
-  end = dp + USED(mp) - 1;
-  spos = str;
-
-  /* Special case for zero, quick test */
-  if(dp == end && *dp == 0) {
-    *str = '\0';
-    return MP_OKAY;
-  }
-
-  /* Generate digits in reverse order */
-  while(dp < end) {
-    unsigned int ix;
-
-    d = *dp;
-    for(ix = 0; ix < sizeof(mp_digit); ++ix) {
-      *spos = d & UCHAR_MAX;
-      d >>= CHAR_BIT;
-      ++spos;
-    }
-
-    ++dp;
-  }
-
-  /* Now handle last digit specially, high order zeroes are not written */
-  d = *end;
-  while(d != 0) {
-    *spos = d & UCHAR_MAX;
-    d >>= CHAR_BIT;
-    ++spos;
-  }
-
-  /* Reverse everything to get digits in the correct order */
-  while(--spos > str) {
-    unsigned char t = *str;
-    *str = *spos;
-    *spos = t;
-
-    ++str;
-  }
-
-  return MP_OKAY;
-
-} /* end mp_to_unsigned_bin() */
-
-/* }}} */
-
-/* {{{ mp_count_bits(mp) */
-
-int    mp_count_bits(mp_int *mp)
-{
-  int      len;
-  mp_digit d;
-
-  ARGCHK(mp != NULL, MP_BADARG);
-
-  len = DIGIT_BIT * (USED(mp) - 1);
-  d = DIGIT(mp, USED(mp) - 1);
-
-  while(d != 0) {
-    ++len;
-    d >>= 1;
-  }
-
-  return len;
-
-} /* end mp_count_bits() */
-
-/* }}} */
-
-/* {{{ mp_read_radix(mp, str, radix) */
-
-/*
-  mp_read_radix(mp, str, radix)
-
-  Read an integer from the given string, and set mp to the resulting
-  value.  The input is presumed to be in base 10.  Leading non-digit
-  characters are ignored, and the function reads until a non-digit
-  character or the end of the string.
- */
-
-mp_err  mp_read_radix(mp_int *mp, unsigned char *str, int radix)
-{
-  int     ix = 0, val = 0;
-  mp_err  res;
-  mp_sign sig = MP_ZPOS;
-
-  ARGCHK(mp != NULL && str != NULL && radix >= 2 && radix <= MAX_RADIX,
-	 MP_BADARG);
-
-  mp_zero(mp);
-
-  /* Skip leading non-digit characters until a digit or '-' or '+' */
-  while(str[ix] &&
-	(s_mp_tovalue(str[ix], radix) < 0) &&
-	str[ix] != '-' &&
-	str[ix] != '+') {
-    ++ix;
-  }
-
-  if(str[ix] == '-') {
-    sig = MP_NEG;
-    ++ix;
-  } else if(str[ix] == '+') {
-    sig = MP_ZPOS; /* this is the default anyway... */
-    ++ix;
-  }
-
-  while((val = s_mp_tovalue(str[ix], radix)) >= 0) {
-    if((res = s_mp_mul_d(mp, radix)) != MP_OKAY)
-      return res;
-    if((res = s_mp_add_d(mp, val)) != MP_OKAY)
-      return res;
-    ++ix;
-  }
-
-  if(s_mp_cmp_d(mp, 0) == MP_EQ)
-    SIGN(mp) = MP_ZPOS;
-  else
-    SIGN(mp) = sig;
-
-  return MP_OKAY;
-
-} /* end mp_read_radix() */
-
-/* }}} */
-
-/* {{{ mp_radix_size(mp, radix) */
-
-int    mp_radix_size(mp_int *mp, int radix)
-{
-  int  len;
-  ARGCHK(mp != NULL, 0);
-
-  len = s_mp_outlen(mp_count_bits(mp), radix) + 1; /* for NUL terminator */
-
-  if(mp_cmp_z(mp) < 0)
-    ++len; /* for sign */
-
-  return len;
-
-} /* end mp_radix_size() */
-
-/* }}} */
-
-/* {{{ mp_value_radix_size(num, qty, radix) */
-
-/* num = number of digits
-   qty = number of bits per digit
-   radix = target base
-
-   Return the number of digits in the specified radix that would be
-   needed to express 'num' digits of 'qty' bits each.
- */
-int    mp_value_radix_size(int num, int qty, int radix)
-{
-  ARGCHK(num >= 0 && qty > 0 && radix >= 2 && radix <= MAX_RADIX, 0);
-
-  return s_mp_outlen(num * qty, radix);
-
-} /* end mp_value_radix_size() */
-
-/* }}} */
-
-/* {{{ mp_toradix(mp, str, radix) */
-
-mp_err mp_toradix(mp_int *mp, char *str, int radix)
-{
-  int  ix, pos = 0;
-
-  ARGCHK(mp != NULL && str != NULL, MP_BADARG);
-  ARGCHK(radix > 1 && radix <= MAX_RADIX, MP_RANGE);
-
-  if(mp_cmp_z(mp) == MP_EQ) {
-    str[0] = '0';
-    str[1] = '\0';
-  } else {
-    mp_err   res;
-    mp_int   tmp;
-    mp_sign  sgn;
-    mp_digit rem, rdx = (mp_digit)radix;
-    char     ch;
-
-    if((res = mp_init_copy(&tmp, mp)) != MP_OKAY)
-      return res;
-
-    /* Save sign for later, and take absolute value */
-    sgn = SIGN(&tmp); SIGN(&tmp) = MP_ZPOS;
-
-    /* Generate output digits in reverse order      */
-    while(mp_cmp_z(&tmp) != 0) {
-      if((res = s_mp_div_d(&tmp, rdx, &rem)) != MP_OKAY) {
-	mp_clear(&tmp);
-	return res;
-      }
-
-      /* Generate digits, use capital letters */
-      ch = s_mp_todigit(rem, radix, 0);
-
-      str[pos++] = ch;
-    }
-
-    /* Add - sign if original value was negative */
-    if(sgn == MP_NEG)
-      str[pos++] = '-';
-
-    /* Add trailing NUL to end the string        */
-    str[pos--] = '\0';
-
-    /* Reverse the digits and sign indicator     */
-    ix = 0;
-    while(ix < pos) {
-      char _tmp = str[ix];
-
-      str[ix] = str[pos];
-      str[pos] = _tmp;
-      ++ix;
-      --pos;
-    }
-
-    mp_clear(&tmp);
-  }
-
-  return MP_OKAY;
-
-} /* end mp_toradix() */
-
-/* }}} */
-
-/* {{{ mp_char2value(ch, r) */
-
-int    mp_char2value(char ch, int r)
-{
-  return s_mp_tovalue(ch, r);
-
-} /* end mp_tovalue() */
-
-/* }}} */
-
-/* }}} */
-
-/* {{{ mp_strerror(ec) */
-
-/*
-  mp_strerror(ec)
-
-  Return a string describing the meaning of error code 'ec'.  The
-  string returned is allocated in static memory, so the caller should
-  not attempt to modify or free the memory associated with this
-  string.
- */
-const char  *mp_strerror(mp_err ec)
-{
-  int   aec = (ec < 0) ? -ec : ec;
-
-  /* Code values are negative, so the senses of these comparisons
-     are accurate */
-  if(ec < MP_LAST_CODE || ec > MP_OKAY) {
-    return mp_err_string[0];  /* unknown error code */
-  } else {
-    return mp_err_string[aec + 1];
-  }
-
-} /* end mp_strerror() */
-
-/* }}} */
-
-/*========================================================================*/
-/*------------------------------------------------------------------------*/
-/* Static function definitions (internal use only)                        */
-
-/* {{{ Memory management */
-
-/* {{{ s_mp_grow(mp, min) */
-
-/* Make sure there are at least 'min' digits allocated to mp              */
-mp_err   s_mp_grow(mp_int *mp, mp_size min)
-{
-  if(min > ALLOC(mp)) {
-    mp_digit   *tmp;
-
-    /* Set min to next nearest default precision block size */
-    min = ((min + (s_mp_defprec - 1)) / s_mp_defprec) * s_mp_defprec;
-
-    if((tmp = s_mp_alloc(min, sizeof(mp_digit))) == NULL)
-      return MP_MEM;
-
-    s_mp_copy(DIGITS(mp), tmp, USED(mp));
-
-#if MP_CRYPTO
-    s_mp_setz(DIGITS(mp), ALLOC(mp));
-#endif
-    s_mp_free(DIGITS(mp));
-    DIGITS(mp) = tmp;
-    ALLOC(mp) = min;
-  }
-
-  return MP_OKAY;
-
-} /* end s_mp_grow() */
-
-/* }}} */
-
-/* {{{ s_mp_pad(mp, min) */
-
-/* Make sure the used size of mp is at least 'min', growing if needed     */
-mp_err   s_mp_pad(mp_int *mp, mp_size min)
-{
-  if(min > USED(mp)) {
-    mp_err  res;
-
-    /* Make sure there is room to increase precision  */
-    if(min > ALLOC(mp) && (res = s_mp_grow(mp, min)) != MP_OKAY)
-      return res;
-
-    /* Increase precision; should already be 0-filled */
-    USED(mp) = min;
-  }
-
-  return MP_OKAY;
-
-} /* end s_mp_pad() */
-
-/* }}} */
-
-/* {{{ s_mp_setz(dp, count) */
-
-#if MP_MACRO == 0
-/* Set 'count' digits pointed to by dp to be zeroes                       */
-void s_mp_setz(mp_digit *dp, mp_size count)
-{
-#if MP_MEMSET == 0
-  int  ix;
-
-  for(ix = 0; ix < count; ix++)
-    dp[ix] = 0;
-#else
-  memset(dp, 0, count * sizeof(mp_digit));
-#endif
-
-} /* end s_mp_setz() */
-#endif
-
-/* }}} */
-
-/* {{{ s_mp_copy(sp, dp, count) */
-
-#if MP_MACRO == 0
-/* Copy 'count' digits from sp to dp                                      */
-void s_mp_copy(mp_digit *sp, mp_digit *dp, mp_size count)
-{
-#if MP_MEMCPY == 0
-  int  ix;
-
-  for(ix = 0; ix < count; ix++)
-    dp[ix] = sp[ix];
-#else
-  memcpy(dp, sp, count * sizeof(mp_digit));
-#endif
-
-} /* end s_mp_copy() */
-#endif
-
-/* }}} */
-
-/* {{{ s_mp_alloc(nb, ni) */
-
-#if MP_MACRO == 0
-/* Allocate ni records of nb bytes each, and return a pointer to that     */
-void    *s_mp_alloc(size_t nb, size_t ni)
-{
-  return calloc(nb, ni);
-
-} /* end s_mp_alloc() */
-#endif
-
-/* }}} */
-
-/* {{{ s_mp_free(ptr) */
-
-#if MP_MACRO == 0
-/* Free the memory pointed to by ptr                                      */
-void     s_mp_free(void *ptr)
-{
-  if(ptr)
-    free(ptr);
-
-} /* end s_mp_free() */
-#endif
-
-/* }}} */
-
-/* {{{ s_mp_clamp(mp) */
-
-/* Remove leading zeroes from the given value                             */
-void     s_mp_clamp(mp_int *mp)
-{
-  mp_size   du = USED(mp);
-  mp_digit *zp = DIGITS(mp) + du - 1;
-
-  while(du > 1 && !*zp--)
-    --du;
-
-  USED(mp) = du;
-
-} /* end s_mp_clamp() */
-
-
-/* }}} */
-
-/* {{{ s_mp_exch(a, b) */
-
-/* Exchange the data for a and b; (b, a) = (a, b)                         */
-void     s_mp_exch(mp_int *a, mp_int *b)
-{
-  mp_int   tmp;
-
-  tmp = *a;
-  *a = *b;
-  *b = tmp;
-
-} /* end s_mp_exch() */
-
-/* }}} */
-
-/* }}} */
-
-/* {{{ Arithmetic helpers */
-
-/* {{{ s_mp_lshd(mp, p) */
-
-/*
-   Shift mp leftward by p digits, growing if needed, and zero-filling
-   the in-shifted digits at the right end.  This is a convenient
-   alternative to multiplication by powers of the radix
- */
-
-mp_err   s_mp_lshd(mp_int *mp, mp_size p)
-{
-  mp_err   res;
-  mp_size  pos;
-  mp_digit *dp;
-  int ix;
-
-  if(p == 0)
-    return MP_OKAY;
-
-  if((res = s_mp_pad(mp, USED(mp) + p)) != MP_OKAY)
-    return res;
-
-  pos = USED(mp) - 1;
-  dp = DIGITS(mp);
-
-  /* Shift all the significant figures over as needed */
-  for(ix = pos - p; ix >= 0; ix--)
-    dp[ix + p] = dp[ix];
-
-  /* Fill the bottom digits with zeroes */
-  for(ix = 0; (unsigned)ix < p; ix++)
-    dp[ix] = 0;
-
-  return MP_OKAY;
-
-} /* end s_mp_lshd() */
-
-/* }}} */
-
-/* {{{ s_mp_rshd(mp, p) */
-
-/*
-   Shift mp rightward by p digits.  Maintains the invariant that
-   digits above the precision are all zero.  Digits shifted off the
-   end are lost.  Cannot fail.
- */
-
-void     s_mp_rshd(mp_int *mp, mp_size p)
-{
-  mp_size  ix;
-  mp_digit *dp;
-
-  if(p == 0)
-    return;
-
-  /* Shortcut when all digits are to be shifted off */
-  if(p >= USED(mp)) {
-    s_mp_setz(DIGITS(mp), ALLOC(mp));
-    USED(mp) = 1;
-    SIGN(mp) = MP_ZPOS;
-    return;
-  }
-
-  /* Shift all the significant figures over as needed */
-  dp = DIGITS(mp);
-  for(ix = p; ix < USED(mp); ix++)
-    dp[ix - p] = dp[ix];
-
-  /* Fill the top digits with zeroes */
-  ix -= p;
-  while(ix < USED(mp))
-    dp[ix++] = 0;
-
-  /* Strip off any leading zeroes    */
-  s_mp_clamp(mp);
-
-} /* end s_mp_rshd() */
-
-/* }}} */
-
-/* {{{ s_mp_div_2(mp) */
-
-/* Divide by two -- take advantage of radix properties to do it fast      */
-void     s_mp_div_2(mp_int *mp)
-{
-  s_mp_div_2d(mp, 1);
-
-} /* end s_mp_div_2() */
-
-/* }}} */
-
-/* {{{ s_mp_mul_2(mp) */
-
-mp_err s_mp_mul_2(mp_int *mp)
-{
-  unsigned int ix;
-  mp_digit kin = 0, kout, *dp = DIGITS(mp);
-  mp_err   res;
-
-  /* Shift digits leftward by 1 bit */
-  for(ix = 0; ix < USED(mp); ix++) {
-    kout = (dp[ix] >> (DIGIT_BIT - 1)) & 1;
-    dp[ix] = (dp[ix] << 1) | kin;
-
-    kin = kout;
-  }
-
-  /* Deal with rollover from last digit */
-  if(kin) {
-    if(ix >= ALLOC(mp)) {
-      if((res = s_mp_grow(mp, ALLOC(mp) + 1)) != MP_OKAY)
-	return res;
-      dp = DIGITS(mp);
-    }
-
-    dp[ix] = kin;
-    USED(mp) += 1;
-  }
-
-  return MP_OKAY;
-
-} /* end s_mp_mul_2() */
-
-/* }}} */
-
-/* {{{ s_mp_mod_2d(mp, d) */
-
-/*
-  Remainder the integer by 2^d, where d is a number of bits.  This
-  amounts to a bitwise AND of the value, and does not require the full
-  division code
- */
-void     s_mp_mod_2d(mp_int *mp, mp_digit d)
-{
-  unsigned int  ndig = (d / DIGIT_BIT), nbit = (d % DIGIT_BIT);
-  unsigned int  ix;
-  mp_digit      dmask, *dp = DIGITS(mp);
-
-  if(ndig >= USED(mp))
-    return;
-
-  /* Flush all the bits above 2^d in its digit */
-  dmask = (1 << nbit) - 1;
-  dp[ndig] &= dmask;
-
-  /* Flush all digits above the one with 2^d in it */
-  for(ix = ndig + 1; ix < USED(mp); ix++)
-    dp[ix] = 0;
-
-  s_mp_clamp(mp);
-
-} /* end s_mp_mod_2d() */
-
-/* }}} */
-
-/* {{{ s_mp_mul_2d(mp, d) */
-
-/*
-  Multiply by the integer 2^d, where d is a number of bits.  This
-  amounts to a bitwise shift of the value, and does not require the
-  full multiplication code.
- */
-mp_err    s_mp_mul_2d(mp_int *mp, mp_digit d)
-{
-  mp_err   res;
-  mp_digit save, next, mask, *dp;
-  mp_size  used;
-  unsigned int ix;
-
-  if((res = s_mp_lshd(mp, d / DIGIT_BIT)) != MP_OKAY)
-    return res;
-
-  dp = DIGITS(mp); used = USED(mp);
-  d %= DIGIT_BIT;
-
-  mask = (1 << d) - 1;
-
-  /* If the shift requires another digit, make sure we've got one to
-     work with */
-  if((dp[used - 1] >> (DIGIT_BIT - d)) & mask) {
-    if((res = s_mp_grow(mp, used + 1)) != MP_OKAY)
-      return res;
-    dp = DIGITS(mp);
-  }
-
-  /* Do the shifting... */
-  save = 0;
-  for(ix = 0; ix < used; ix++) {
-    next = (dp[ix] >> (DIGIT_BIT - d)) & mask;
-    dp[ix] = (dp[ix] << d) | save;
-    save = next;
-  }
-
-  /* If, at this point, we have a nonzero carryout into the next
-     digit, we'll increase the size by one digit, and store it...
-   */
-  if(save) {
-    dp[used] = save;
-    USED(mp) += 1;
-  }
-
-  s_mp_clamp(mp);
-  return MP_OKAY;
-
-} /* end s_mp_mul_2d() */
-
-/* }}} */
-
-/* {{{ s_mp_div_2d(mp, d) */
-
-/*
-  Divide the integer by 2^d, where d is a number of bits.  This
-  amounts to a bitwise shift of the value, and does not require the
-  full division code (used in Barrett reduction, see below)
- */
-void     s_mp_div_2d(mp_int *mp, mp_digit d)
-{
-  int       ix;
-  mp_digit  save, next, mask, *dp = DIGITS(mp);
-
-  s_mp_rshd(mp, d / DIGIT_BIT);
-  d %= DIGIT_BIT;
-
-  mask = (1 << d) - 1;
-
-  save = 0;
-  for(ix = USED(mp) - 1; ix >= 0; ix--) {
-    next = dp[ix] & mask;
-    dp[ix] = (dp[ix] >> d) | (save << (DIGIT_BIT - d));
-    save = next;
-  }
-
-  s_mp_clamp(mp);
-
-} /* end s_mp_div_2d() */
-
-/* }}} */
-
-/* {{{ s_mp_norm(a, b) */
-
-/*
-  s_mp_norm(a, b)
-
-  Normalize a and b for division, where b is the divisor.  In order
-  that we might make good guesses for quotient digits, we want the
-  leading digit of b to be at least half the radix, which we
-  accomplish by multiplying a and b by a constant.  This constant is
-  returned (so that it can be divided back out of the remainder at the
-  end of the division process).
-
-  We multiply by the smallest power of 2 that gives us a leading digit
-  at least half the radix.  By choosing a power of 2, we simplify the
-  multiplication and division steps to simple shifts.
- */
-mp_digit s_mp_norm(mp_int *a, mp_int *b)
-{
-  mp_digit  t, d = 0;
-
-  t = DIGIT(b, USED(b) - 1);
-  while(t < (RADIX / 2)) {
-    t <<= 1;
-    ++d;
-  }
-
-  if(d != 0) {
-    s_mp_mul_2d(a, d);
-    s_mp_mul_2d(b, d);
-  }
-
-  return d;
-
-} /* end s_mp_norm() */
-
-/* }}} */
-
-/* }}} */
-
-/* {{{ Primitive digit arithmetic */
-
-/* {{{ s_mp_add_d(mp, d) */
-
-/* Add d to |mp| in place                                                 */
-mp_err   s_mp_add_d(mp_int *mp, mp_digit d)    /* unsigned digit addition */
-{
-  mp_word   w, k = 0;
-  mp_size   ix = 1, used = USED(mp);
-  mp_digit *dp = DIGITS(mp);
-
-  w = dp[0] + d;
-  dp[0] = ACCUM(w);
-  k = CARRYOUT(w);
-
-  while(ix < used && k) {
-    w = dp[ix] + k;
-    dp[ix] = ACCUM(w);
-    k = CARRYOUT(w);
-    ++ix;
-  }
-
-  if(k != 0) {
-    mp_err  res;
-
-    if((res = s_mp_pad(mp, USED(mp) + 1)) != MP_OKAY)
-      return res;
-
-    DIGIT(mp, ix) = k;
-  }
-
-  return MP_OKAY;
-
-} /* end s_mp_add_d() */
-
-/* }}} */
-
-/* {{{ s_mp_sub_d(mp, d) */
-
-/* Subtract d from |mp| in place, assumes |mp| > d                        */
-mp_err   s_mp_sub_d(mp_int *mp, mp_digit d)    /* unsigned digit subtract */
-{
-  mp_word   w, b = 0;
-  mp_size   ix = 1, used = USED(mp);
-  mp_digit *dp = DIGITS(mp);
-
-  /* Compute initial subtraction    */
-  w = (RADIX + dp[0]) - d;
-  b = CARRYOUT(w) ? 0 : 1;
-  dp[0] = ACCUM(w);
-
-  /* Propagate borrows leftward     */
-  while(b && ix < used) {
-    w = (RADIX + dp[ix]) - b;
-    b = CARRYOUT(w) ? 0 : 1;
-    dp[ix] = ACCUM(w);
-    ++ix;
-  }
-
-  /* Remove leading zeroes          */
-  s_mp_clamp(mp);
-
-  /* If we have a borrow out, it's a violation of the input invariant */
-  if(b)
-    return MP_RANGE;
-  else
-    return MP_OKAY;
-
-} /* end s_mp_sub_d() */
-
-/* }}} */
-
-/* {{{ s_mp_mul_d(a, d) */
-
-/* Compute a = a * d, single digit multiplication                         */
-mp_err   s_mp_mul_d(mp_int *a, mp_digit d)
-{
-  mp_word w, k = 0;
-  mp_size ix, max;
-  mp_err  res;
-  mp_digit *dp = DIGITS(a);
-
-  /*
-    Single-digit multiplication will increase the precision of the
-    output by at most one digit.  However, we can detect when this
-    will happen -- if the high-order digit of a, times d, gives a
-    two-digit result, then the precision of the result will increase;
-    otherwise it won't.  We use this fact to avoid calling s_mp_pad()
-    unless absolutely necessary.
-   */
-  max = USED(a);
-  w = dp[max - 1] * d;
-  if(CARRYOUT(w) != 0) {
-    if((res = s_mp_pad(a, max + 1)) != MP_OKAY)
-      return res;
-    dp = DIGITS(a);
-  }
-
-  for(ix = 0; ix < max; ix++) {
-    w = (dp[ix] * d) + k;
-    dp[ix] = ACCUM(w);
-    k = CARRYOUT(w);
-  }
-
-  /* If there is a precision increase, take care of it here; the above
-     test guarantees we have enough storage to do this safely.
-   */
-  if(k) {
-    dp[max] = k;
-    USED(a) = max + 1;
-  }
-
-  s_mp_clamp(a);
-
-  return MP_OKAY;
-
-} /* end s_mp_mul_d() */
-
-/* }}} */
-
-/* {{{ s_mp_div_d(mp, d, r) */
-
-/*
-  s_mp_div_d(mp, d, r)
-
-  Compute the quotient mp = mp / d and remainder r = mp mod d, for a
-  single digit d.  If r is null, the remainder will be discarded.
- */
-
-mp_err   s_mp_div_d(mp_int *mp, mp_digit d, mp_digit *r)
-{
-  mp_word   w = 0, t;
-  mp_int    quot;
-  mp_err    res;
-  mp_digit *dp = DIGITS(mp), *qp;
-  int       ix;
-
-  if(d == 0)
-    return MP_RANGE;
-
-  /* Make room for the quotient */
-  if((res = mp_init_size(&quot, USED(mp))) != MP_OKAY)
-    return res;
-
-  USED(&quot) = USED(mp); /* so clamping will work below */
-  qp = DIGITS(&quot);
-
-  /* Divide without subtraction */
-  for(ix = USED(mp) - 1; ix >= 0; ix--) {
-    w = (w << DIGIT_BIT) | dp[ix];
-
-    if(w >= d) {
-      t = w / d;
-      w = w % d;
-    } else {
-      t = 0;
-    }
-
-    qp[ix] = t;
-  }
-
-  /* Deliver the remainder, if desired */
-  if(r)
-    *r = w;
-
-  s_mp_clamp(&quot);
-  mp_exch(&quot, mp);
-  mp_clear(&quot);
-
-  return MP_OKAY;
-
-} /* end s_mp_div_d() */
-
-/* }}} */
-
-/* }}} */
-
-/* {{{ Primitive full arithmetic */
-
-/* {{{ s_mp_add(a, b) */
-
-/* Compute a = |a| + |b|                                                  */
-mp_err   s_mp_add(mp_int *a, mp_int *b)        /* magnitude addition      */
-{
-  mp_word   w = 0;
-  mp_digit *pa, *pb;
-  mp_size   ix, used = USED(b);
-  mp_err    res;
-
-  /* Make sure a has enough precision for the output value */
-  if((used > USED(a)) && (res = s_mp_pad(a, used)) != MP_OKAY)
-    return res;
-
-  /*
-    Add up all digits up to the precision of b.  If b had initially
-    the same precision as a, or greater, we took care of it by the
-    padding step above, so there is no problem.  If b had initially
-    less precision, we'll have to make sure the carry out is duly
-    propagated upward among the higher-order digits of the sum.
-   */
-  pa = DIGITS(a);
-  pb = DIGITS(b);
-  for(ix = 0; ix < used; ++ix) {
-    w += *pa + *pb++;
-    *pa++ = ACCUM(w);
-    w = CARRYOUT(w);
-  }
-
-  /* If we run out of 'b' digits before we're actually done, make
-     sure the carries get propagated upward...
-   */
-  used = USED(a);
-  while(w && ix < used) {
-    w += *pa;
-    *pa++ = ACCUM(w);
-    w = CARRYOUT(w);
-    ++ix;
-  }
-
-  /* If there's an overall carry out, increase precision and include
-     it.  We could have done this initially, but why touch the memory
-     allocator unless we're sure we have to?
-   */
-  if(w) {
-    if((res = s_mp_pad(a, used + 1)) != MP_OKAY)
-      return res;
-
-    DIGIT(a, ix) = w;  /* pa may not be valid after s_mp_pad() call */
-  }
-
-  return MP_OKAY;
-
-} /* end s_mp_add() */
-
-/* }}} */
-
-/* {{{ s_mp_sub(a, b) */
-
-/* Compute a = |a| - |b|, assumes |a| >= |b|                              */
-mp_err   s_mp_sub(mp_int *a, mp_int *b)        /* magnitude subtract      */
-{
-  mp_word   w = 0;
-  mp_digit *pa, *pb;
-  mp_size   ix, used = USED(b);
-
-  /*
-    Subtract and propagate borrow.  Up to the precision of b, this
-    accounts for the digits of b; after that, we just make sure the
-    carries get to the right place.  This saves having to pad b out to
-    the precision of a just to make the loops work right...
-   */
-  pa = DIGITS(a);
-  pb = DIGITS(b);
-
-  for(ix = 0; ix < used; ++ix) {
-    w = (RADIX + *pa) - w - *pb++;
-    *pa++ = ACCUM(w);
-    w = CARRYOUT(w) ? 0 : 1;
-  }
-
-  used = USED(a);
-  while(ix < used) {
-    w = RADIX + *pa - w;
-    *pa++ = ACCUM(w);
-    w = CARRYOUT(w) ? 0 : 1;
-    ++ix;
-  }
-
-  /* Clobber any leading zeroes we created    */
-  s_mp_clamp(a);
-
-  /*
-     If there was a borrow out, then |b| > |a| in violation
-     of our input invariant.  We've already done the work,
-     but we'll at least complain about it...
-   */
-  if(w)
-    return MP_RANGE;
-  else
-    return MP_OKAY;
-
-} /* end s_mp_sub() */
-
-/* }}} */
-
-mp_err   s_mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
-{
-  mp_int   q;
-  mp_err   res;
-  mp_size  um = USED(m);
-
-  if((res = mp_init_copy(&q, x)) != MP_OKAY)
-    return res;
-
-  s_mp_rshd(&q, um - 1);       /* q1 = x / b^(k-1)  */
-  s_mp_mul(&q, mu);            /* q2 = q1 * mu      */
-  s_mp_rshd(&q, um + 1);       /* q3 = q2 / b^(k+1) */
-
-  /* x = x mod b^(k+1), quick (no division) */
-  s_mp_mod_2d(x, (mp_digit)(DIGIT_BIT * (um + 1)));
-
-  /* q = q * m mod b^(k+1), quick (no division), uses the short multiplier */
-#ifndef SHRT_MUL
-  s_mp_mul(&q, m);
-  s_mp_mod_2d(&q, (mp_digit)(DIGIT_BIT * (um + 1)));
-#else
-  s_mp_mul_dig(&q, m, um + 1);
-#endif
-
-  /* x = x - q */
-  if((res = mp_sub(x, &q, x)) != MP_OKAY)
-    goto CLEANUP;
-
-  /* If x < 0, add b^(k+1) to it */
-  if(mp_cmp_z(x) < 0) {
-    mp_set(&q, 1);
-    if((res = s_mp_lshd(&q, um + 1)) != MP_OKAY)
-      goto CLEANUP;
-    if((res = mp_add(x, &q, x)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  /* Back off if it's too big */
-  while(mp_cmp(x, m) >= 0) {
-    if((res = s_mp_sub(x, m)) != MP_OKAY)
-      break;
-  }
-
- CLEANUP:
-  mp_clear(&q);
-
-  return res;
-
-} /* end s_mp_reduce() */
-
-
-
-/* {{{ s_mp_mul(a, b) */
-
-/* Compute a = |a| * |b|                                                  */
-mp_err   s_mp_mul(mp_int *a, mp_int *b)
-{
-  mp_word   w, k = 0;
-  mp_int    tmp;
-  mp_err    res;
-  mp_size   ix, jx, ua = USED(a), ub = USED(b);
-  mp_digit *pa, *pb, *pt, *pbt;
-
-  if((res = mp_init_size(&tmp, ua + ub)) != MP_OKAY)
-    return res;
-
-  /* This has the effect of left-padding with zeroes... */
-  USED(&tmp) = ua + ub;
-
-  /* We're going to need the base value each iteration */
-  pbt = DIGITS(&tmp);
-
-  /* Outer loop:  Digits of b */
-
-  pb = DIGITS(b);
-  for(ix = 0; ix < ub; ++ix, ++pb) {
-    if(*pb == 0)
-      continue;
-
-    /* Inner product:  Digits of a */
-    pa = DIGITS(a);
-    for(jx = 0; jx < ua; ++jx, ++pa) {
-      pt = pbt + ix + jx;
-      w = *pb * *pa + k + *pt;
-      *pt = ACCUM(w);
-      k = CARRYOUT(w);
-    }
-
-    pbt[ix + jx] = k;
-    k = 0;
-  }
-
-  s_mp_clamp(&tmp);
-  s_mp_exch(&tmp, a);
-
-  mp_clear(&tmp);
-
-  return MP_OKAY;
-
-} /* end s_mp_mul() */
-
-/* }}} */
-
-/* {{{ s_mp_kmul(a, b, out, len) */
-
-#if 0
-void   s_mp_kmul(mp_digit *a, mp_digit *b, mp_digit *out, mp_size len)
-{
-  mp_word   w, k = 0;
-  mp_size   ix, jx;
-  mp_digit *pa, *pt;
-
-  for(ix = 0; ix < len; ++ix, ++b) {
-    if(*b == 0)
-      continue;
-
-    pa = a;
-    for(jx = 0; jx < len; ++jx, ++pa) {
-      pt = out + ix + jx;
-      w = *b * *pa + k + *pt;
-      *pt = ACCUM(w);
-      k = CARRYOUT(w);
-    }
-
-    out[ix + jx] = k;
-    k = 0;
-  }
-
-} /* end s_mp_kmul() */
-#endif
-
-/* }}} */
-
-/* {{{ s_mp_sqr(a) */
-
-/*
-  Computes the square of a, in place.  This can be done more
-  efficiently than a general multiplication, because many of the
-  computation steps are redundant when squaring.  The inner product
-  step is a bit more complicated, but we save a fair number of
-  iterations of the multiplication loop.
- */
-#if MP_SQUARE
-mp_err   s_mp_sqr(mp_int *a)
-{
-  mp_word  w, k = 0;
-  mp_int   tmp;
-  mp_err   res;
-  mp_size  ix, jx, kx, used = USED(a);
-  mp_digit *pa1, *pa2, *pt, *pbt;
-
-  if((res = mp_init_size(&tmp, 2 * used)) != MP_OKAY)
-    return res;
-
-  /* Left-pad with zeroes */
-  USED(&tmp) = 2 * used;
-
-  /* We need the base value each time through the loop */
-  pbt = DIGITS(&tmp);
-
-  pa1 = DIGITS(a);
-  for(ix = 0; ix < used; ++ix, ++pa1) {
-    if(*pa1 == 0)
-      continue;
-
-    w = DIGIT(&tmp, ix + ix) + (*pa1 * *pa1);
-
-    pbt[ix + ix] = ACCUM(w);
-    k = CARRYOUT(w);
-
-    /*
-      The inner product is computed as:
-
-         (C, S) = t[i,j] + 2 a[i] a[j] + C
-
-      This can overflow what can be represented in an mp_word, and
-      since C arithmetic does not provide any way to check for
-      overflow, we have to check explicitly for overflow conditions
-      before they happen.
-     */
-    for(jx = ix + 1, pa2 = DIGITS(a) + jx; jx < used; ++jx, ++pa2) {
-      mp_word  u = 0, v;
-
-      /* Store this in a temporary to avoid indirections later */
-      pt = pbt + ix + jx;
-
-      /* Compute the multiplicative step */
-      w = *pa1 * *pa2;
-
-      /* If w is more than half MP_WORD_MAX, the doubling will
-	 overflow, and we need to record a carry out into the next
-	 word */
-      u = (w >> (MP_WORD_BIT - 1)) & 1;
-
-      /* Double what we've got, overflow will be ignored as defined
-	 for C arithmetic (we've already noted if it is to occur)
-       */
-      w *= 2;
-
-      /* Compute the additive step */
-      v = *pt + k;
-
-      /* If we do not already have an overflow carry, check to see
-	 if the addition will cause one, and set the carry out if so
-       */
-      u |= ((MP_WORD_MAX - v) < w);
-
-      /* Add in the rest, again ignoring overflow */
-      w += v;
-
-      /* Set the i,j digit of the output */
-      *pt = ACCUM(w);
-
-      /* Save carry information for the next iteration of the loop.
-	 This is why k must be an mp_word, instead of an mp_digit */
-      k = CARRYOUT(w) | (u << DIGIT_BIT);
-
-    } /* for(jx ...) */
-
-    /* Set the last digit in the cycle and reset the carry */
-    k = DIGIT(&tmp, ix + jx) + k;
-    pbt[ix + jx] = ACCUM(k);
-    k = CARRYOUT(k);
-
-    /* If we are carrying out, propagate the carry to the next digit
-       in the output.  This may cascade, so we have to be somewhat
-       circumspect -- but we will have enough precision in the output
-       that we won't overflow
-     */
-    kx = 1;
-    while(k) {
-      k = pbt[ix + jx + kx] + 1;
-      pbt[ix + jx + kx] = ACCUM(k);
-      k = CARRYOUT(k);
-      ++kx;
-    }
-  } /* for(ix ...) */
-
-  s_mp_clamp(&tmp);
-  s_mp_exch(&tmp, a);
-
-  mp_clear(&tmp);
-
-  return MP_OKAY;
-
-} /* end s_mp_sqr() */
-#endif
-
-/* }}} */
-
-/* {{{ s_mp_div(a, b) */
-
-/*
-  s_mp_div(a, b)
-
-  Compute a = a / b and b = a mod b.  Assumes b > a.
- */
-
-mp_err   s_mp_div(mp_int *a, mp_int *b)
-{
-  mp_int   quot, rem, t;
-  mp_word  q;
-  mp_err   res;
-  mp_digit d;
-  int      ix;
-
-  if(mp_cmp_z(b) == 0)
-    return MP_RANGE;
-
-  /* Shortcut if b is power of two */
-  if((ix = s_mp_ispow2(b)) >= 0) {
-    mp_copy(a, b);  /* need this for remainder */
-    s_mp_div_2d(a, (mp_digit)ix);
-    s_mp_mod_2d(b, (mp_digit)ix);
-
-    return MP_OKAY;
-  }
-
-  /* Allocate space to store the quotient */
-  if((res = mp_init_size(&quot, USED(a))) != MP_OKAY)
-    return res;
-
-  /* A working temporary for division     */
-  if((res = mp_init_size(&t, USED(a))) != MP_OKAY)
-    goto T;
-
-  /* Allocate space for the remainder     */
-  if((res = mp_init_size(&rem, USED(a))) != MP_OKAY)
-    goto REM;
-
-  /* Normalize to optimize guessing       */
-  d = s_mp_norm(a, b);
-
-  /* Perform the division itself...woo!   */
-  ix = USED(a) - 1;
-
-  while(ix >= 0) {
-    /* Find a partial substring of a which is at least b */
-    while(s_mp_cmp(&rem, b) < 0 && ix >= 0) {
-      if((res = s_mp_lshd(&rem, 1)) != MP_OKAY)
-	goto CLEANUP;
-
-      if((res = s_mp_lshd(&quot, 1)) != MP_OKAY)
-	goto CLEANUP;
-
-      DIGIT(&rem, 0) = DIGIT(a, ix);
-      s_mp_clamp(&rem);
-      --ix;
-    }
-
-    /* If we didn't find one, we're finished dividing    */
-    if(s_mp_cmp(&rem, b) < 0)
-      break;
-
-    /* Compute a guess for the next quotient digit       */
-    q = DIGIT(&rem, USED(&rem) - 1);
-    if(q <= DIGIT(b, USED(b) - 1) && USED(&rem) > 1)
-      q = (q << DIGIT_BIT) | DIGIT(&rem, USED(&rem) - 2);
-
-    q /= DIGIT(b, USED(b) - 1);
-
-    /* The guess can be as much as RADIX + 1 */
-    if(q >= RADIX)
-      q = RADIX - 1;
-
-    /* See what that multiplies out to                   */
-    mp_copy(b, &t);
-    if((res = s_mp_mul_d(&t, q)) != MP_OKAY)
-      goto CLEANUP;
-
-    /*
-       If it's too big, back it off.  We should not have to do this
-       more than once, or, in rare cases, twice.  Knuth describes a
-       method by which this could be reduced to a maximum of once, but
-       I didn't implement that here.
-     */
-    while(s_mp_cmp(&t, &rem) > 0) {
-      --q;
-      s_mp_sub(&t, b);
-    }
-
-    /* At this point, q should be the right next digit   */
-    if((res = s_mp_sub(&rem, &t)) != MP_OKAY)
-      goto CLEANUP;
-
-    /*
-      Include the digit in the quotient.  We allocated enough memory
-      for any quotient we could ever possibly get, so we should not
-      have to check for failures here
-     */
-    DIGIT(&quot, 0) = q;
-  }
-
-  /* Denormalize remainder                */
-  if(d != 0)
-    s_mp_div_2d(&rem, d);
-
-  s_mp_clamp(&quot);
-  s_mp_clamp(&rem);
-
-  /* Copy quotient back to output         */
-  s_mp_exch(&quot, a);
-
-  /* Copy remainder back to output        */
-  s_mp_exch(&rem, b);
-
-CLEANUP:
-  mp_clear(&rem);
-REM:
-  mp_clear(&t);
-T:
-  mp_clear(&quot);
-
-  return res;
-
-} /* end s_mp_div() */
-
-/* }}} */
-
-/* {{{ s_mp_2expt(a, k) */
-
-mp_err   s_mp_2expt(mp_int *a, mp_digit k)
-{
-  mp_err    res;
-  mp_size   dig, bit;
-
-  dig = k / DIGIT_BIT;
-  bit = k % DIGIT_BIT;
-
-  mp_zero(a);
-  if((res = s_mp_pad(a, dig + 1)) != MP_OKAY)
-    return res;
-
-  DIGIT(a, dig) |= (1 << bit);
-
-  return MP_OKAY;
-
-} /* end s_mp_2expt() */
-
-/* }}} */
-
-
-/* }}} */
-
-/* }}} */
-
-/* {{{ Primitive comparisons */
-
-/* {{{ s_mp_cmp(a, b) */
-
-/* Compare |a| <=> |b|, return 0 if equal, <0 if a<b, >0 if a>b           */
-int      s_mp_cmp(mp_int *a, mp_int *b)
-{
-  mp_size   ua = USED(a), ub = USED(b);
-
-  if(ua > ub)
-    return MP_GT;
-  else if(ua < ub)
-    return MP_LT;
-  else {
-    int      ix = ua - 1;
-    mp_digit *ap = DIGITS(a) + ix, *bp = DIGITS(b) + ix;
-
-    while(ix >= 0) {
-      if(*ap > *bp)
-	return MP_GT;
-      else if(*ap < *bp)
-	return MP_LT;
-
-      --ap; --bp; --ix;
-    }
-
-    return MP_EQ;
-  }
-
-} /* end s_mp_cmp() */
-
-/* }}} */
-
-/* {{{ s_mp_cmp_d(a, d) */
-
-/* Compare |a| <=> d, return 0 if equal, <0 if a<d, >0 if a>d             */
-int      s_mp_cmp_d(mp_int *a, mp_digit d)
-{
-  mp_size  ua = USED(a);
-  mp_digit *ap = DIGITS(a);
-
-  if(ua > 1)
-    return MP_GT;
-
-  if(*ap < d)
-    return MP_LT;
-  else if(*ap > d)
-    return MP_GT;
-  else
-    return MP_EQ;
-
-} /* end s_mp_cmp_d() */
-
-/* }}} */
-
-/* {{{ s_mp_ispow2(v) */
-
-/*
-  Returns -1 if the value is not a power of two; otherwise, it returns
-  k such that v = 2^k, i.e. lg(v).
- */
-int      s_mp_ispow2(mp_int *v)
-{
-  mp_digit d, *dp;
-  mp_size  uv = USED(v);
-  int      extra = 0, ix;
-
-  d = DIGIT(v, uv - 1); /* most significant digit of v */
-
-  while(d && ((d & 1) == 0)) {
-    d >>= 1;
-    ++extra;
-  }
-
-  if(d == 1) {
-    ix = uv - 2;
-    dp = DIGITS(v) + ix;
-
-    while(ix >= 0) {
-      if(*dp)
-	return -1; /* not a power of two */
-
-      --dp; --ix;
-    }
-
-    return ((uv - 1) * DIGIT_BIT) + extra;
-  }
-
-  return -1;
-
-} /* end s_mp_ispow2() */
-
-/* }}} */
-
-/* {{{ s_mp_ispow2d(d) */
-
-int      s_mp_ispow2d(mp_digit d)
-{
-  int   pow = 0;
-
-  while((d & 1) == 0) {
-    ++pow; d >>= 1;
-  }
-
-  if(d == 1)
-    return pow;
-
-  return -1;
-
-} /* end s_mp_ispow2d() */
-
-/* }}} */
-
-/* }}} */
-
-/* {{{ Primitive I/O helpers */
-
-/* {{{ s_mp_tovalue(ch, r) */
-
-/*
-  Convert the given character to its digit value, in the given radix.
-  If the given character is not understood in the given radix, -1 is
-  returned.  Otherwise the digit's numeric value is returned.
-
-  The results will be odd if you use a radix < 2 or > 62, you are
-  expected to know what you're up to.
- */
-int      s_mp_tovalue(char ch, int r)
-{
-  int    val, xch;
-
-  if(r > 36)
-    xch = ch;
-  else
-    xch = toupper(ch);
-
-  if(isdigit(xch))
-    val = xch - '0';
-  else if(isupper(xch))
-    val = xch - 'A' + 10;
-  else if(islower(xch))
-    val = xch - 'a' + 36;
-  else if(xch == '+')
-    val = 62;
-  else if(xch == '/')
-    val = 63;
-  else
-    return -1;
-
-  if(val < 0 || val >= r)
-    return -1;
-
-  return val;
-
-} /* end s_mp_tovalue() */
-
-/* }}} */
-
-/* {{{ s_mp_todigit(val, r, low) */
-
-/*
-  Convert val to a radix-r digit, if possible.  If val is out of range
-  for r, returns zero.  Otherwise, returns an ASCII character denoting
-  the value in the given radix.
-
-  The results may be odd if you use a radix < 2 or > 64, you are
-  expected to know what you're doing.
- */
-
-char     s_mp_todigit(int val, int r, int low)
-{
-  char   ch;
-
-  if(val < 0 || val >= r)
-    return 0;
-
-  ch = s_dmap_1[val];
-
-  if(r <= 36 && low)
-    ch = tolower(ch);
-
-  return ch;
-
-} /* end s_mp_todigit() */
-
-/* }}} */
-
-/* {{{ s_mp_outlen(bits, radix) */
-
-/*
-   Return an estimate for how long a string is needed to hold a radix
-   r representation of a number with 'bits' significant bits.
-
-   Does not include space for a sign or a NUL terminator.
- */
-int      s_mp_outlen(int bits, int r)
-{
-  return (int)((double)bits * LOG_V_2(r));
-
-} /* end s_mp_outlen() */
-
-/* }}} */
-
-/* }}} */
-
-/*------------------------------------------------------------------------*/
-/* HERE THERE BE DRAGONS                                                  */
-/* crc==4242132123, version==2, Sat Feb 02 06:43:52 2002 */
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/mtest/mpi.h b/libtommath/mtest/mpi.h
deleted file mode 100644
index 5accb52..0000000
--- a/libtommath/mtest/mpi.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
-    mpi.h
-
-    by Michael J. Fromberger <sting@linguist.dartmouth.edu>
-    Copyright (C) 1998 Michael J. Fromberger, All Rights Reserved
-
-    Arbitrary precision integer arithmetic library
-
-    $Id$
- */
-
-#ifndef _H_MPI_
-#define _H_MPI_
-
-#include "mpi-config.h"
-
-#define  MP_LT       -1
-#define  MP_EQ        0
-#define  MP_GT        1
-
-#if MP_DEBUG
-#undef MP_IOFUNC
-#define MP_IOFUNC 1
-#endif
-
-#if MP_IOFUNC
-#include <stdio.h>
-#include <ctype.h>
-#endif
-
-#include <limits.h>
-
-#define  MP_NEG  1
-#define  MP_ZPOS 0
-
-/* Included for compatibility... */
-#define  NEG     MP_NEG
-#define  ZPOS    MP_ZPOS
-
-#define  MP_OKAY          0 /* no error, all is well */
-#define  MP_YES           0 /* yes (boolean result)  */
-#define  MP_NO           -1 /* no (boolean result)   */
-#define  MP_MEM          -2 /* out of memory         */
-#define  MP_RANGE        -3 /* argument out of range */
-#define  MP_BADARG       -4 /* invalid parameter     */
-#define  MP_UNDEF        -5 /* answer is undefined   */
-#define  MP_LAST_CODE    MP_UNDEF
-
-#include "mpi-types.h"
-
-/* Included for compatibility... */
-#define DIGIT_BIT         MP_DIGIT_BIT
-#define DIGIT_MAX         MP_DIGIT_MAX
-
-/* Macros for accessing the mp_int internals           */
-#define  SIGN(MP)     ((MP)->sign)
-#define  USED(MP)     ((MP)->used)
-#define  ALLOC(MP)    ((MP)->alloc)
-#define  DIGITS(MP)   ((MP)->dp)
-#define  DIGIT(MP,N)  (MP)->dp[(N)]
-
-#if MP_ARGCHK == 1
-#define  ARGCHK(X,Y)  {if(!(X)){return (Y);}}
-#elif MP_ARGCHK == 2
-#include <assert.h>
-#define  ARGCHK(X,Y)  assert(X)
-#else
-#define  ARGCHK(X,Y)  /*  */
-#endif
-
-/* This defines the maximum I/O base (minimum is 2)   */
-#define MAX_RADIX         64
-
-typedef struct {
-  mp_sign       sign;    /* sign of this quantity      */
-  mp_size       alloc;   /* how many digits allocated  */
-  mp_size       used;    /* how many digits used       */
-  mp_digit     *dp;      /* the digits themselves      */
-} mp_int;
-
-/*------------------------------------------------------------------------*/
-/* Default precision                                                      */
-
-unsigned int mp_get_prec(void);
-void         mp_set_prec(unsigned int prec);
-
-/*------------------------------------------------------------------------*/
-/* Memory management                                                      */
-
-mp_err mp_init(mp_int *mp);
-mp_err mp_init_array(mp_int mp[], int count);
-mp_err mp_init_size(mp_int *mp, mp_size prec);
-mp_err mp_init_copy(mp_int *mp, mp_int *from);
-mp_err mp_copy(mp_int *from, mp_int *to);
-void   mp_exch(mp_int *mp1, mp_int *mp2);
-void   mp_clear(mp_int *mp);
-void   mp_clear_array(mp_int mp[], int count);
-void   mp_zero(mp_int *mp);
-void   mp_set(mp_int *mp, mp_digit d);
-mp_err mp_set_int(mp_int *mp, long z);
-mp_err mp_shrink(mp_int *a);
-
-
-/*------------------------------------------------------------------------*/
-/* Single digit arithmetic                                                */
-
-mp_err mp_add_d(mp_int *a, mp_digit d, mp_int *b);
-mp_err mp_sub_d(mp_int *a, mp_digit d, mp_int *b);
-mp_err mp_mul_d(mp_int *a, mp_digit d, mp_int *b);
-mp_err mp_mul_2(mp_int *a, mp_int *c);
-mp_err mp_div_d(mp_int *a, mp_digit d, mp_int *q, mp_digit *r);
-mp_err mp_div_2(mp_int *a, mp_int *c);
-mp_err mp_expt_d(mp_int *a, mp_digit d, mp_int *c);
-
-/*------------------------------------------------------------------------*/
-/* Sign manipulations                                                     */
-
-mp_err mp_abs(mp_int *a, mp_int *b);
-mp_err mp_neg(mp_int *a, mp_int *b);
-
-/*------------------------------------------------------------------------*/
-/* Full arithmetic                                                        */
-
-mp_err mp_add(mp_int *a, mp_int *b, mp_int *c);
-mp_err mp_sub(mp_int *a, mp_int *b, mp_int *c);
-mp_err mp_mul(mp_int *a, mp_int *b, mp_int *c);
-mp_err mp_mul_2d(mp_int *a, mp_digit d, mp_int *c);
-#if MP_SQUARE
-mp_err mp_sqr(mp_int *a, mp_int *b);
-#else
-#define mp_sqr(a, b) mp_mul(a, a, b)
-#endif
-mp_err mp_div(mp_int *a, mp_int *b, mp_int *q, mp_int *r);
-mp_err mp_div_2d(mp_int *a, mp_digit d, mp_int *q, mp_int *r);
-mp_err mp_expt(mp_int *a, mp_int *b, mp_int *c);
-mp_err mp_2expt(mp_int *a, mp_digit k);
-mp_err mp_sqrt(mp_int *a, mp_int *b);
-
-/*------------------------------------------------------------------------*/
-/* Modular arithmetic                                                     */
-
-#if MP_MODARITH
-mp_err mp_mod(mp_int *a, mp_int *m, mp_int *c);
-mp_err mp_mod_d(mp_int *a, mp_digit d, mp_digit *c);
-mp_err mp_addmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c);
-mp_err mp_submod(mp_int *a, mp_int *b, mp_int *m, mp_int *c);
-mp_err mp_mulmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c);
-#if MP_SQUARE
-mp_err mp_sqrmod(mp_int *a, mp_int *m, mp_int *c);
-#else
-#define mp_sqrmod(a, m, c) mp_mulmod(a, a, m, c)
-#endif
-mp_err mp_exptmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c);
-mp_err mp_exptmod_d(mp_int *a, mp_digit d, mp_int *m, mp_int *c);
-#endif /* MP_MODARITH */
-
-/*------------------------------------------------------------------------*/
-/* Comparisons                                                            */
-
-int    mp_cmp_z(mp_int *a);
-int    mp_cmp_d(mp_int *a, mp_digit d);
-int    mp_cmp(mp_int *a, mp_int *b);
-int    mp_cmp_mag(mp_int *a, mp_int *b);
-int    mp_cmp_int(mp_int *a, long z);
-int    mp_isodd(mp_int *a);
-int    mp_iseven(mp_int *a);
-
-/*------------------------------------------------------------------------*/
-/* Number theoretic                                                       */
-
-#if MP_NUMTH
-mp_err mp_gcd(mp_int *a, mp_int *b, mp_int *c);
-mp_err mp_lcm(mp_int *a, mp_int *b, mp_int *c);
-mp_err mp_xgcd(mp_int *a, mp_int *b, mp_int *g, mp_int *x, mp_int *y);
-mp_err mp_invmod(mp_int *a, mp_int *m, mp_int *c);
-#endif /* end MP_NUMTH */
-
-/*------------------------------------------------------------------------*/
-/* Input and output                                                       */
-
-#if MP_IOFUNC
-void   mp_print(mp_int *mp, FILE *ofp);
-#endif /* end MP_IOFUNC */
-
-/*------------------------------------------------------------------------*/
-/* Base conversion                                                        */
-
-#define BITS     1
-#define BYTES    CHAR_BIT
-
-mp_err mp_read_signed_bin(mp_int *mp, unsigned char *str, int len);
-int    mp_signed_bin_size(mp_int *mp);
-mp_err mp_to_signed_bin(mp_int *mp, unsigned char *str);
-
-mp_err mp_read_unsigned_bin(mp_int *mp, unsigned char *str, int len);
-int    mp_unsigned_bin_size(mp_int *mp);
-mp_err mp_to_unsigned_bin(mp_int *mp, unsigned char *str);
-
-int    mp_count_bits(mp_int *mp);
-
-#if MP_COMPAT_MACROS
-#define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
-#define mp_raw_size(mp)           mp_signed_bin_size(mp)
-#define mp_toraw(mp, str)         mp_to_signed_bin((mp), (str))
-#define mp_read_mag(mp, str, len) mp_read_unsigned_bin((mp), (str), (len))
-#define mp_mag_size(mp)           mp_unsigned_bin_size(mp)
-#define mp_tomag(mp, str)         mp_to_unsigned_bin((mp), (str))
-#endif
-
-mp_err mp_read_radix(mp_int *mp, unsigned char *str, int radix);
-int    mp_radix_size(mp_int *mp, int radix);
-int    mp_value_radix_size(int num, int qty, int radix);
-mp_err mp_toradix(mp_int *mp, char *str, int radix);
-
-int    mp_char2value(char ch, int r);
-
-#define mp_tobinary(M, S)  mp_toradix((M), (S), 2)
-#define mp_tooctal(M, S)   mp_toradix((M), (S), 8)
-#define mp_todecimal(M, S) mp_toradix((M), (S), 10)
-#define mp_tohex(M, S)     mp_toradix((M), (S), 16)
-
-/*------------------------------------------------------------------------*/
-/* Error strings                                                          */
-
-const  char  *mp_strerror(mp_err ec);
-
-#endif /* end _H_MPI_ */
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/mtest/mtest.c b/libtommath/mtest/mtest.c
deleted file mode 100644
index 56b5a90..0000000
--- a/libtommath/mtest/mtest.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/* makes a bignum test harness with NUM tests per operation
- *
- * the output is made in the following format [one parameter per line]
-
-operation
-operand1
-operand2
-[... operandN]
-result1
-result2
-[... resultN]
-
-So for example "a * b mod n" would be
-
-mulmod
-a
-b
-n
-a*b mod n
-
-e.g. if a=3, b=4 n=11 then
-
-mulmod
-3
-4
-11
-1
-
- */
-
-#ifdef MP_8BIT
-#define THE_MASK 127
-#else
-#define THE_MASK 32767
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include "mpi.c"
-
-#ifdef LTM_MTEST_REAL_RAND
-#define getRandChar() fgetc(rng)
-FILE *rng;
-#else
-#define getRandChar() (rand()&0xFF)
-#endif
-
-void rand_num(mp_int *a)
-{
-   int size;
-   unsigned char buf[2048];
-   size_t sz;
-
-   size = 1 + ((getRandChar()<<8) + getRandChar()) % 101;
-   buf[0] = (getRandChar()&1)?1:0;
-#ifdef LTM_MTEST_REAL_RAND
-   sz = fread(buf+1, 1, size, rng);
-#else
-   sz = 1;
-   while (sz < (unsigned)size) {
-       buf[sz] = getRandChar();
-       ++sz;
-   }
-#endif
-   if (sz != (unsigned)size) {
-       fprintf(stderr, "\nWarning: fread failed\n\n");
-   }
-   while (buf[1] == 0) buf[1] = getRandChar();
-   mp_read_raw(a, buf, 1+size);
-}
-
-void rand_num2(mp_int *a)
-{
-   int size;
-   unsigned char buf[2048];
-   size_t sz;
-
-   size = 10 + ((getRandChar()<<8) + getRandChar()) % 101;
-   buf[0] = (getRandChar()&1)?1:0;
-#ifdef LTM_MTEST_REAL_RAND
-   sz = fread(buf+1, 1, size, rng);
-#else
-   sz = 1;
-   while (sz < (unsigned)size) {
-       buf[sz] = getRandChar();
-       ++sz;
-   }
-#endif
-   if (sz != (unsigned)size) {
-       fprintf(stderr, "\nWarning: fread failed\n\n");
-   }
-   while (buf[1] == 0) buf[1] = getRandChar();
-   mp_read_raw(a, buf, 1+size);
-}
-
-#define mp_to64(a, b) mp_toradix(a, b, 64)
-
-int main(int argc, char *argv[])
-{
-   int n, tmp;
-   long long max;
-   mp_int a, b, c, d, e;
-#ifdef MTEST_NO_FULLSPEED
-   clock_t t1;
-#endif
-   char buf[4096];
-
-   mp_init(&a);
-   mp_init(&b);
-   mp_init(&c);
-   mp_init(&d);
-   mp_init(&e);
-
-   if (argc > 1) {
-       max = strtol(argv[1], NULL, 0);
-       if (max < 0) {
-           if (max > -64) {
-               max = (1 << -(max)) + 1;
-           } else {
-               max = 1;
-           }
-       } else if (max == 0) {
-           max = 1;
-       }
-   }
-   else {
-       max = 0;
-   }
-
-
-   /* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */
-/*
-   mp_set(&a, 1);
-   for (n = 1; n < 8192; n++) {
-       mp_mul(&a, &a, &c);
-       printf("mul\n");
-       mp_to64(&a, buf);
-       printf("%s\n%s\n", buf, buf);
-       mp_to64(&c, buf);
-       printf("%s\n", buf);
-
-       mp_add_d(&a, 1, &a);
-       mp_mul_2(&a, &a);
-       mp_sub_d(&a, 1, &a);
-   }
-*/
-
-#ifdef LTM_MTEST_REAL_RAND
-   rng = fopen("/dev/urandom", "rb");
-   if (rng == NULL) {
-      rng = fopen("/dev/random", "rb");
-      if (rng == NULL) {
-         fprintf(stderr, "\nWarning:  stdin used as random source\n\n");
-         rng = stdin;
-      }
-   }
-#else
-   srand(23);
-#endif
-
-#ifdef MTEST_NO_FULLSPEED
-   t1 = clock();
-#endif
-   for (;;) {
-#ifdef MTEST_NO_FULLSPEED
-      if (clock() - t1 > CLOCKS_PER_SEC) {
-         sleep(2);
-         t1 = clock();
-      }
-#endif
-       n = getRandChar() % 15;
-
-       if (max != 0) {
-           --max;
-           if (max == 0)
-             n = 255;
-       }
-
-   if (n == 0) {
-       /* add tests */
-       rand_num(&a);
-       rand_num(&b);
-       mp_add(&a, &b, &c);
-       printf("add\n");
-       mp_to64(&a, buf);
-       printf("%s\n", buf);
-       mp_to64(&b, buf);
-       printf("%s\n", buf);
-       mp_to64(&c, buf);
-       printf("%s\n", buf);
-   } else if (n == 1) {
-      /* sub tests */
-       rand_num(&a);
-       rand_num(&b);
-       mp_sub(&a, &b, &c);
-       printf("sub\n");
-       mp_to64(&a, buf);
-       printf("%s\n", buf);
-       mp_to64(&b, buf);
-       printf("%s\n", buf);
-       mp_to64(&c, buf);
-       printf("%s\n", buf);
-   } else if (n == 2) {
-       /* mul tests */
-       rand_num(&a);
-       rand_num(&b);
-       mp_mul(&a, &b, &c);
-       printf("mul\n");
-       mp_to64(&a, buf);
-       printf("%s\n", buf);
-       mp_to64(&b, buf);
-       printf("%s\n", buf);
-       mp_to64(&c, buf);
-       printf("%s\n", buf);
-   } else if (n == 3) {
-      /* div tests */
-       rand_num(&a);
-       rand_num(&b);
-       mp_div(&a, &b, &c, &d);
-       printf("div\n");
-       mp_to64(&a, buf);
-       printf("%s\n", buf);
-       mp_to64(&b, buf);
-       printf("%s\n", buf);
-       mp_to64(&c, buf);
-       printf("%s\n", buf);
-       mp_to64(&d, buf);
-       printf("%s\n", buf);
-   } else if (n == 4) {
-      /* sqr tests */
-       rand_num(&a);
-       mp_sqr(&a, &b);
-       printf("sqr\n");
-       mp_to64(&a, buf);
-       printf("%s\n", buf);
-       mp_to64(&b, buf);
-       printf("%s\n", buf);
-   } else if (n == 5) {
-      /* mul_2d test */
-      rand_num(&a);
-      mp_copy(&a, &b);
-      n = getRandChar() & 63;
-      mp_mul_2d(&b, n, &b);
-      mp_to64(&a, buf);
-      printf("mul2d\n");
-      printf("%s\n", buf);
-      printf("%d\n", n);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-   } else if (n == 6) {
-      /* div_2d test */
-      rand_num(&a);
-      mp_copy(&a, &b);
-      n = getRandChar() & 63;
-      mp_div_2d(&b, n, &b, NULL);
-      mp_to64(&a, buf);
-      printf("div2d\n");
-      printf("%s\n", buf);
-      printf("%d\n", n);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-   } else if (n == 7) {
-      /* gcd test */
-      rand_num(&a);
-      rand_num(&b);
-      a.sign = MP_ZPOS;
-      b.sign = MP_ZPOS;
-      mp_gcd(&a, &b, &c);
-      printf("gcd\n");
-      mp_to64(&a, buf);
-      printf("%s\n", buf);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-      mp_to64(&c, buf);
-      printf("%s\n", buf);
-   } else if (n == 8) {
-      /* lcm test */
-      rand_num(&a);
-      rand_num(&b);
-      a.sign = MP_ZPOS;
-      b.sign = MP_ZPOS;
-      mp_lcm(&a, &b, &c);
-      printf("lcm\n");
-      mp_to64(&a, buf);
-      printf("%s\n", buf);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-      mp_to64(&c, buf);
-      printf("%s\n", buf);
-   } else if (n == 9) {
-      /* exptmod test */
-      rand_num2(&a);
-      rand_num2(&b);
-      rand_num2(&c);
-//      if (c.dp[0]&1) mp_add_d(&c, 1, &c);
-      a.sign = b.sign = c.sign = 0;
-      mp_exptmod(&a, &b, &c, &d);
-      printf("expt\n");
-      mp_to64(&a, buf);
-      printf("%s\n", buf);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-      mp_to64(&c, buf);
-      printf("%s\n", buf);
-      mp_to64(&d, buf);
-      printf("%s\n", buf);
-   } else if (n == 10) {
-      /* invmod test */
-      rand_num2(&a);
-      rand_num2(&b);
-      b.sign = MP_ZPOS;
-      a.sign = MP_ZPOS;
-      mp_gcd(&a, &b, &c);
-      if (mp_cmp_d(&c, 1) != 0) continue;
-      if (mp_cmp_d(&b, 1) == 0) continue;
-      mp_invmod(&a, &b, &c);
-      printf("invmod\n");
-      mp_to64(&a, buf);
-      printf("%s\n", buf);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-      mp_to64(&c, buf);
-      printf("%s\n", buf);
-   } else if (n == 11) {
-      rand_num(&a);
-      mp_mul_2(&a, &a);
-      mp_div_2(&a, &b);
-      printf("div2\n");
-      mp_to64(&a, buf);
-      printf("%s\n", buf);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-   } else if (n == 12) {
-      rand_num2(&a);
-      mp_mul_2(&a, &b);
-      printf("mul2\n");
-      mp_to64(&a, buf);
-      printf("%s\n", buf);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-   } else if (n == 13) {
-      rand_num2(&a);
-      tmp = abs(rand()) & THE_MASK;
-      mp_add_d(&a, tmp, &b);
-      printf("add_d\n");
-      mp_to64(&a, buf);
-      printf("%s\n%d\n", buf, tmp);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-   } else if (n == 14) {
-      rand_num2(&a);
-      tmp = abs(rand()) & THE_MASK;
-      mp_sub_d(&a, tmp, &b);
-      printf("sub_d\n");
-      mp_to64(&a, buf);
-      printf("%s\n%d\n", buf, tmp);
-      mp_to64(&b, buf);
-      printf("%s\n", buf);
-   } else if (n == 255) {
-      printf("exit\n");
-      break;
-   }
-
-   }
-#ifdef LTM_MTEST_REAL_RAND
-   fclose(rng);
-#endif
-   return 0;
-}
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
diff --git a/libtommath/pics/design_process.sxd b/libtommath/pics/design_process.sxd
deleted file mode 100644
index 7414dbb..0000000
--- a/libtommath/pics/design_process.sxd
+++ /dev/null
diff --git a/libtommath/pics/design_process.tif b/libtommath/pics/design_process.tif
deleted file mode 100644
index 4a0c012..0000000
--- a/libtommath/pics/design_process.tif
+++ /dev/null
diff --git a/libtommath/pics/expt_state.sxd b/libtommath/pics/expt_state.sxd
deleted file mode 100644
index 6518404..0000000
--- a/libtommath/pics/expt_state.sxd
+++ /dev/null
diff --git a/libtommath/pics/expt_state.tif b/libtommath/pics/expt_state.tif
deleted file mode 100644
index 0aaee39..0000000
--- a/libtommath/pics/expt_state.tif
+++ /dev/null
diff --git a/libtommath/pics/makefile b/libtommath/pics/makefile
deleted file mode 100644
index 3ecb02f..0000000
--- a/libtommath/pics/makefile
+++ /dev/null
@@ -1,35 +0,0 @@
-# makes the images... yeah
-
-default:  pses
-
-design_process.ps: design_process.tif
-	tiff2ps -s -e design_process.tif > design_process.ps
-
-sliding_window.ps: sliding_window.tif
-	tiff2ps -s -e sliding_window.tif > sliding_window.ps
-	
-expt_state.ps: expt_state.tif
-	tiff2ps -s -e expt_state.tif > expt_state.ps
-
-primality.ps: primality.tif
-	tiff2ps -s -e primality.tif > primality.ps
-
-design_process.pdf: design_process.ps
-	epstopdf design_process.ps
-
-sliding_window.pdf: sliding_window.ps
-	epstopdf sliding_window.ps
-	
-expt_state.pdf: expt_state.ps
-	epstopdf expt_state.ps
-
-primality.pdf: primality.ps
-	epstopdf primality.ps
-
-
-pses: sliding_window.ps expt_state.ps primality.ps design_process.ps
-pdfes: sliding_window.pdf expt_state.pdf primality.pdf design_process.pdf
-
-clean:
-	rm -rf *.ps *.pdf .xvpics
-   
-\ No newline at end of file
diff --git a/libtommath/pics/primality.tif b/libtommath/pics/primality.tif
deleted file mode 100644
index 83aafe0..0000000
--- a/libtommath/pics/primality.tif
+++ /dev/null
diff --git a/libtommath/pics/radix.sxd b/libtommath/pics/radix.sxd
deleted file mode 100644
index b9eb9a0..0000000
--- a/libtommath/pics/radix.sxd
+++ /dev/null
diff --git a/libtommath/pics/sliding_window.sxd b/libtommath/pics/sliding_window.sxd
deleted file mode 100644
index 91e7c0d..0000000
--- a/libtommath/pics/sliding_window.sxd
+++ /dev/null
diff --git a/libtommath/pics/sliding_window.tif b/libtommath/pics/sliding_window.tif
deleted file mode 100644
index bb4cb96..0000000
--- a/libtommath/pics/sliding_window.tif
+++ /dev/null
diff --git a/libtommath/poster.out b/libtommath/poster.out
deleted file mode 100644
index e69de29..0000000
--- a/libtommath/poster.out
+++ /dev/null
diff --git a/libtommath/poster.pdf b/libtommath/poster.pdf
deleted file mode 100644
index 1f705cf..0000000
--- a/libtommath/poster.pdf
+++ /dev/null
diff --git a/libtommath/poster.tex b/libtommath/poster.tex
deleted file mode 100644
index e7388f4..0000000
--- a/libtommath/poster.tex
+++ /dev/null
@@ -1,35 +0,0 @@
-\documentclass[landscape,11pt]{article}
-\usepackage{amsmath, amssymb}
-\usepackage{hyperref}
-\begin{document}
-\hspace*{-3in}
-\begin{tabular}{llllll}
-$c = a + b$  & {\tt mp\_add(\&a, \&b, \&c)} & $b = 2a$  & {\tt mp\_mul\_2(\&a, \&b)} & \\
-$c = a - b$  & {\tt mp\_sub(\&a, \&b, \&c)} & $b = a/2$ & {\tt mp\_div\_2(\&a, \&b)} & \\
-$c = ab $   & {\tt mp\_mul(\&a, \&b, \&c)}  & $c = 2^ba$  & {\tt mp\_mul\_2d(\&a, b, \&c)}  \\
-$b = a^2 $  & {\tt mp\_sqr(\&a, \&b)}       & $c = a/2^b, d = a \mod 2^b$ & {\tt mp\_div\_2d(\&a, b, \&c, \&d)} \\
-$c = \lfloor a/b \rfloor, d = a \mod b$ & {\tt mp\_div(\&a, \&b, \&c, \&d)} & $c = a \mod 2^b $  & {\tt mp\_mod\_2d(\&a, b, \&c)}  \\
- && \\
-$a = b $  & {\tt mp\_set\_int(\&a, b)}  & $c = a \vee b$  & {\tt mp\_or(\&a, \&b, \&c)}  \\
-$b = a $  & {\tt mp\_copy(\&a, \&b)} & $c = a \wedge b$  & {\tt mp\_and(\&a, \&b, \&c)}  \\
- && $c = a \oplus b$  & {\tt mp\_xor(\&a, \&b, \&c)}  \\
- & \\
-$b = -a $  & {\tt mp\_neg(\&a, \&b)}  & $d = a + b \mod c$  & {\tt mp\_addmod(\&a, \&b, \&c, \&d)}  \\
-$b = |a| $  & {\tt mp\_abs(\&a, \&b)} & $d = a - b \mod c$  & {\tt mp\_submod(\&a, \&b, \&c, \&d)}  \\
- && $d = ab \mod c$  & {\tt mp\_mulmod(\&a, \&b, \&c, \&d)}  \\
-Compare $a$ and $b$ & {\tt mp\_cmp(\&a, \&b)} & $c = a^2 \mod b$  & {\tt mp\_sqrmod(\&a, \&b, \&c)}  \\
-Is Zero? & {\tt mp\_iszero(\&a)} & $c = a^{-1} \mod b$  & {\tt mp\_invmod(\&a, \&b, \&c)} \\
-Is Even? & {\tt mp\_iseven(\&a)} & $d = a^b \mod c$ & {\tt mp\_exptmod(\&a, \&b, \&c, \&d)} \\
-Is Odd ? & {\tt mp\_isodd(\&a)} \\
-&\\
-$\vert \vert a \vert \vert$ & {\tt mp\_unsigned\_bin\_size(\&a)} & $res$ = 1 if $a$ prime to $t$ rounds? & {\tt mp\_prime\_is\_prime(\&a, t, \&res)} \\
-$buf \leftarrow a$          & {\tt mp\_to\_unsigned\_bin(\&a, buf)} & Next prime after $a$ to $t$ rounds. & {\tt mp\_prime\_next\_prime(\&a, t, bbs\_style)} \\
-$a \leftarrow buf[0..len-1]$          & {\tt mp\_read\_unsigned\_bin(\&a, buf, len)} \\
-&\\
-$b = \sqrt{a}$ & {\tt mp\_sqrt(\&a, \&b)}  & $c = \mbox{gcd}(a, b)$ & {\tt mp\_gcd(\&a, \&b, \&c)} \\
-$c = a^{1/b}$ & {\tt mp\_n\_root(\&a, b, \&c)} & $c = \mbox{lcm}(a, b)$ & {\tt mp\_lcm(\&a, \&b, \&c)} \\
-&\\
-Greater Than & MP\_GT & Equal To & MP\_EQ \\
-Less Than & MP\_LT & Bits per digit & DIGIT\_BIT \\
-\end{tabular}
-\end{document}
diff --git a/libtommath/pre_gen/mpi.c b/libtommath/pre_gen/mpi.c
deleted file mode 100644
index 1b1052a..0000000
--- a/libtommath/pre_gen/mpi.c
+++ /dev/null
@@ -1,9525 +0,0 @@
-/* Start: bn_error.c */
-#include <tommath.h>
-#ifdef BN_ERROR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-static const struct {
-     int code;
-     char *msg;
-} msgs[] = {
-     { MP_OKAY, "Successful" },
-     { MP_MEM,  "Out of heap" },
-     { MP_VAL,  "Value out of range" }
-};
-
-/* return a char * string for a given code */
-char *mp_error_to_string(int code)
-{
-   int x;
-
-   /* scan the lookup table for the given message */
-   for (x = 0; x < (int)(sizeof(msgs) / sizeof(msgs[0])); x++) {
-       if (msgs[x].code == code) {
-          return msgs[x].msg;
-       }
-   }
-
-   /* generic reply for invalid code */
-   return "Invalid error code";
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_error.c */
-
-/* Start: bn_fast_mp_invmod.c */
-#include <tommath.h>
-#ifdef BN_FAST_MP_INVMOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* computes the modular inverse via binary extended euclidean algorithm,
- * that is c = 1/a mod b
- *
- * Based on slow invmod except this is optimized for the case where b is
- * odd as per HAC Note 14.64 on pp. 610
- */
-int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x, y, u, v, B, D;
-  int     res, neg;
-
-  /* 2. [modified] b must be odd   */
-  if (mp_iseven (b) == 1) {
-    return MP_VAL;
-  }
-
-  /* init all our temps */
-  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
-     return res;
-  }
-
-  /* x == modulus, y == value to invert */
-  if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto LBL_ERR;
-  }
-
-  /* we need y = |a| */
-  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
-    goto LBL_ERR;
-  }
-
-  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto LBL_ERR;
-  }
-  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto LBL_ERR;
-  }
-  mp_set (&D, 1);
-
-top:
-  /* 4.  while u is even do */
-  while (mp_iseven (&u) == 1) {
-    /* 4.1 u = u/2 */
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-    /* 4.2 if B is odd then */
-    if (mp_isodd (&B) == 1) {
-      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-        goto LBL_ERR;
-      }
-    }
-    /* B = B/2 */
-    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  }
-
-  /* 5.  while v is even do */
-  while (mp_iseven (&v) == 1) {
-    /* 5.1 v = v/2 */
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-    /* 5.2 if D is odd then */
-    if (mp_isodd (&D) == 1) {
-      /* D = (D-x)/2 */
-      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-        goto LBL_ERR;
-      }
-    }
-    /* D = D/2 */
-    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  }
-
-  /* 6.  if u >= v then */
-  if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, B = B - D */
-    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-
-    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  } else {
-    /* v - v - u, D = D - B */
-    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-
-    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  }
-
-  /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0) {
-    goto top;
-  }
-
-  /* now a = C, b = D, gcd == g*v */
-
-  /* if v != 1 then there is no inverse */
-  if (mp_cmp_d (&v, 1) != MP_EQ) {
-    res = MP_VAL;
-    goto LBL_ERR;
-  }
-
-  /* b is now the inverse */
-  neg = a->sign;
-  while (D.sign == MP_NEG) {
-    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  }
-  mp_exch (&D, c);
-  c->sign = neg;
-  res = MP_OKAY;
-
-LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_fast_mp_invmod.c */
-
-/* Start: bn_fast_mp_montgomery_reduce.c */
-#include <tommath.h>
-#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* computes xR**-1 == x (mod N) via Montgomery Reduction
- *
- * This is an optimized implementation of montgomery_reduce
- * which uses the comba method to quickly calculate the columns of the
- * reduction.
- *
- * Based on Algorithm 14.32 on pp.601 of HAC.
-*/
-int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-{
-  int     ix, res, olduse;
-  mp_word W[MP_WARRAY];
-
-  /* get old used count */
-  olduse = x->used;
-
-  /* grow a as required */
-  if (x->alloc < n->used + 1) {
-    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* first we have to get the digits of the input into
-   * an array of double precision words W[...]
-   */
-  {
-    register mp_word *_W;
-    register mp_digit *tmpx;
-
-    /* alias for the W[] array */
-    _W   = W;
-
-    /* alias for the digits of  x*/
-    tmpx = x->dp;
-
-    /* copy the digits of a into W[0..a->used-1] */
-    for (ix = 0; ix < x->used; ix++) {
-      *_W++ = *tmpx++;
-    }
-
-    /* zero the high words of W[a->used..m->used*2] */
-    for (; ix < n->used * 2 + 1; ix++) {
-      *_W++ = 0;
-    }
-  }
-
-  /* now we proceed to zero successive digits
-   * from the least significant upwards
-   */
-  for (ix = 0; ix < n->used; ix++) {
-    /* mu = ai * m' mod b
-     *
-     * We avoid a double precision multiplication (which isn't required)
-     * by casting the value down to a mp_digit.  Note this requires
-     * that W[ix-1] have  the carry cleared (see after the inner loop)
-     */
-    register mp_digit mu;
-    mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
-
-    /* a = a + mu * m * b**i
-     *
-     * This is computed in place and on the fly.  The multiplication
-     * by b**i is handled by offseting which columns the results
-     * are added to.
-     *
-     * Note the comba method normally doesn't handle carries in the
-     * inner loop In this case we fix the carry from the previous
-     * column since the Montgomery reduction requires digits of the
-     * result (so far) [see above] to work.  This is
-     * handled by fixing up one carry after the inner loop.  The
-     * carry fixups are done in order so after these loops the
-     * first m->used words of W[] have the carries fixed
-     */
-    {
-      register int iy;
-      register mp_digit *tmpn;
-      register mp_word *_W;
-
-      /* alias for the digits of the modulus */
-      tmpn = n->dp;
-
-      /* Alias for the columns set by an offset of ix */
-      _W = W + ix;
-
-      /* inner loop */
-      for (iy = 0; iy < n->used; iy++) {
-          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
-      }
-    }
-
-    /* now fix carry for next digit, W[ix+1] */
-    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
-  }
-
-  /* now we have to propagate the carries and
-   * shift the words downward [all those least
-   * significant digits we zeroed].
-   */
-  {
-    register mp_digit *tmpx;
-    register mp_word *_W, *_W1;
-
-    /* nox fix rest of carries */
-
-    /* alias for current word */
-    _W1 = W + ix;
-
-    /* alias for next word, where the carry goes */
-    _W = W + ++ix;
-
-    for (; ix <= n->used * 2 + 1; ix++) {
-      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
-    }
-
-    /* copy out, A = A/b**n
-     *
-     * The result is A/b**n but instead of converting from an
-     * array of mp_word to mp_digit than calling mp_rshd
-     * we just copy them in the right order
-     */
-
-    /* alias for destination word */
-    tmpx = x->dp;
-
-    /* alias for shifted double precision result */
-    _W = W + n->used;
-
-    for (ix = 0; ix < n->used + 1; ix++) {
-      *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
-    }
-
-    /* zero oldused digits, if the input a was larger than
-     * m->used+1 we'll have to clear the digits
-     */
-    for (; ix < olduse; ix++) {
-      *tmpx++ = 0;
-    }
-  }
-
-  /* set the max used and clamp */
-  x->used = n->used + 1;
-  mp_clamp (x);
-
-  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (x, n) != MP_LT) {
-    return s_mp_sub (x, n, x);
-  }
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_fast_mp_montgomery_reduce.c */
-
-/* Start: bn_fast_s_mp_mul_digs.c */
-#include <tommath.h>
-#ifdef BN_FAST_S_MP_MUL_DIGS_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* Fast (comba) multiplier
- *
- * This is the fast column-array [comba] multiplier.  It is
- * designed to compute the columns of the product first
- * then handle the carries afterwards.  This has the effect
- * of making the nested loops that compute the columns very
- * simple and schedulable on super-scalar processors.
- *
- * This has been modified to produce a variable number of
- * digits of output so if say only a half-product is required
- * you don't have to compute the upper half (a feature
- * required for fast Barrett reduction).
- *
- * Based on Algorithm 14.12 on pp.595 of HAC.
- *
- */
-int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  int     olduse, res, pa, ix, iz;
-  mp_digit W[MP_WARRAY];
-  register mp_word  _W;
-
-  /* grow the destination as required */
-  if (c->alloc < digs) {
-    if ((res = mp_grow (c, digs)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* number of output digits to produce */
-  pa = MIN(digs, a->used + b->used);
-
-  /* clear the carry */
-  _W = 0;
-  for (ix = 0; ix < pa; ix++) {
-      int      tx, ty;
-      int      iy;
-      mp_digit *tmpx, *tmpy;
-
-      /* get offsets into the two bignums */
-      ty = MIN(b->used-1, ix);
-      tx = ix - ty;
-
-      /* setup temp aliases */
-      tmpx = a->dp + tx;
-      tmpy = b->dp + ty;
-
-      /* this is the number of times the loop will iterrate, essentially
-         while (tx++ < a->used && ty-- >= 0) { ... }
-       */
-      iy = MIN(a->used-tx, ty+1);
-
-      /* execute loop */
-      for (iz = 0; iz < iy; ++iz) {
-         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-
-      }
-
-      /* store term */
-      W[ix] = ((mp_digit)_W) & MP_MASK;
-
-      /* make next carry */
-      _W = _W >> ((mp_word)DIGIT_BIT);
- }
-
-  /* setup dest */
-  olduse  = c->used;
-  c->used = pa;
-
-  {
-    register mp_digit *tmpc;
-    tmpc = c->dp;
-    for (ix = 0; ix < pa+1; ix++) {
-      /* now extract the previous digit [below the carry] */
-      *tmpc++ = W[ix];
-    }
-
-    /* clear unused digits [that existed in the old copy of c] */
-    for (; ix < olduse; ix++) {
-      *tmpc++ = 0;
-    }
-  }
-  mp_clamp (c);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_fast_s_mp_mul_digs.c */
-
-/* Start: bn_fast_s_mp_mul_high_digs.c */
-#include <tommath.h>
-#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* this is a modified version of fast_s_mul_digs that only produces
- * output digits *above* digs.  See the comments for fast_s_mul_digs
- * to see how it works.
- *
- * This is used in the Barrett reduction since for one of the multiplications
- * only the higher digits were needed.  This essentially halves the work.
- *
- * Based on Algorithm 14.12 on pp.595 of HAC.
- */
-int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  int     olduse, res, pa, ix, iz;
-  mp_digit W[MP_WARRAY];
-  mp_word  _W;
-
-  /* grow the destination as required */
-  pa = a->used + b->used;
-  if (c->alloc < pa) {
-    if ((res = mp_grow (c, pa)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* number of output digits to produce */
-  pa = a->used + b->used;
-  _W = 0;
-  for (ix = digs; ix < pa; ix++) {
-      int      tx, ty, iy;
-      mp_digit *tmpx, *tmpy;
-
-      /* get offsets into the two bignums */
-      ty = MIN(b->used-1, ix);
-      tx = ix - ty;
-
-      /* setup temp aliases */
-      tmpx = a->dp + tx;
-      tmpy = b->dp + ty;
-
-      /* this is the number of times the loop will iterrate, essentially its
-         while (tx++ < a->used && ty-- >= 0) { ... }
-       */
-      iy = MIN(a->used-tx, ty+1);
-
-      /* execute loop */
-      for (iz = 0; iz < iy; iz++) {
-         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-      }
-
-      /* store term */
-      W[ix] = ((mp_digit)_W) & MP_MASK;
-
-      /* make next carry */
-      _W = _W >> ((mp_word)DIGIT_BIT);
-  }
-
-  /* setup dest */
-  olduse  = c->used;
-  c->used = pa;
-
-  {
-    register mp_digit *tmpc;
-
-    tmpc = c->dp + digs;
-    for (ix = digs; ix < pa; ix++) {
-      /* now extract the previous digit [below the carry] */
-      *tmpc++ = W[ix];
-    }
-
-    /* clear unused digits [that existed in the old copy of c] */
-    for (; ix < olduse; ix++) {
-      *tmpc++ = 0;
-    }
-  }
-  mp_clamp (c);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_fast_s_mp_mul_high_digs.c */
-
-/* Start: bn_fast_s_mp_sqr.c */
-#include <tommath.h>
-#ifdef BN_FAST_S_MP_SQR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* the jist of squaring...
- * you do like mult except the offset of the tmpx [one that
- * starts closer to zero] can't equal the offset of tmpy.
- * So basically you set up iy like before then you min it with
- * (ty-tx) so that it never happens.  You double all those
- * you add in the inner loop
-
-After that loop you do the squares and add them in.
-*/
-
-int fast_s_mp_sqr (mp_int * a, mp_int * b)
-{
-  int       olduse, res, pa, ix, iz;
-  mp_digit   W[MP_WARRAY], *tmpx;
-  mp_word   W1;
-
-  /* grow the destination as required */
-  pa = a->used + a->used;
-  if (b->alloc < pa) {
-    if ((res = mp_grow (b, pa)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* number of output digits to produce */
-  W1 = 0;
-  for (ix = 0; ix < pa; ix++) {
-      int      tx, ty, iy;
-      mp_word  _W;
-      mp_digit *tmpy;
-
-      /* clear counter */
-      _W = 0;
-
-      /* get offsets into the two bignums */
-      ty = MIN(a->used-1, ix);
-      tx = ix - ty;
-
-      /* setup temp aliases */
-      tmpx = a->dp + tx;
-      tmpy = a->dp + ty;
-
-      /* this is the number of times the loop will iterrate, essentially
-         while (tx++ < a->used && ty-- >= 0) { ... }
-       */
-      iy = MIN(a->used-tx, ty+1);
-
-      /* now for squaring tx can never equal ty
-       * we halve the distance since they approach at a rate of 2x
-       * and we have to round because odd cases need to be executed
-       */
-      iy = MIN(iy, (ty-tx+1)>>1);
-
-      /* execute loop */
-      for (iz = 0; iz < iy; iz++) {
-         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-      }
-
-      /* double the inner product and add carry */
-      _W = _W + _W + W1;
-
-      /* even columns have the square term in them */
-      if ((ix&1) == 0) {
-         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
-      }
-
-      /* store it */
-      W[ix] = (mp_digit)(_W & MP_MASK);
-
-      /* make next carry */
-      W1 = _W >> ((mp_word)DIGIT_BIT);
-  }
-
-  /* setup dest */
-  olduse  = b->used;
-  b->used = a->used+a->used;
-
-  {
-    mp_digit *tmpb;
-    tmpb = b->dp;
-    for (ix = 0; ix < pa; ix++) {
-      *tmpb++ = W[ix] & MP_MASK;
-    }
-
-    /* clear unused digits [that existed in the old copy of c] */
-    for (; ix < olduse; ix++) {
-      *tmpb++ = 0;
-    }
-  }
-  mp_clamp (b);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_fast_s_mp_sqr.c */
-
-/* Start: bn_mp_2expt.c */
-#include <tommath.h>
-#ifdef BN_MP_2EXPT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* computes a = 2**b
- *
- * Simple algorithm which zeroes the int, grows it then just sets one bit
- * as required.
- */
-int
-mp_2expt (mp_int * a, int b)
-{
-  int     res;
-
-  /* zero a as per default */
-  mp_zero (a);
-
-  /* grow a to accomodate the single bit */
-  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
-    return res;
-  }
-
-  /* set the used count of where the bit will go */
-  a->used = b / DIGIT_BIT + 1;
-
-  /* put the single bit in its place */
-  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_2expt.c */
-
-/* Start: bn_mp_abs.c */
-#include <tommath.h>
-#ifdef BN_MP_ABS_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* b = |a|
- *
- * Simple function copies the input and fixes the sign to positive
- */
-int
-mp_abs (mp_int * a, mp_int * b)
-{
-  int     res;
-
-  /* copy a to b */
-  if (a != b) {
-     if ((res = mp_copy (a, b)) != MP_OKAY) {
-       return res;
-     }
-  }
-
-  /* force the sign of b to positive */
-  b->sign = MP_ZPOS;
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_abs.c */
-
-/* Start: bn_mp_add.c */
-#include <tommath.h>
-#ifdef BN_MP_ADD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* high level addition (handles signs) */
-int mp_add (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     sa, sb, res;
-
-  /* get sign of both inputs */
-  sa = a->sign;
-  sb = b->sign;
-
-  /* handle two cases, not four */
-  if (sa == sb) {
-    /* both positive or both negative */
-    /* add their magnitudes, copy the sign */
-    c->sign = sa;
-    res = s_mp_add (a, b, c);
-  } else {
-    /* one positive, the other negative */
-    /* subtract the one with the greater magnitude from */
-    /* the one of the lesser magnitude.  The result gets */
-    /* the sign of the one with the greater magnitude. */
-    if (mp_cmp_mag (a, b) == MP_LT) {
-      c->sign = sb;
-      res = s_mp_sub (b, a, c);
-    } else {
-      c->sign = sa;
-      res = s_mp_sub (a, b, c);
-    }
-  }
-  return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_add.c */
-
-/* Start: bn_mp_add_d.c */
-#include <tommath.h>
-#ifdef BN_MP_ADD_D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* single digit addition */
-int
-mp_add_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  int     res, ix, oldused;
-  mp_digit *tmpa, *tmpc, mu;
-
-  /* grow c as required */
-  if (c->alloc < a->used + 1) {
-     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
-        return res;
-     }
-  }
-
-  /* if a is negative and |a| >= b, call c = |a| - b */
-  if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
-     /* temporarily fix sign of a */
-     a->sign = MP_ZPOS;
-
-     /* c = |a| - b */
-     res = mp_sub_d(a, b, c);
-
-     /* fix sign  */
-     a->sign = c->sign = MP_NEG;
-
-     /* clamp */
-     mp_clamp(c);
-
-     return res;
-  }
-
-  /* old number of used digits in c */
-  oldused = c->used;
-
-  /* sign always positive */
-  c->sign = MP_ZPOS;
-
-  /* source alias */
-  tmpa    = a->dp;
-
-  /* destination alias */
-  tmpc    = c->dp;
-
-  /* if a is positive */
-  if (a->sign == MP_ZPOS) {
-     /* add digit, after this we're propagating
-      * the carry.
-      */
-     *tmpc   = *tmpa++ + b;
-     mu      = *tmpc >> DIGIT_BIT;
-     *tmpc++ &= MP_MASK;
-
-     /* now handle rest of the digits */
-     for (ix = 1; ix < a->used; ix++) {
-        *tmpc   = *tmpa++ + mu;
-        mu      = *tmpc >> DIGIT_BIT;
-        *tmpc++ &= MP_MASK;
-     }
-     /* set final carry */
-     ix++;
-     *tmpc++  = mu;
-
-     /* setup size */
-     c->used = a->used + 1;
-  } else {
-     /* a was negative and |a| < b */
-     c->used  = 1;
-
-     /* the result is a single digit */
-     if (a->used == 1) {
-        *tmpc++  =  b - a->dp[0];
-     } else {
-        *tmpc++  =  b;
-     }
-
-     /* setup count so the clearing of oldused
-      * can fall through correctly
-      */
-     ix       = 1;
-  }
-
-  /* now zero to oldused */
-  while (ix++ < oldused) {
-     *tmpc++ = 0;
-  }
-  mp_clamp(c);
-
-  return MP_OKAY;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_add_d.c */
-
-/* Start: bn_mp_addmod.c */
-#include <tommath.h>
-#ifdef BN_MP_ADDMOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* d = a + b (mod c) */
-int
-mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_add (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_addmod.c */
-
-/* Start: bn_mp_and.c */
-#include <tommath.h>
-#ifdef BN_MP_AND_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* AND two ints together */
-int
-mp_and (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-    t.dp[ix] &= x->dp[ix];
-  }
-
-  /* zero digits above the last from the smallest mp_int */
-  for (; ix < t.used; ix++) {
-    t.dp[ix] = 0;
-  }
-
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_and.c */
-
-/* Start: bn_mp_clamp.c */
-#include <tommath.h>
-#ifdef BN_MP_CLAMP_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* trim unused digits
- *
- * This is used to ensure that leading zero digits are
- * trimed and the leading "used" digit will be non-zero
- * Typically very fast.  Also fixes the sign if there
- * are no more leading digits
- */
-void
-mp_clamp (mp_int * a)
-{
-  /* decrease used while the most significant digit is
-   * zero.
-   */
-  while (a->used > 0 && a->dp[a->used - 1] == 0) {
-    --(a->used);
-  }
-
-  /* reset the sign flag if used == 0 */
-  if (a->used == 0) {
-    a->sign = MP_ZPOS;
-  }
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_clamp.c */
-
-/* Start: bn_mp_clear.c */
-#include <tommath.h>
-#ifdef BN_MP_CLEAR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* clear one (frees)  */
-void
-mp_clear (mp_int * a)
-{
-  int i;
-
-  /* only do anything if a hasn't been freed previously */
-  if (a->dp != NULL) {
-    /* first zero the digits */
-    for (i = 0; i < a->used; i++) {
-        a->dp[i] = 0;
-    }
-
-    /* free ram */
-    XFREE(a->dp);
-
-    /* reset members to make debugging easier */
-    a->dp    = NULL;
-    a->alloc = a->used = 0;
-    a->sign  = MP_ZPOS;
-  }
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_clear.c */
-
-/* Start: bn_mp_clear_multi.c */
-#include <tommath.h>
-#ifdef BN_MP_CLEAR_MULTI_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-#include <stdarg.h>
-
-void mp_clear_multi(mp_int *mp, ...)
-{
-    mp_int* next_mp = mp;
-    va_list args;
-    va_start(args, mp);
-    while (next_mp != NULL) {
-        mp_clear(next_mp);
-        next_mp = va_arg(args, mp_int*);
-    }
-    va_end(args);
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_clear_multi.c */
-
-/* Start: bn_mp_cmp.c */
-#include <tommath.h>
-#ifdef BN_MP_CMP_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* compare two ints (signed)*/
-int
-mp_cmp (mp_int * a, mp_int * b)
-{
-  /* compare based on sign */
-  if (a->sign != b->sign) {
-     if (a->sign == MP_NEG) {
-        return MP_LT;
-     } else {
-        return MP_GT;
-     }
-  }
-
-  /* compare digits */
-  if (a->sign == MP_NEG) {
-     /* if negative compare opposite direction */
-     return mp_cmp_mag(b, a);
-  } else {
-     return mp_cmp_mag(a, b);
-  }
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_cmp.c */
-
-/* Start: bn_mp_cmp_d.c */
-#include <tommath.h>
-#ifdef BN_MP_CMP_D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* compare a digit */
-int mp_cmp_d(mp_int * a, mp_digit b)
-{
-  /* compare based on sign */
-  if (a->sign == MP_NEG) {
-    return MP_LT;
-  }
-
-  /* compare based on magnitude */
-  if (a->used > 1) {
-    return MP_GT;
-  }
-
-  /* compare the only digit of a to b */
-  if (a->dp[0] > b) {
-    return MP_GT;
-  } else if (a->dp[0] < b) {
-    return MP_LT;
-  } else {
-    return MP_EQ;
-  }
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_cmp_d.c */
-
-/* Start: bn_mp_cmp_mag.c */
-#include <tommath.h>
-#ifdef BN_MP_CMP_MAG_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* compare maginitude of two ints (unsigned) */
-int mp_cmp_mag (mp_int * a, mp_int * b)
-{
-  int     n;
-  mp_digit *tmpa, *tmpb;
-
-  /* compare based on # of non-zero digits */
-  if (a->used > b->used) {
-    return MP_GT;
-  }
-
-  if (a->used < b->used) {
-    return MP_LT;
-  }
-
-  /* alias for a */
-  tmpa = a->dp + (a->used - 1);
-
-  /* alias for b */
-  tmpb = b->dp + (a->used - 1);
-
-  /* compare based on digits  */
-  for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
-    if (*tmpa > *tmpb) {
-      return MP_GT;
-    }
-
-    if (*tmpa < *tmpb) {
-      return MP_LT;
-    }
-  }
-  return MP_EQ;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_cmp_mag.c */
-
-/* Start: bn_mp_cnt_lsb.c */
-#include <tommath.h>
-#ifdef BN_MP_CNT_LSB_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-static const int lnz[16] = {
-   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
-};
-
-/* Counts the number of lsbs which are zero before the first zero bit */
-int mp_cnt_lsb(mp_int *a)
-{
-   int x;
-   mp_digit q, qq;
-
-   /* easy out */
-   if (mp_iszero(a) == 1) {
-      return 0;
-   }
-
-   /* scan lower digits until non-zero */
-   for (x = 0; x < a->used && a->dp[x] == 0; x++);
-   q = a->dp[x];
-   x *= DIGIT_BIT;
-
-   /* now scan this digit until a 1 is found */
-   if ((q & 1) == 0) {
-      do {
-         qq  = q & 15;
-         x  += lnz[qq];
-         q >>= 4;
-      } while (qq == 0);
-   }
-   return x;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_cnt_lsb.c */
-
-/* Start: bn_mp_copy.c */
-#include <tommath.h>
-#ifdef BN_MP_COPY_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* copy, b = a */
-int
-mp_copy (mp_int * a, mp_int * b)
-{
-  int     res, n;
-
-  /* if dst == src do nothing */
-  if (a == b) {
-    return MP_OKAY;
-  }
-
-  /* grow dest */
-  if (b->alloc < a->used) {
-     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
-        return res;
-     }
-  }
-
-  /* zero b and copy the parameters over */
-  {
-    register mp_digit *tmpa, *tmpb;
-
-    /* pointer aliases */
-
-    /* source */
-    tmpa = a->dp;
-
-    /* destination */
-    tmpb = b->dp;
-
-    /* copy all the digits */
-    for (n = 0; n < a->used; n++) {
-      *tmpb++ = *tmpa++;
-    }
-
-    /* clear high digits */
-    for (; n < b->used; n++) {
-      *tmpb++ = 0;
-    }
-  }
-
-  /* copy used count and sign */
-  b->used = a->used;
-  b->sign = a->sign;
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_copy.c */
-
-/* Start: bn_mp_count_bits.c */
-#include <tommath.h>
-#ifdef BN_MP_COUNT_BITS_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* returns the number of bits in an int */
-int
-mp_count_bits (mp_int * a)
-{
-  int     r;
-  mp_digit q;
-
-  /* shortcut */
-  if (a->used == 0) {
-    return 0;
-  }
-
-  /* get number of digits and add that */
-  r = (a->used - 1) * DIGIT_BIT;
-
-  /* take the last digit and count the bits in it */
-  q = a->dp[a->used - 1];
-  while (q > ((mp_digit) 0)) {
-    ++r;
-    q >>= ((mp_digit) 1);
-  }
-  return r;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_count_bits.c */
-
-/* Start: bn_mp_div.c */
-#include <tommath.h>
-#ifdef BN_MP_DIV_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-#ifdef BN_MP_DIV_SMALL
-
-/* slower bit-bang division... also smaller */
-int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-   mp_int ta, tb, tq, q;
-   int    res, n, n2;
-
-  /* is divisor zero ? */
-  if (mp_iszero (b) == 1) {
-    return MP_VAL;
-  }
-
-  /* if a < b then q=0, r = a */
-  if (mp_cmp_mag (a, b) == MP_LT) {
-    if (d != NULL) {
-      res = mp_copy (a, d);
-    } else {
-      res = MP_OKAY;
-    }
-    if (c != NULL) {
-      mp_zero (c);
-    }
-    return res;
-  }
-	
-  /* init our temps */
-  if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL) != MP_OKAY)) {
-     return res;
-  }
-
-
-  mp_set(&tq, 1);
-  n = mp_count_bits(a) - mp_count_bits(b);
-  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
-      ((res = mp_abs(b, &tb)) != MP_OKAY) ||
-      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
-      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
-      goto LBL_ERR;
-  }
-
-  while (n-- >= 0) {
-     if (mp_cmp(&tb, &ta) != MP_GT) {
-        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
-            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
-           goto LBL_ERR;
-        }
-     }
-     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
-         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
-           goto LBL_ERR;
-     }
-  }
-
-  /* now q == quotient and ta == remainder */
-  n  = a->sign;
-  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
-  if (c != NULL) {
-     mp_exch(c, &q);
-     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
-  }
-  if (d != NULL) {
-     mp_exch(d, &ta);
-     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
-  }
-LBL_ERR:
-   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
-   return res;
-}
-
-#else
-
-/* integer signed division.
- * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
- * HAC pp.598 Algorithm 14.20
- *
- * Note that the description in HAC is horribly
- * incomplete.  For example, it doesn't consider
- * the case where digits are removed from 'x' in
- * the inner loop.  It also doesn't consider the
- * case that y has fewer than three digits, etc..
- *
- * The overall algorithm is as described as
- * 14.20 from HAC but fixed to treat these cases.
-*/
-int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  mp_int  q, x, y, t1, t2;
-  int     res, n, t, i, norm, neg;
-
-  /* is divisor zero ? */
-  if (mp_iszero (b) == 1) {
-    return MP_VAL;
-  }
-
-  /* if a < b then q=0, r = a */
-  if (mp_cmp_mag (a, b) == MP_LT) {
-    if (d != NULL) {
-      res = mp_copy (a, d);
-    } else {
-      res = MP_OKAY;
-    }
-    if (c != NULL) {
-      mp_zero (c);
-    }
-    return res;
-  }
-
-  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
-    return res;
-  }
-  q.used = a->used + 2;
-
-  if ((res = mp_init (&t1)) != MP_OKAY) {
-    goto LBL_Q;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto LBL_T1;
-  }
-
-  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
-    goto LBL_T2;
-  }
-
-  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
-    goto LBL_X;
-  }
-
-  /* fix the sign */
-  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-  x.sign = y.sign = MP_ZPOS;
-
-  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
-  norm = mp_count_bits(&y) % DIGIT_BIT;
-  if (norm < (int)(DIGIT_BIT-1)) {
-     norm = (DIGIT_BIT-1) - norm;
-     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
-       goto LBL_Y;
-     }
-     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
-       goto LBL_Y;
-     }
-  } else {
-     norm = 0;
-  }
-
-  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
-  n = x.used - 1;
-  t = y.used - 1;
-
-  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
-  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
-    goto LBL_Y;
-  }
-
-  while (mp_cmp (&x, &y) != MP_LT) {
-    ++(q.dp[n - t]);
-    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
-      goto LBL_Y;
-    }
-  }
-
-  /* reset y by shifting it back down */
-  mp_rshd (&y, n - t);
-
-  /* step 3. for i from n down to (t + 1) */
-  for (i = n; i >= (t + 1); i--) {
-    if (i > x.used) {
-      continue;
-    }
-
-    /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
-     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
-    if (x.dp[i] == y.dp[t]) {
-      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
-    } else {
-      mp_word tmp;
-      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
-      tmp |= ((mp_word) x.dp[i - 1]);
-      tmp /= ((mp_word) y.dp[t]);
-      if (tmp > (mp_word) MP_MASK)
-        tmp = MP_MASK;
-      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
-    }
-
-    /* while (q{i-t-1} * (yt * b + y{t-1})) >
-             xi * b**2 + xi-1 * b + xi-2
-
-       do q{i-t-1} -= 1;
-    */
-    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
-    do {
-      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
-
-      /* find left hand */
-      mp_zero (&t1);
-      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
-      t1.dp[1] = y.dp[t];
-      t1.used = 2;
-      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-        goto LBL_Y;
-      }
-
-      /* find right hand */
-      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
-      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
-      t2.dp[2] = x.dp[i];
-      t2.used = 3;
-    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
-
-    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
-    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-      goto LBL_Y;
-    }
-
-    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-      goto LBL_Y;
-    }
-
-    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
-      goto LBL_Y;
-    }
-
-    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
-    if (x.sign == MP_NEG) {
-      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-        goto LBL_Y;
-      }
-      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-        goto LBL_Y;
-      }
-      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-        goto LBL_Y;
-      }
-
-      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
-    }
-  }
-
-  /* now q is the quotient and x is the remainder
-   * [which we have to normalize]
-   */
-
-  /* get sign before writing to c */
-  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
-
-  if (c != NULL) {
-    mp_clamp (&q);
-    mp_exch (&q, c);
-    c->sign = neg;
-  }
-
-  if (d != NULL) {
-    mp_div_2d (&x, norm, &x, NULL);
-    mp_exch (&x, d);
-  }
-
-  res = MP_OKAY;
-
-LBL_Y:mp_clear (&y);
-LBL_X:mp_clear (&x);
-LBL_T2:mp_clear (&t2);
-LBL_T1:mp_clear (&t1);
-LBL_Q:mp_clear (&q);
-  return res;
-}
-
-#endif
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_div.c */
-
-/* Start: bn_mp_div_2.c */
-#include <tommath.h>
-#ifdef BN_MP_DIV_2_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* b = a/2 */
-int mp_div_2(mp_int * a, mp_int * b)
-{
-  int     x, res, oldused;
-
-  /* copy */
-  if (b->alloc < a->used) {
-    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  oldused = b->used;
-  b->used = a->used;
-  {
-    register mp_digit r, rr, *tmpa, *tmpb;
-
-    /* source alias */
-    tmpa = a->dp + b->used - 1;
-
-    /* dest alias */
-    tmpb = b->dp + b->used - 1;
-
-    /* carry */
-    r = 0;
-    for (x = b->used - 1; x >= 0; x--) {
-      /* get the carry for the next iteration */
-      rr = *tmpa & 1;
-
-      /* shift the current digit, add in carry and store */
-      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
-
-      /* forward carry to next iteration */
-      r = rr;
-    }
-
-    /* zero excess digits */
-    tmpb = b->dp + b->used;
-    for (x = b->used; x < oldused; x++) {
-      *tmpb++ = 0;
-    }
-  }
-  b->sign = a->sign;
-  mp_clamp (b);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_div_2.c */
-
-/* Start: bn_mp_div_2d.c */
-#include <tommath.h>
-#ifdef BN_MP_DIV_2D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
-int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
-{
-  mp_digit D, r, rr;
-  int     x, res;
-  mp_int  t;
-
-
-  /* if the shift count is <= 0 then we do no work */
-  if (b <= 0) {
-    res = mp_copy (a, c);
-    if (d != NULL) {
-      mp_zero (d);
-    }
-    return res;
-  }
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  /* get the remainder */
-  if (d != NULL) {
-    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-  }
-
-  /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  /* shift by as many digits in the bit count */
-  if (b >= (int)DIGIT_BIT) {
-    mp_rshd (c, b / DIGIT_BIT);
-  }
-
-  /* shift any bit count < DIGIT_BIT */
-  D = (mp_digit) (b % DIGIT_BIT);
-  if (D != 0) {
-    register mp_digit *tmpc, mask, shift;
-
-    /* mask */
-    mask = (((mp_digit)1) << D) - 1;
-
-    /* shift for lsb */
-    shift = DIGIT_BIT - D;
-
-    /* alias */
-    tmpc = c->dp + (c->used - 1);
-
-    /* carry */
-    r = 0;
-    for (x = c->used - 1; x >= 0; x--) {
-      /* get the lower  bits of this word in a temp */
-      rr = *tmpc & mask;
-
-      /* shift the current word and mix in the carry bits from the previous word */
-      *tmpc = (*tmpc >> D) | (r << shift);
-      --tmpc;
-
-      /* set the carry to the carry bits of the current word found above */
-      r = rr;
-    }
-  }
-  mp_clamp (c);
-  if (d != NULL) {
-    mp_exch (&t, d);
-  }
-  mp_clear (&t);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_div_2d.c */
-
-/* Start: bn_mp_div_3.c */
-#include <tommath.h>
-#ifdef BN_MP_DIV_3_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* divide by three (based on routine from MPI and the GMP manual) */
-int
-mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
-{
-  mp_int   q;
-  mp_word  w, t;
-  mp_digit b;
-  int      res, ix;
-
-  /* b = 2**DIGIT_BIT / 3 */
-  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
-
-  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
-     return res;
-  }
-
-  q.used = a->used;
-  q.sign = a->sign;
-  w = 0;
-  for (ix = a->used - 1; ix >= 0; ix--) {
-     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
-
-     if (w >= 3) {
-        /* multiply w by [1/3] */
-        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
-
-        /* now subtract 3 * [w/3] from w, to get the remainder */
-        w -= t+t+t;
-
-        /* fixup the remainder as required since
-         * the optimization is not exact.
-         */
-        while (w >= 3) {
-           t += 1;
-           w -= 3;
-        }
-      } else {
-        t = 0;
-      }
-      q.dp[ix] = (mp_digit)t;
-  }
-
-  /* [optional] store the remainder */
-  if (d != NULL) {
-     *d = (mp_digit)w;
-  }
-
-  /* [optional] store the quotient */
-  if (c != NULL) {
-     mp_clamp(&q);
-     mp_exch(&q, c);
-  }
-  mp_clear(&q);
-
-  return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_div_3.c */
-
-/* Start: bn_mp_div_d.c */
-#include <tommath.h>
-#ifdef BN_MP_DIV_D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-static int s_is_power_of_two(mp_digit b, int *p)
-{
-   int x;
-
-   /* fast return if no power of two */
-   if ((b==0) || (b & (b-1))) {
-      return 0;
-   }
-
-   for (x = 0; x < DIGIT_BIT; x++) {
-      if (b == (((mp_digit)1)<<x)) {
-         *p = x;
-         return 1;
-      }
-   }
-   return 0;
-}
-
-/* single digit division (based on routine from MPI) */
-int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
-{
-  mp_int  q;
-  mp_word w;
-  mp_digit t;
-  int     res, ix;
-
-  /* cannot divide by zero */
-  if (b == 0) {
-     return MP_VAL;
-  }
-
-  /* quick outs */
-  if (b == 1 || mp_iszero(a) == 1) {
-     if (d != NULL) {
-        *d = 0;
-     }
-     if (c != NULL) {
-        return mp_copy(a, c);
-     }
-     return MP_OKAY;
-  }
-
-  /* power of two ? */
-  if (s_is_power_of_two(b, &ix) == 1) {
-     if (d != NULL) {
-        *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
-     }
-     if (c != NULL) {
-        return mp_div_2d(a, ix, c, NULL);
-     }
-     return MP_OKAY;
-  }
-
-#ifdef BN_MP_DIV_3_C
-  /* three? */
-  if (b == 3) {
-     return mp_div_3(a, c, d);
-  }
-#endif
-
-  /* no easy answer [c'est la vie].  Just division */
-  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
-     return res;
-  }
-
-  q.used = a->used;
-  q.sign = a->sign;
-  w = 0;
-  for (ix = a->used - 1; ix >= 0; ix--) {
-     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
-
-     if (w >= b) {
-        t = (mp_digit)(w / b);
-        w -= ((mp_word)t) * ((mp_word)b);
-      } else {
-        t = 0;
-      }
-      q.dp[ix] = (mp_digit)t;
-  }
-
-  if (d != NULL) {
-     *d = (mp_digit)w;
-  }
-
-  if (c != NULL) {
-     mp_clamp(&q);
-     mp_exch(&q, c);
-  }
-  mp_clear(&q);
-
-  return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_div_d.c */
-
-/* Start: bn_mp_dr_is_modulus.c */
-#include <tommath.h>
-#ifdef BN_MP_DR_IS_MODULUS_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* determines if a number is a valid DR modulus */
-int mp_dr_is_modulus(mp_int *a)
-{
-   int ix;
-
-   /* must be at least two digits */
-   if (a->used < 2) {
-      return 0;
-   }
-
-   /* must be of the form b**k - a [a <= b] so all
-    * but the first digit must be equal to -1 (mod b).
-    */
-   for (ix = 1; ix < a->used; ix++) {
-       if (a->dp[ix] != MP_MASK) {
-          return 0;
-       }
-   }
-   return 1;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_dr_is_modulus.c */
-
-/* Start: bn_mp_dr_reduce.c */
-#include <tommath.h>
-#ifdef BN_MP_DR_REDUCE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
- *
- * Based on algorithm from the paper
- *
- * "Generating Efficient Primes for Discrete Log Cryptosystems"
- *                 Chae Hoon Lim, Pil Joong Lee,
- *          POSTECH Information Research Laboratories
- *
- * The modulus must be of a special format [see manual]
- *
- * Has been modified to use algorithm 7.10 from the LTM book instead
- *
- * Input x must be in the range 0 <= x <= (n-1)**2
- */
-int
-mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
-{
-  int      err, i, m;
-  mp_word  r;
-  mp_digit mu, *tmpx1, *tmpx2;
-
-  /* m = digits in modulus */
-  m = n->used;
-
-  /* ensure that "x" has at least 2m digits */
-  if (x->alloc < m + m) {
-    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
-      return err;
-    }
-  }
-
-/* top of loop, this is where the code resumes if
- * another reduction pass is required.
- */
-top:
-  /* aliases for digits */
-  /* alias for lower half of x */
-  tmpx1 = x->dp;
-
-  /* alias for upper half of x, or x/B**m */
-  tmpx2 = x->dp + m;
-
-  /* set carry to zero */
-  mu = 0;
-
-  /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
-  for (i = 0; i < m; i++) {
-      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
-      *tmpx1++  = (mp_digit)(r & MP_MASK);
-      mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
-  }
-
-  /* set final carry */
-  *tmpx1++ = mu;
-
-  /* zero words above m */
-  for (i = m + 1; i < x->used; i++) {
-      *tmpx1++ = 0;
-  }
-
-  /* clamp, sub and return */
-  mp_clamp (x);
-
-  /* if x >= n then subtract and reduce again
-   * Each successive "recursion" makes the input smaller and smaller.
-   */
-  if (mp_cmp_mag (x, n) != MP_LT) {
-    s_mp_sub(x, n, x);
-    goto top;
-  }
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_dr_reduce.c */
-
-/* Start: bn_mp_dr_setup.c */
-#include <tommath.h>
-#ifdef BN_MP_DR_SETUP_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* determines the setup value */
-void mp_dr_setup(mp_int *a, mp_digit *d)
-{
-   /* the casts are required if DIGIT_BIT is one less than
-    * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
-    */
-   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
-        ((mp_word)a->dp[0]));
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_dr_setup.c */
-
-/* Start: bn_mp_exch.c */
-#include <tommath.h>
-#ifdef BN_MP_EXCH_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* swap the elements of two integers, for cases where you can't simply swap the
- * mp_int pointers around
- */
-void
-mp_exch (mp_int * a, mp_int * b)
-{
-  mp_int  t;
-
-  t  = *a;
-  *a = *b;
-  *b = t;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_exch.c */
-
-/* Start: bn_mp_expt_d.c */
-#include <tommath.h>
-#ifdef BN_MP_EXPT_D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* calculate c = a**b  using a square-multiply algorithm */
-int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  int     res, x;
-  mp_int  g;
-
-  if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
-    return res;
-  }
-
-  /* set initial result */
-  mp_set (c, 1);
-
-  for (x = 0; x < (int) DIGIT_BIT; x++) {
-    /* square */
-    if ((res = mp_sqr (c, c)) != MP_OKAY) {
-      mp_clear (&g);
-      return res;
-    }
-
-    /* if the bit is set multiply */
-    if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) {
-      if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
-         mp_clear (&g);
-         return res;
-      }
-    }
-
-    /* shift to next bit */
-    b <<= 1;
-  }
-
-  mp_clear (&g);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_expt_d.c */
-
-/* Start: bn_mp_exptmod.c */
-#include <tommath.h>
-#ifdef BN_MP_EXPTMOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-
-/* this is a shell function that calls either the normal or Montgomery
- * exptmod functions.  Originally the call to the montgomery code was
- * embedded in the normal function but that wasted alot of stack space
- * for nothing (since 99% of the time the Montgomery code would be called)
- */
-int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-{
-  int dr;
-
-  /* modulus P must be positive */
-  if (P->sign == MP_NEG) {
-     return MP_VAL;
-  }
-
-  /* if exponent X is negative we have to recurse */
-  if (X->sign == MP_NEG) {
-#ifdef BN_MP_INVMOD_C
-     mp_int tmpG, tmpX;
-     int err;
-
-     /* first compute 1/G mod P */
-     if ((err = mp_init(&tmpG)) != MP_OKAY) {
-        return err;
-     }
-     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
-        mp_clear(&tmpG);
-        return err;
-     }
-
-     /* now get |X| */
-     if ((err = mp_init(&tmpX)) != MP_OKAY) {
-        mp_clear(&tmpG);
-        return err;
-     }
-     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
-        mp_clear_multi(&tmpG, &tmpX, NULL);
-        return err;
-     }
-
-     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
-     err = mp_exptmod(&tmpG, &tmpX, P, Y);
-     mp_clear_multi(&tmpG, &tmpX, NULL);
-     return err;
-#else
-     /* no invmod */
-     return MP_VAL;
-#endif
-  }
-
-/* modified diminished radix reduction */
-#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defined(BN_S_MP_EXPTMOD_C)
-  if (mp_reduce_is_2k_l(P) == MP_YES) {
-     return s_mp_exptmod(G, X, P, Y, 1);
-  }
-#endif
-
-#ifdef BN_MP_DR_IS_MODULUS_C
-  /* is it a DR modulus? */
-  dr = mp_dr_is_modulus(P);
-#else
-  /* default to no */
-  dr = 0;
-#endif
-
-#ifdef BN_MP_REDUCE_IS_2K_C
-  /* if not, is it a unrestricted DR modulus? */
-  if (dr == 0) {
-     dr = mp_reduce_is_2k(P) << 1;
-  }
-#endif
-
-  /* if the modulus is odd or dr != 0 use the montgomery method */
-#ifdef BN_MP_EXPTMOD_FAST_C
-  if (mp_isodd (P) == 1 || dr !=  0) {
-    return mp_exptmod_fast (G, X, P, Y, dr);
-  } else {
-#endif
-#ifdef BN_S_MP_EXPTMOD_C
-    /* otherwise use the generic Barrett reduction technique */
-    return s_mp_exptmod (G, X, P, Y, 0);
-#else
-    /* no exptmod for evens */
-    return MP_VAL;
-#endif
-#ifdef BN_MP_EXPTMOD_FAST_C
-  }
-#endif
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_exptmod.c */
-
-/* Start: bn_mp_exptmod_fast.c */
-#include <tommath.h>
-#ifdef BN_MP_EXPTMOD_FAST_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
- *
- * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
- * The value of k changes based on the size of the exponent.
- *
- * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
- */
-
-#ifdef MP_LOW_MEM
-   #define TAB_SIZE 32
-#else
-   #define TAB_SIZE 256
-#endif
-
-int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
-{
-  mp_int  M[TAB_SIZE], res;
-  mp_digit buf, mp;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-
-  /* use a pointer to the reduction algorithm.  This allows us to use
-   * one of many reduction algorithms without modding the guts of
-   * the code with if statements everywhere.
-   */
-  int     (*redux)(mp_int*,mp_int*,mp_digit);
-
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-#ifdef MP_LOW_MEM
-  if (winsize > 5) {
-     winsize = 5;
-  }
-#endif
-
-  /* init M array */
-  /* init first cell */
-  if ((err = mp_init(&M[1])) != MP_OKAY) {
-     return err;
-  }
-
-  /* now init the second half of the array */
-  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
-    if ((err = mp_init(&M[x])) != MP_OKAY) {
-      for (y = 1<<(winsize-1); y < x; y++) {
-        mp_clear (&M[y]);
-      }
-      mp_clear(&M[1]);
-      return err;
-    }
-  }
-
-  /* determine and setup reduction code */
-  if (redmode == 0) {
-#ifdef BN_MP_MONTGOMERY_SETUP_C
-     /* now setup montgomery  */
-     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
-        goto LBL_M;
-     }
-#else
-     err = MP_VAL;
-     goto LBL_M;
-#endif
-
-     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
-#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
-     if (((P->used * 2 + 1) < MP_WARRAY) &&
-          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-        redux = fast_mp_montgomery_reduce;
-     } else
-#endif
-     {
-#ifdef BN_MP_MONTGOMERY_REDUCE_C
-        /* use slower baseline Montgomery method */
-        redux = mp_montgomery_reduce;
-#else
-        err = MP_VAL;
-        goto LBL_M;
-#endif
-     }
-  } else if (redmode == 1) {
-#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
-     /* setup DR reduction for moduli of the form B**k - b */
-     mp_dr_setup(P, &mp);
-     redux = mp_dr_reduce;
-#else
-     err = MP_VAL;
-     goto LBL_M;
-#endif
-  } else {
-#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
-     /* setup DR reduction for moduli of the form 2**k - b */
-     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
-        goto LBL_M;
-     }
-     redux = mp_reduce_2k;
-#else
-     err = MP_VAL;
-     goto LBL_M;
-#endif
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto LBL_M;
-  }
-
-  /* create M table
-   *
-
-   *
-   * The first half of the table is not computed though accept for M[0] and M[1]
-   */
-
-  if (redmode == 0) {
-#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
-     /* now we need R mod m */
-     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
-       goto LBL_RES;
-     }
-#else
-     err = MP_VAL;
-     goto LBL_RES;
-#endif
-
-     /* now set M[1] to G * R mod m */
-     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
-       goto LBL_RES;
-     }
-  } else {
-     mp_set(&res, 1);
-     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-        goto LBL_RES;
-     }
-  }
-
-  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto LBL_RES;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto LBL_RES;
-    }
-    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
-      goto LBL_RES;
-    }
-  }
-
-  /* create upper table */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto LBL_RES;
-    }
-    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
-      goto LBL_RES;
-    }
-  }
-
-  /* set initial mode and bit cnt */
-  mode   = 0;
-  bitcnt = 1;
-  buf    = 0;
-  digidx = X->used - 1;
-  bitcpy = 0;
-  bitbuf = 0;
-
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      /* if digidx == -1 we are out of digits so break */
-      if (digidx == -1) {
-        break;
-      }
-      /* read next digit and reset bitcnt */
-      buf    = X->dp[digidx--];
-      bitcnt = (int)DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y     = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
-    buf <<= (mp_digit)1;
-
-    /* if the bit is zero and mode == 0 then we ignore it
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0) {
-      continue;
-    }
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode    = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto LBL_RES;
-        }
-        if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto LBL_RES;
-        }
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-
-      /* empty window and reset */
-      bitcpy = 0;
-      bitbuf = 0;
-      mode   = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-
-      /* get next bit of the window */
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-        /* then multiply */
-        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto LBL_RES;
-        }
-        if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto LBL_RES;
-        }
-      }
-    }
-  }
-
-  if (redmode == 0) {
-     /* fixup result if Montgomery reduction is used
-      * recall that any value in a Montgomery system is
-      * actually multiplied by R mod n.  So we have
-      * to reduce one more time to cancel out the factor
-      * of R.
-      */
-     if ((err = redux(&res, P, mp)) != MP_OKAY) {
-       goto LBL_RES;
-     }
-  }
-
-  /* swap res with Y */
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-LBL_RES:mp_clear (&res);
-LBL_M:
-  mp_clear(&M[1]);
-  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
-#endif
-
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_exptmod_fast.c */
-
-/* Start: bn_mp_exteuclid.c */
-#include <tommath.h>
-#ifdef BN_MP_EXTEUCLID_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* Extended euclidean algorithm of (a, b) produces
-   a*u1 + b*u2 = u3
- */
-int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
-{
-   mp_int u1,u2,u3,v1,v2,v3,t1,t2,t3,q,tmp;
-   int err;
-
-   if ((err = mp_init_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL)) != MP_OKAY) {
-      return err;
-   }
-
-   /* initialize, (u1,u2,u3) = (1,0,a) */
-   mp_set(&u1, 1);
-   if ((err = mp_copy(a, &u3)) != MP_OKAY)                                        { goto _ERR; }
-
-   /* initialize, (v1,v2,v3) = (0,1,b) */
-   mp_set(&v2, 1);
-   if ((err = mp_copy(b, &v3)) != MP_OKAY)                                        { goto _ERR; }
-
-   /* loop while v3 != 0 */
-   while (mp_iszero(&v3) == MP_NO) {
-       /* q = u3/v3 */
-       if ((err = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY)                         { goto _ERR; }
-
-       /* (t1,t2,t3) = (u1,u2,u3) - (v1,v2,v3)q */
-       if ((err = mp_mul(&v1, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u1, &tmp, &t1)) != MP_OKAY)                             { goto _ERR; }
-       if ((err = mp_mul(&v2, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u2, &tmp, &t2)) != MP_OKAY)                             { goto _ERR; }
-       if ((err = mp_mul(&v3, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u3, &tmp, &t3)) != MP_OKAY)                             { goto _ERR; }
-
-       /* (u1,u2,u3) = (v1,v2,v3) */
-       if ((err = mp_copy(&v1, &u1)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&v2, &u2)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&v3, &u3)) != MP_OKAY)                                  { goto _ERR; }
-
-       /* (v1,v2,v3) = (t1,t2,t3) */
-       if ((err = mp_copy(&t1, &v1)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&t2, &v2)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
-   }
-
-   /* make sure U3 >= 0 */
-   if (u3.sign == MP_NEG) {
-      mp_neg(&u1, &u1);
-      mp_neg(&u2, &u2);
-      mp_neg(&u3, &u3);
-   }
-
-   /* copy result out */
-   if (U1 != NULL) { mp_exch(U1, &u1); }
-   if (U2 != NULL) { mp_exch(U2, &u2); }
-   if (U3 != NULL) { mp_exch(U3, &u3); }
-
-   err = MP_OKAY;
-_ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
-   return err;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_exteuclid.c */
-
-/* Start: bn_mp_fread.c */
-#include <tommath.h>
-#ifdef BN_MP_FREAD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* read a bigint from a file stream in ASCII */
-int mp_fread(mp_int *a, int radix, FILE *stream)
-{
-   int err, ch, neg, y;
-
-   /* clear a */
-   mp_zero(a);
-
-   /* if first digit is - then set negative */
-   ch = fgetc(stream);
-   if (ch == '-') {
-      neg = MP_NEG;
-      ch = fgetc(stream);
-   } else {
-      neg = MP_ZPOS;
-   }
-
-   for (;;) {
-      /* find y in the radix map */
-      for (y = 0; y < radix; y++) {
-          if (mp_s_rmap[y] == ch) {
-             break;
-          }
-      }
-      if (y == radix) {
-         break;
-      }
-
-      /* shift up and add */
-      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
-         return err;
-      }
-      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
-         return err;
-      }
-
-      ch = fgetc(stream);
-   }
-   if (mp_cmp_d(a, 0) != MP_EQ) {
-      a->sign = neg;
-   }
-
-   return MP_OKAY;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_fread.c */
-
-/* Start: bn_mp_fwrite.c */
-#include <tommath.h>
-#ifdef BN_MP_FWRITE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-int mp_fwrite(mp_int *a, int radix, FILE *stream)
-{
-   char *buf;
-   int err, len, x;
-
-   if ((err = mp_radix_size(a, radix, &len)) != MP_OKAY) {
-      return err;
-   }
-
-   buf = OPT_CAST(char) XMALLOC (len);
-   if (buf == NULL) {
-      return MP_MEM;
-   }
-
-   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
-      XFREE (buf);
-      return err;
-   }
-
-   for (x = 0; x < len; x++) {
-       if (fputc(buf[x], stream) == EOF) {
-          XFREE (buf);
-          return MP_VAL;
-       }
-   }
-
-   XFREE (buf);
-   return MP_OKAY;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_fwrite.c */
-
-/* Start: bn_mp_gcd.c */
-#include <tommath.h>
-#ifdef BN_MP_GCD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* Greatest Common Divisor using the binary method */
-int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  u, v;
-  int     k, u_lsb, v_lsb, res;
-
-  /* either zero than gcd is the largest */
-  if (mp_iszero (a) == MP_YES) {
-    return mp_abs (b, c);
-  }
-  if (mp_iszero (b) == MP_YES) {
-    return mp_abs (a, c);
-  }
-
-  /* get copies of a and b we can modify */
-  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
-    goto LBL_U;
-  }
-
-  /* must be positive for the remainder of the algorithm */
-  u.sign = v.sign = MP_ZPOS;
-
-  /* B1.  Find the common power of two for u and v */
-  u_lsb = mp_cnt_lsb(&u);
-  v_lsb = mp_cnt_lsb(&v);
-  k     = MIN(u_lsb, v_lsb);
-
-  if (k > 0) {
-     /* divide the power of two out */
-     if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
-        goto LBL_V;
-     }
-
-     if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
-        goto LBL_V;
-     }
-  }
-
-  /* divide any remaining factors of two out */
-  if (u_lsb != k) {
-     if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
-        goto LBL_V;
-     }
-  }
-
-  if (v_lsb != k) {
-     if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
-        goto LBL_V;
-     }
-  }
-
-  while (mp_iszero(&v) == 0) {
-     /* make sure v is the largest */
-     if (mp_cmp_mag(&u, &v) == MP_GT) {
-        /* swap u and v to make sure v is >= u */
-        mp_exch(&u, &v);
-     }
-
-     /* subtract smallest from largest */
-     if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
-        goto LBL_V;
-     }
-
-     /* Divide out all factors of two */
-     if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
-        goto LBL_V;
-     }
-  }
-
-  /* multiply by 2**k which we divided out at the beginning */
-  if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
-     goto LBL_V;
-  }
-  c->sign = MP_ZPOS;
-  res = MP_OKAY;
-LBL_V:mp_clear (&u);
-LBL_U:mp_clear (&v);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_gcd.c */
-
-/* Start: bn_mp_get_int.c */
-#include <tommath.h>
-#ifdef BN_MP_GET_INT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* get the lower 32-bits of an mp_int */
-unsigned long mp_get_int(mp_int * a)
-{
-  int i;
-  unsigned long res;
-
-  if (a->used == 0) {
-     return 0;
-  }
-
-  /* get number of digits of the lsb we have to read */
-  i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
-
-  /* get most significant digit of result */
-  res = DIGIT(a,i);
-
-  while (--i >= 0) {
-    res = (res << DIGIT_BIT) | DIGIT(a,i);
-  }
-
-  /* force result to 32-bits always so it is consistent on non 32-bit platforms */
-  return res & 0xFFFFFFFFUL;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_get_int.c */
-
-/* Start: bn_mp_grow.c */
-#include <tommath.h>
-#ifdef BN_MP_GROW_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* grow as required */
-int mp_grow (mp_int * a, int size)
-{
-  int     i;
-  mp_digit *tmp;
-
-  /* if the alloc size is smaller alloc more ram */
-  if (a->alloc < size) {
-    /* ensure there are always at least MP_PREC digits extra on top */
-    size += (MP_PREC * 2) - (size % MP_PREC);
-
-    /* reallocate the array a->dp
-     *
-     * We store the return in a temporary variable
-     * in case the operation failed we don't want
-     * to overwrite the dp member of a.
-     */
-    tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
-    if (tmp == NULL) {
-      /* reallocation failed but "a" is still valid [can be freed] */
-      return MP_MEM;
-    }
-
-    /* reallocation succeeded so set a->dp */
-    a->dp = tmp;
-
-    /* zero excess digits */
-    i        = a->alloc;
-    a->alloc = size;
-    for (; i < a->alloc; i++) {
-      a->dp[i] = 0;
-    }
-  }
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_grow.c */
-
-/* Start: bn_mp_init.c */
-#include <tommath.h>
-#ifdef BN_MP_INIT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* init a new mp_int */
-int mp_init (mp_int * a)
-{
-  int i;
-
-  /* allocate memory required and clear it */
-  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
-  if (a->dp == NULL) {
-    return MP_MEM;
-  }
-
-  /* set the digits to zero */
-  for (i = 0; i < MP_PREC; i++) {
-      a->dp[i] = 0;
-  }
-
-  /* set the used to zero, allocated digits to the default precision
-   * and sign to positive */
-  a->used  = 0;
-  a->alloc = MP_PREC;
-  a->sign  = MP_ZPOS;
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_init.c */
-
-/* Start: bn_mp_init_copy.c */
-#include <tommath.h>
-#ifdef BN_MP_INIT_COPY_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* creates "a" then copies b into it */
-int mp_init_copy (mp_int * a, mp_int * b)
-{
-  int     res;
-
-  if ((res = mp_init (a)) != MP_OKAY) {
-    return res;
-  }
-  return mp_copy (b, a);
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_init_copy.c */
-
-/* Start: bn_mp_init_multi.c */
-#include <tommath.h>
-#ifdef BN_MP_INIT_MULTI_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-#include <stdarg.h>
-
-int mp_init_multi(mp_int *mp, ...)
-{
-    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
-    int n = 0;                 /* Number of ok inits */
-    mp_int* cur_arg = mp;
-    va_list args;
-
-    va_start(args, mp);        /* init args to next argument from caller */
-    while (cur_arg != NULL) {
-        if (mp_init(cur_arg) != MP_OKAY) {
-            /* Oops - error! Back-track and mp_clear what we already
-               succeeded in init-ing, then return error.
-            */
-            va_list clean_args;
-
-            /* end the current list */
-            va_end(args);
-
-            /* now start cleaning up */
-            cur_arg = mp;
-            va_start(clean_args, mp);
-            while (n--) {
-                mp_clear(cur_arg);
-                cur_arg = va_arg(clean_args, mp_int*);
-            }
-            va_end(clean_args);
-            res = MP_MEM;
-            break;
-        }
-        n++;
-        cur_arg = va_arg(args, mp_int*);
-    }
-    va_end(args);
-    return res;                /* Assumed ok, if error flagged above. */
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_init_multi.c */
-
-/* Start: bn_mp_init_set.c */
-#include <tommath.h>
-#ifdef BN_MP_INIT_SET_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* initialize and set a digit */
-int mp_init_set (mp_int * a, mp_digit b)
-{
-  int err;
-  if ((err = mp_init(a)) != MP_OKAY) {
-     return err;
-  }
-  mp_set(a, b);
-  return err;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_init_set.c */
-
-/* Start: bn_mp_init_set_int.c */
-#include <tommath.h>
-#ifdef BN_MP_INIT_SET_INT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* initialize and set a digit */
-int mp_init_set_int (mp_int * a, unsigned long b)
-{
-  int err;
-  if ((err = mp_init(a)) != MP_OKAY) {
-     return err;
-  }
-  return mp_set_int(a, b);
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_init_set_int.c */
-
-/* Start: bn_mp_init_size.c */
-#include <tommath.h>
-#ifdef BN_MP_INIT_SIZE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* init an mp_init for a given size */
-int mp_init_size (mp_int * a, int size)
-{
-  int x;
-
-  /* pad size so there are always extra digits */
-  size += (MP_PREC * 2) - (size % MP_PREC);	
-
-  /* alloc mem */
-  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
-  if (a->dp == NULL) {
-    return MP_MEM;
-  }
-
-  /* set the members */
-  a->used  = 0;
-  a->alloc = size;
-  a->sign  = MP_ZPOS;
-
-  /* zero the digits */
-  for (x = 0; x < size; x++) {
-      a->dp[x] = 0;
-  }
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_init_size.c */
-
-/* Start: bn_mp_invmod.c */
-#include <tommath.h>
-#ifdef BN_MP_INVMOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* hac 14.61, pp608 */
-int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  /* b cannot be negative */
-  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
-    return MP_VAL;
-  }
-
-#ifdef BN_FAST_MP_INVMOD_C
-  /* if the modulus is odd we can use a faster routine instead */
-  if (mp_isodd (b) == 1) {
-    return fast_mp_invmod (a, b, c);
-  }
-#endif
-
-#ifdef BN_MP_INVMOD_SLOW_C
-  return mp_invmod_slow(a, b, c);
-#endif
-
-  return MP_VAL;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_invmod.c */
-
-/* Start: bn_mp_invmod_slow.c */
-#include <tommath.h>
-#ifdef BN_MP_INVMOD_SLOW_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* hac 14.61, pp608 */
-int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x, y, u, v, A, B, C, D;
-  int     res;
-
-  /* b cannot be negative */
-  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
-    return MP_VAL;
-  }
-
-  /* init temps */
-  if ((res = mp_init_multi(&x, &y, &u, &v,
-                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
-     return res;
-  }
-
-  /* x = a, y = b */
-  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
-      goto LBL_ERR;
-  }
-  if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto LBL_ERR;
-  }
-
-  /* 2. [modified] if x,y are both even then return an error! */
-  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
-    res = MP_VAL;
-    goto LBL_ERR;
-  }
-
-  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto LBL_ERR;
-  }
-  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto LBL_ERR;
-  }
-  mp_set (&A, 1);
-  mp_set (&D, 1);
-
-top:
-  /* 4.  while u is even do */
-  while (mp_iseven (&u) == 1) {
-    /* 4.1 u = u/2 */
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-    /* 4.2 if A or B is odd then */
-    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
-      /* A = (A+y)/2, B = (B-x)/2 */
-      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-    }
-    /* A = A/2, B = B/2 */
-    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  }
-
-  /* 5.  while v is even do */
-  while (mp_iseven (&v) == 1) {
-    /* 5.1 v = v/2 */
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-    /* 5.2 if C or D is odd then */
-    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
-      /* C = (C+y)/2, D = (D-x)/2 */
-      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-    }
-    /* C = C/2, D = D/2 */
-    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  }
-
-  /* 6.  if u >= v then */
-  if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, A = A - C, B = B - D */
-    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-
-    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-
-    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  } else {
-    /* v - v - u, C = C - A, D = D - B */
-    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-
-    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-
-    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto LBL_ERR;
-    }
-  }
-
-  /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0)
-    goto top;
-
-  /* now a = C, b = D, gcd == g*v */
-
-  /* if v != 1 then there is no inverse */
-  if (mp_cmp_d (&v, 1) != MP_EQ) {
-    res = MP_VAL;
-    goto LBL_ERR;
-  }
-
-  /* if its too low */
-  while (mp_cmp_d(&C, 0) == MP_LT) {
-      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-  }
-
-  /* too big */
-  while (mp_cmp_mag(&C, b) != MP_LT) {
-      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-  }
-
-  /* C is now the inverse */
-  mp_exch (&C, c);
-  res = MP_OKAY;
-LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_invmod_slow.c */
-
-/* Start: bn_mp_is_square.c */
-#include <tommath.h>
-#ifdef BN_MP_IS_SQUARE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* Check if remainders are possible squares - fast exclude non-squares */
-static const char rem_128[128] = {
- 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
- 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
- 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
- 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
- 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
- 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
- 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
- 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1
-};
-
-static const char rem_105[105] = {
- 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
- 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
- 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
- 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
- 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
- 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
-};
-
-/* Store non-zero to ret if arg is square, and zero if not */
-int mp_is_square(mp_int *arg,int *ret)
-{
-  int           res;
-  mp_digit      c;
-  mp_int        t;
-  unsigned long r;
-
-  /* Default to Non-square :) */
-  *ret = MP_NO;
-
-  if (arg->sign == MP_NEG) {
-    return MP_VAL;
-  }
-
-  /* digits used?  (TSD) */
-  if (arg->used == 0) {
-     return MP_OKAY;
-  }
-
-  /* First check mod 128 (suppose that DIGIT_BIT is at least 7) */
-  if (rem_128[127 & DIGIT(arg,0)] == 1) {
-     return MP_OKAY;
-  }
-
-  /* Next check mod 105 (3*5*7) */
-  if ((res = mp_mod_d(arg,105,&c)) != MP_OKAY) {
-     return res;
-  }
-  if (rem_105[c] == 1) {
-     return MP_OKAY;
-  }
-
-
-  if ((res = mp_init_set_int(&t,11L*13L*17L*19L*23L*29L*31L)) != MP_OKAY) {
-     return res;
-  }
-  if ((res = mp_mod(arg,&t,&t)) != MP_OKAY) {
-     goto ERR;
-  }
-  r = mp_get_int(&t);
-  /* Check for other prime modules, note it's not an ERROR but we must
-   * free "t" so the easiest way is to goto ERR.  We know that res
-   * is already equal to MP_OKAY from the mp_mod call
-   */
-  if ( (1L<<(r%11)) & 0x5C4L )             goto ERR;
-  if ( (1L<<(r%13)) & 0x9E4L )             goto ERR;
-  if ( (1L<<(r%17)) & 0x5CE8L )            goto ERR;
-  if ( (1L<<(r%19)) & 0x4F50CL )           goto ERR;
-  if ( (1L<<(r%23)) & 0x7ACCA0L )          goto ERR;
-  if ( (1L<<(r%29)) & 0xC2EDD0CL )         goto ERR;
-  if ( (1L<<(r%31)) & 0x6DE2B848L )        goto ERR;
-
-  /* Final check - is sqr(sqrt(arg)) == arg ? */
-  if ((res = mp_sqrt(arg,&t)) != MP_OKAY) {
-     goto ERR;
-  }
-  if ((res = mp_sqr(&t,&t)) != MP_OKAY) {
-     goto ERR;
-  }
-
-  *ret = (mp_cmp_mag(&t,arg) == MP_EQ) ? MP_YES : MP_NO;
-ERR:mp_clear(&t);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_is_square.c */
-
-/* Start: bn_mp_jacobi.c */
-#include <tommath.h>
-#ifdef BN_MP_JACOBI_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* computes the jacobi c = (a | n) (or Legendre if n is prime)
- * HAC pp. 73 Algorithm 2.149
- */
-int mp_jacobi (mp_int * a, mp_int * p, int *c)
-{
-  mp_int  a1, p1;
-  int     k, s, r, res;
-  mp_digit residue;
-
-  /* if p <= 0 return MP_VAL */
-  if (mp_cmp_d(p, 0) != MP_GT) {
-     return MP_VAL;
-  }
-
-  /* step 1.  if a == 0, return 0 */
-  if (mp_iszero (a) == 1) {
-    *c = 0;
-    return MP_OKAY;
-  }
-
-  /* step 2.  if a == 1, return 1 */
-  if (mp_cmp_d (a, 1) == MP_EQ) {
-    *c = 1;
-    return MP_OKAY;
-  }
-
-  /* default */
-  s = 0;
-
-  /* step 3.  write a = a1 * 2**k  */
-  if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&p1)) != MP_OKAY) {
-    goto LBL_A1;
-  }
-
-  /* divide out larger power of two */
-  k = mp_cnt_lsb(&a1);
-  if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) {
-     goto LBL_P1;
-  }
-
-  /* step 4.  if e is even set s=1 */
-  if ((k & 1) == 0) {
-    s = 1;
-  } else {
-    /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
-    residue = p->dp[0] & 7;
-
-    if (residue == 1 || residue == 7) {
-      s = 1;
-    } else if (residue == 3 || residue == 5) {
-      s = -1;
-    }
-  }
-
-  /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-  if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) {
-    s = -s;
-  }
-
-  /* if a1 == 1 we're done */
-  if (mp_cmp_d (&a1, 1) == MP_EQ) {
-    *c = s;
-  } else {
-    /* n1 = n mod a1 */
-    if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
-      goto LBL_P1;
-    }
-    if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
-      goto LBL_P1;
-    }
-    *c = s * r;
-  }
-
-  /* done */
-  res = MP_OKAY;
-LBL_P1:mp_clear (&p1);
-LBL_A1:mp_clear (&a1);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_jacobi.c */
-
-/* Start: bn_mp_karatsuba_mul.c */
-#include <tommath.h>
-#ifdef BN_MP_KARATSUBA_MUL_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* c = |a| * |b| using Karatsuba Multiplication using
- * three half size multiplications
- *
- * Let B represent the radix [e.g. 2**DIGIT_BIT] and
- * let n represent half of the number of digits in
- * the min(a,b)
- *
- * a = a1 * B**n + a0
- * b = b1 * B**n + b0
- *
- * Then, a * b =>
-   a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B + a0b0
- *
- * Note that a1b1 and a0b0 are used twice and only need to be
- * computed once.  So in total three half size (half # of
- * digit) multiplications are performed, a0b0, a1b1 and
- * (a1+b1)(a0+b0)
- *
- * Note that a multiplication of half the digits requires
- * 1/4th the number of single precision multiplications so in
- * total after one call 25% of the single precision multiplications
- * are saved.  Note also that the call to mp_mul can end up back
- * in this function if the a0, a1, b0, or b1 are above the threshold.
- * This is known as divide-and-conquer and leads to the famous
- * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than
- * the standard O(N**2) that the baseline/comba methods use.
- * Generally though the overhead of this method doesn't pay off
- * until a certain size (N ~ 80) is reached.
- */
-int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
-  int     B, err;
-
-  /* default the return code to an error */
-  err = MP_MEM;
-
-  /* min # of digits */
-  B = MIN (a->used, b->used);
-
-  /* now divide in two */
-  B = B >> 1;
-
-  /* init copy all the temps */
-  if (mp_init_size (&x0, B) != MP_OKAY)
-    goto ERR;
-  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-    goto X0;
-  if (mp_init_size (&y0, B) != MP_OKAY)
-    goto X1;
-  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
-    goto Y0;
-
-  /* init temps */
-  if (mp_init_size (&t1, B * 2) != MP_OKAY)
-    goto Y1;
-  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
-    goto T1;
-  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
-    goto X0Y0;
-
-  /* now shift the digits */
-  x0.used = y0.used = B;
-  x1.used = a->used - B;
-  y1.used = b->used - B;
-
-  {
-    register int x;
-    register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
-
-    /* we copy the digits directly instead of using higher level functions
-     * since we also need to shift the digits
-     */
-    tmpa = a->dp;
-    tmpb = b->dp;
-
-    tmpx = x0.dp;
-    tmpy = y0.dp;
-    for (x = 0; x < B; x++) {
-      *tmpx++ = *tmpa++;
-      *tmpy++ = *tmpb++;
-    }
-
-    tmpx = x1.dp;
-    for (x = B; x < a->used; x++) {
-      *tmpx++ = *tmpa++;
-    }
-
-    tmpy = y1.dp;
-    for (x = B; x < b->used; x++) {
-      *tmpy++ = *tmpb++;
-    }
-  }
-
-  /* only need to clamp the lower words since by definition the
-   * upper words x1/y1 must have a known number of digits
-   */
-  mp_clamp (&x0);
-  mp_clamp (&y0);
-
-  /* now calc the products x0y0 and x1y1 */
-  /* after this x0 is no longer required, free temp [x0==t2]! */
-  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
-    goto X1Y1;          /* x0y0 = x0*y0 */
-  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
-    goto X1Y1;          /* x1y1 = x1*y1 */
-
-  /* now calc x1+x0 and y1+y0 */
-  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x1 - x0 */
-  if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
-    goto X1Y1;          /* t2 = y1 - y0 */
-  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
-
-  /* add x0y0 */
-  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
-    goto X1Y1;          /* t2 = x0y0 + x1y1 */
-  if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
-
-  /* shift by B */
-  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
-    goto X1Y1;          /* x1y1 = x1y1 << 2*B */
-
-  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + t1 */
-  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
-    goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
-
-  /* Algorithm succeeded set the return code to MP_OKAY */
-  err = MP_OKAY;
-
-X1Y1:mp_clear (&x1y1);
-X0Y0:mp_clear (&x0y0);
-T1:mp_clear (&t1);
-Y1:mp_clear (&y1);
-Y0:mp_clear (&y0);
-X1:mp_clear (&x1);
-X0:mp_clear (&x0);
-ERR:
-  return err;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_karatsuba_mul.c */
-
-/* Start: bn_mp_karatsuba_sqr.c */
-#include <tommath.h>
-#ifdef BN_MP_KARATSUBA_SQR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* Karatsuba squaring, computes b = a*a using three
- * half size squarings
- *
- * See comments of karatsuba_mul for details.  It
- * is essentially the same algorithm but merely
- * tuned to perform recursive squarings.
- */
-int mp_karatsuba_sqr (mp_int * a, mp_int * b)
-{
-  mp_int  x0, x1, t1, t2, x0x0, x1x1;
-  int     B, err;
-
-  err = MP_MEM;
-
-  /* min # of digits */
-  B = a->used;
-
-  /* now divide in two */
-  B = B >> 1;
-
-  /* init copy all the temps */
-  if (mp_init_size (&x0, B) != MP_OKAY)
-    goto ERR;
-  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-    goto X0;
-
-  /* init temps */
-  if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
-    goto X1;
-  if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
-    goto T1;
-  if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
-    goto T2;
-  if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
-    goto X0X0;
-
-  {
-    register int x;
-    register mp_digit *dst, *src;
-
-    src = a->dp;
-
-    /* now shift the digits */
-    dst = x0.dp;
-    for (x = 0; x < B; x++) {
-      *dst++ = *src++;
-    }
-
-    dst = x1.dp;
-    for (x = B; x < a->used; x++) {
-      *dst++ = *src++;
-    }
-  }
-
-  x0.used = B;
-  x1.used = a->used - B;
-
-  mp_clamp (&x0);
-
-  /* now calc the products x0*x0 and x1*x1 */
-  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-    goto X1X1;           /* x0x0 = x0*x0 */
-  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-    goto X1X1;           /* x1x1 = x1*x1 */
-
-  /* now calc (x1+x0)**2 */
-  if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x1 - x0 */
-  if (mp_sqr (&t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
-
-  /* add x0y0 */
-  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-    goto X1X1;           /* t2 = x0x0 + x1x1 */
-  if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
-
-  /* shift by B */
-  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
-  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-    goto X1X1;           /* x1x1 = x1x1 << 2*B */
-
-  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + t1 */
-  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-    goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
-
-  err = MP_OKAY;
-
-X1X1:mp_clear (&x1x1);
-X0X0:mp_clear (&x0x0);
-T2:mp_clear (&t2);
-T1:mp_clear (&t1);
-X1:mp_clear (&x1);
-X0:mp_clear (&x0);
-ERR:
-  return err;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_karatsuba_sqr.c */
-
-/* Start: bn_mp_lcm.c */
-#include <tommath.h>
-#ifdef BN_MP_LCM_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* computes least common multiple as |a*b|/(a, b) */
-int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res;
-  mp_int  t1, t2;
-
-
-  if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
-    return res;
-  }
-
-  /* t1 = get the GCD of the two inputs */
-  if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
-    goto LBL_T;
-  }
-
-  /* divide the smallest by the GCD */
-  if (mp_cmp_mag(a, b) == MP_LT) {
-     /* store quotient in t2 such that t2 * b is the LCM */
-     if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
-        goto LBL_T;
-     }
-     res = mp_mul(b, &t2, c);
-  } else {
-     /* store quotient in t2 such that t2 * a is the LCM */
-     if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
-        goto LBL_T;
-     }
-     res = mp_mul(a, &t2, c);
-  }
-
-  /* fix the sign to positive */
-  c->sign = MP_ZPOS;
-
-LBL_T:
-  mp_clear_multi (&t1, &t2, NULL);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_lcm.c */
-
-/* Start: bn_mp_lshd.c */
-#include <tommath.h>
-#ifdef BN_MP_LSHD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* shift left a certain amount of digits */
-int mp_lshd (mp_int * a, int b)
-{
-  int     x, res;
-
-  /* if its less than zero return */
-  if (b <= 0) {
-    return MP_OKAY;
-  }
-
-  /* grow to fit the new digits */
-  if (a->alloc < a->used + b) {
-     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
-       return res;
-     }
-  }
-
-  {
-    register mp_digit *top, *bottom;
-
-    /* increment the used by the shift amount then copy upwards */
-    a->used += b;
-
-    /* top */
-    top = a->dp + a->used - 1;
-
-    /* base */
-    bottom = a->dp + a->used - 1 - b;
-
-    /* much like mp_rshd this is implemented using a sliding window
-     * except the window goes the otherway around.  Copying from
-     * the bottom to the top.  see bn_mp_rshd.c for more info.
-     */
-    for (x = a->used - 1; x >= b; x--) {
-      *top-- = *bottom--;
-    }
-
-    /* zero the lower digits */
-    top = a->dp;
-    for (x = 0; x < b; x++) {
-      *top++ = 0;
-    }
-  }
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_lshd.c */
-
-/* Start: bn_mp_mod.c */
-#include <tommath.h>
-#ifdef BN_MP_MOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* c = a mod b, 0 <= c < b */
-int
-mp_mod (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  t;
-  int     res;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  if (t.sign != b->sign) {
-    res = mp_add (b, &t, c);
-  } else {
-    res = MP_OKAY;
-    mp_exch (&t, c);
-  }
-
-  mp_clear (&t);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_mod.c */
-
-/* Start: bn_mp_mod_2d.c */
-#include <tommath.h>
-#ifdef BN_MP_MOD_2D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* calc a value mod 2**b */
-int
-mp_mod_2d (mp_int * a, int b, mp_int * c)
-{
-  int     x, res;
-
-  /* if b is <= 0 then zero the int */
-  if (b <= 0) {
-    mp_zero (c);
-    return MP_OKAY;
-  }
-
-  /* if the modulus is larger than the value than return */
-  if (b >= (int) (a->used * DIGIT_BIT)) {
-    res = mp_copy (a, c);
-    return res;
-  }
-
-  /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    return res;
-  }
-
-  /* zero digits above the last digit of the modulus */
-  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
-    c->dp[x] = 0;
-  }
-  /* clear the digit that is not completely outside/inside the modulus */
-  c->dp[b / DIGIT_BIT] &=
-    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
-  mp_clamp (c);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_mod_2d.c */
-
-/* Start: bn_mp_mod_d.c */
-#include <tommath.h>
-#ifdef BN_MP_MOD_D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-int
-mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
-{
-  return mp_div_d(a, b, NULL, c);
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_mod_d.c */
-
-/* Start: bn_mp_montgomery_calc_normalization.c */
-#include <tommath.h>
-#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/*
- * shifts with subtractions when the result is greater than b.
- *
- * The method is slightly modified to shift B unconditionally upto just under
- * the leading bit of b.  This saves alot of multiple precision shifting.
- */
-int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
-{
-  int     x, bits, res;
-
-  /* how many bits of last digit does b use */
-  bits = mp_count_bits (b) % DIGIT_BIT;
-
-  if (b->used > 1) {
-     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
-        return res;
-     }
-  } else {
-     mp_set(a, 1);
-     bits = 1;
-  }
-
-
-  /* now compute C = A * B mod b */
-  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
-    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
-      return res;
-    }
-    if (mp_cmp_mag (a, b) != MP_LT) {
-      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
-        return res;
-      }
-    }
-  }
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_montgomery_calc_normalization.c */
-
-/* Start: bn_mp_montgomery_reduce.c */
-#include <tommath.h>
-#ifdef BN_MP_MONTGOMERY_REDUCE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* computes xR**-1 == x (mod N) via Montgomery Reduction */
-int
-mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-{
-  int     ix, res, digs;
-  mp_digit mu;
-
-  /* can the fast reduction [comba] method be used?
-   *
-   * Note that unlike in mul you're safely allowed *less*
-   * than the available columns [255 per default] since carries
-   * are fixed up in the inner loop.
-   */
-  digs = n->used * 2 + 1;
-  if ((digs < MP_WARRAY) &&
-      n->used <
-      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_mp_montgomery_reduce (x, n, rho);
-  }
-
-  /* grow the input as required */
-  if (x->alloc < digs) {
-    if ((res = mp_grow (x, digs)) != MP_OKAY) {
-      return res;
-    }
-  }
-  x->used = digs;
-
-  for (ix = 0; ix < n->used; ix++) {
-    /* mu = ai * rho mod b
-     *
-     * The value of rho must be precalculated via
-     * montgomery_setup() such that
-     * it equals -1/n0 mod b this allows the
-     * following inner loop to reduce the
-     * input one digit at a time
-     */
-    mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
-
-    /* a = a + mu * m * b**i */
-    {
-      register int iy;
-      register mp_digit *tmpn, *tmpx, u;
-      register mp_word r;
-
-      /* alias for digits of the modulus */
-      tmpn = n->dp;
-
-      /* alias for the digits of x [the input] */
-      tmpx = x->dp + ix;
-
-      /* set the carry to zero */
-      u = 0;
-
-      /* Multiply and add in place */
-      for (iy = 0; iy < n->used; iy++) {
-        /* compute product and sum */
-        r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
-                  ((mp_word) u) + ((mp_word) * tmpx);
-
-        /* get carry */
-        u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-
-        /* fix digit */
-        *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
-      }
-      /* At this point the ix'th digit of x should be zero */
-
-
-      /* propagate carries upwards as required*/
-      while (u) {
-        *tmpx   += u;
-        u        = *tmpx >> DIGIT_BIT;
-        *tmpx++ &= MP_MASK;
-      }
-    }
-  }
-
-  /* at this point the n.used'th least
-   * significant digits of x are all zero
-   * which means we can shift x to the
-   * right by n.used digits and the
-   * residue is unchanged.
-   */
-
-  /* x = x/b**n.used */
-  mp_clamp(x);
-  mp_rshd (x, n->used);
-
-  /* if x >= n then x = x - n */
-  if (mp_cmp_mag (x, n) != MP_LT) {
-    return s_mp_sub (x, n, x);
-  }
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_montgomery_reduce.c */
-
-/* Start: bn_mp_montgomery_setup.c */
-#include <tommath.h>
-#ifdef BN_MP_MONTGOMERY_SETUP_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* setups the montgomery reduction stuff */
-int
-mp_montgomery_setup (mp_int * n, mp_digit * rho)
-{
-  mp_digit x, b;
-
-/* fast inversion mod 2**k
- *
- * Based on the fact that
- *
- * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
- *                    =>  2*X*A - X*X*A*A = 1
- *                    =>  2*(1) - (1)     = 1
- */
-  b = n->dp[0];
-
-  if ((b & 1) == 0) {
-    return MP_VAL;
-  }
-
-  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
-#if !defined(MP_8BIT)
-  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
-#endif
-#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
-  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
-#endif
-#ifdef MP_64BIT
-  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
-#endif
-
-  /* rho = -1/m mod b */
-  *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_montgomery_setup.c */
-
-/* Start: bn_mp_mul.c */
-#include <tommath.h>
-#ifdef BN_MP_MUL_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* high level multiplication (handles sign) */
-int mp_mul (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, neg;
-  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-
-  /* use Toom-Cook? */
-#ifdef BN_MP_TOOM_MUL_C
-  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
-    res = mp_toom_mul(a, b, c);
-  } else
-#endif
-#ifdef BN_MP_KARATSUBA_MUL_C
-  /* use Karatsuba? */
-  if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
-    res = mp_karatsuba_mul (a, b, c);
-  } else
-#endif
-  {
-    /* can we use the fast multiplier?
-     *
-     * The fast multiplier can be used if the output will
-     * have less than MP_WARRAY digits and the number of
-     * digits won't affect carry propagation
-     */
-    int     digs = a->used + b->used + 1;
-
-#ifdef BN_FAST_S_MP_MUL_DIGS_C
-    if ((digs < MP_WARRAY) &&
-        MIN(a->used, b->used) <=
-        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-      res = fast_s_mp_mul_digs (a, b, c, digs);
-    } else
-#endif
-#ifdef BN_S_MP_MUL_DIGS_C
-      res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
-#else
-      res = MP_VAL;
-#endif
-
-  }
-  c->sign = (c->used > 0) ? neg : MP_ZPOS;
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_mul.c */
-
-/* Start: bn_mp_mul_2.c */
-#include <tommath.h>
-#ifdef BN_MP_MUL_2_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* b = a*2 */
-int mp_mul_2(mp_int * a, mp_int * b)
-{
-  int     x, res, oldused;
-
-  /* grow to accomodate result */
-  if (b->alloc < a->used + 1) {
-    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  oldused = b->used;
-  b->used = a->used;
-
-  {
-    register mp_digit r, rr, *tmpa, *tmpb;
-
-    /* alias for source */
-    tmpa = a->dp;
-
-    /* alias for dest */
-    tmpb = b->dp;
-
-    /* carry */
-    r = 0;
-    for (x = 0; x < a->used; x++) {
-
-      /* get what will be the *next* carry bit from the
-       * MSB of the current digit
-       */
-      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
-
-      /* now shift up this digit, add in the carry [from the previous] */
-      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
-
-      /* copy the carry that would be from the source
-       * digit into the next iteration
-       */
-      r = rr;
-    }
-
-    /* new leading digit? */
-    if (r != 0) {
-      /* add a MSB which is always 1 at this point */
-      *tmpb = 1;
-      ++(b->used);
-    }
-
-    /* now zero any excess digits on the destination
-     * that we didn't write to
-     */
-    tmpb = b->dp + b->used;
-    for (x = b->used; x < oldused; x++) {
-      *tmpb++ = 0;
-    }
-  }
-  b->sign = a->sign;
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_mul_2.c */
-
-/* Start: bn_mp_mul_2d.c */
-#include <tommath.h>
-#ifdef BN_MP_MUL_2D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* shift left by a certain bit count */
-int mp_mul_2d (mp_int * a, int b, mp_int * c)
-{
-  mp_digit d;
-  int      res;
-
-  /* copy */
-  if (a != c) {
-     if ((res = mp_copy (a, c)) != MP_OKAY) {
-       return res;
-     }
-  }
-
-  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
-     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
-       return res;
-     }
-  }
-
-  /* shift by as many digits in the bit count */
-  if (b >= (int)DIGIT_BIT) {
-    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* shift any bit count < DIGIT_BIT */
-  d = (mp_digit) (b % DIGIT_BIT);
-  if (d != 0) {
-    register mp_digit *tmpc, shift, mask, r, rr;
-    register int x;
-
-    /* bitmask for carries */
-    mask = (((mp_digit)1) << d) - 1;
-
-    /* shift for msbs */
-    shift = DIGIT_BIT - d;
-
-    /* alias */
-    tmpc = c->dp;
-
-    /* carry */
-    r    = 0;
-    for (x = 0; x < c->used; x++) {
-      /* get the higher bits of the current word */
-      rr = (*tmpc >> shift) & mask;
-
-      /* shift the current word and OR in the carry */
-      *tmpc = ((*tmpc << d) | r) & MP_MASK;
-      ++tmpc;
-
-      /* set the carry to the carry bits of the current word */
-      r = rr;
-    }
-
-    /* set final carry */
-    if (r != 0) {
-       c->dp[(c->used)++] = r;
-    }
-  }
-  mp_clamp (c);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_mul_2d.c */
-
-/* Start: bn_mp_mul_d.c */
-#include <tommath.h>
-#ifdef BN_MP_MUL_D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* multiply by a digit */
-int
-mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_digit u, *tmpa, *tmpc;
-  mp_word  r;
-  int      ix, res, olduse;
-
-  /* make sure c is big enough to hold a*b */
-  if (c->alloc < a->used + 1) {
-    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* get the original destinations used count */
-  olduse = c->used;
-
-  /* set the sign */
-  c->sign = a->sign;
-
-  /* alias for a->dp [source] */
-  tmpa = a->dp;
-
-  /* alias for c->dp [dest] */
-  tmpc = c->dp;
-
-  /* zero carry */
-  u = 0;
-
-  /* compute columns */
-  for (ix = 0; ix < a->used; ix++) {
-    /* compute product and carry sum for this term */
-    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
-
-    /* mask off higher bits to get a single digit */
-    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-    /* send carry into next iteration */
-    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-  }
-
-  /* store final carry [if any] and increment ix offset  */
-  *tmpc++ = u;
-  ++ix;
-
-  /* now zero digits above the top */
-  while (ix++ < olduse) {
-     *tmpc++ = 0;
-  }
-
-  /* set used count */
-  c->used = a->used + 1;
-  mp_clamp(c);
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_mul_d.c */
-
-/* Start: bn_mp_mulmod.c */
-#include <tommath.h>
-#ifdef BN_MP_MULMOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* d = a * b (mod c) */
-int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_mulmod.c */
-
-/* Start: bn_mp_n_root.c */
-#include <tommath.h>
-#ifdef BN_MP_N_ROOT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* find the n'th root of an integer
- *
- * Result found such that (c)**b <= a and (c+1)**b > a
- *
- * This algorithm uses Newton's approximation
- * x[i+1] = x[i] - f(x[i])/f'(x[i])
- * which will find the root in log(N) time where
- * each step involves a fair bit.  This is not meant to
- * find huge roots [square and cube, etc].
- */
-int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_int  t1, t2, t3;
-  int     res, neg;
-
-  /* input must be positive if b is even */
-  if ((b & 1) == 0 && a->sign == MP_NEG) {
-    return MP_VAL;
-  }
-
-  if ((res = mp_init (&t1)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto LBL_T1;
-  }
-
-  if ((res = mp_init (&t3)) != MP_OKAY) {
-    goto LBL_T2;
-  }
-
-  /* if a is negative fudge the sign but keep track */
-  neg     = a->sign;
-  a->sign = MP_ZPOS;
-
-  /* t2 = 2 */
-  mp_set (&t2, 2);
-
-  do {
-    /* t1 = t2 */
-    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
-      goto LBL_T3;
-    }
-
-    /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
-
-    /* t3 = t1**(b-1) */
-    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {
-      goto LBL_T3;
-    }
-
-    /* numerator */
-    /* t2 = t1**b */
-    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {
-      goto LBL_T3;
-    }
-
-    /* t2 = t1**b - a */
-    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {
-      goto LBL_T3;
-    }
-
-    /* denominator */
-    /* t3 = t1**(b-1) * b  */
-    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {
-      goto LBL_T3;
-    }
-
-    /* t3 = (t1**b - a)/(b * t1**(b-1)) */
-    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {
-      goto LBL_T3;
-    }
-
-    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
-      goto LBL_T3;
-    }
-  }  while (mp_cmp (&t1, &t2) != MP_EQ);
-
-  /* result can be off by a few so check */
-  for (;;) {
-    if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
-      goto LBL_T3;
-    }
-
-    if (mp_cmp (&t2, a) == MP_GT) {
-      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
-         goto LBL_T3;
-      }
-    } else {
-      break;
-    }
-  }
-
-  /* reset the sign of a first */
-  a->sign = neg;
-
-  /* set the result */
-  mp_exch (&t1, c);
-
-  /* set the sign of the result */
-  c->sign = neg;
-
-  res = MP_OKAY;
-
-LBL_T3:mp_clear (&t3);
-LBL_T2:mp_clear (&t2);
-LBL_T1:mp_clear (&t1);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_n_root.c */
-
-/* Start: bn_mp_neg.c */
-#include <tommath.h>
-#ifdef BN_MP_NEG_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* b = -a */
-int mp_neg (mp_int * a, mp_int * b)
-{
-  int     res;
-  if (a != b) {
-     if ((res = mp_copy (a, b)) != MP_OKAY) {
-        return res;
-     }
-  }
-
-  if (mp_iszero(b) != MP_YES) {
-     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-  } else {
-     b->sign = MP_ZPOS;
-  }
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_neg.c */
-
-/* Start: bn_mp_or.c */
-#include <tommath.h>
-#ifdef BN_MP_OR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* OR two ints together */
-int mp_or (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-    t.dp[ix] |= x->dp[ix];
-  }
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_or.c */
-
-/* Start: bn_mp_prime_fermat.c */
-#include <tommath.h>
-#ifdef BN_MP_PRIME_FERMAT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* performs one Fermat test.
- *
- * If "a" were prime then b**a == b (mod a) since the order of
- * the multiplicative sub-group would be phi(a) = a-1.  That means
- * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
- *
- * Sets result to 1 if the congruence holds, or zero otherwise.
- */
-int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
-{
-  mp_int  t;
-  int     err;
-
-  /* default to composite  */
-  *result = MP_NO;
-
-  /* ensure b > 1 */
-  if (mp_cmp_d(b, 1) != MP_GT) {
-     return MP_VAL;
-  }
-
-  /* init t */
-  if ((err = mp_init (&t)) != MP_OKAY) {
-    return err;
-  }
-
-  /* compute t = b**a mod a */
-  if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
-    goto LBL_T;
-  }
-
-  /* is it equal to b? */
-  if (mp_cmp (&t, b) == MP_EQ) {
-    *result = MP_YES;
-  }
-
-  err = MP_OKAY;
-LBL_T:mp_clear (&t);
-  return err;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_prime_fermat.c */
-
-/* Start: bn_mp_prime_is_divisible.c */
-#include <tommath.h>
-#ifdef BN_MP_PRIME_IS_DIVISIBLE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* determines if an integers is divisible by one
- * of the first PRIME_SIZE primes or not
- *
- * sets result to 0 if not, 1 if yes
- */
-int mp_prime_is_divisible (mp_int * a, int *result)
-{
-  int     err, ix;
-  mp_digit res;
-
-  /* default to not */
-  *result = MP_NO;
-
-  for (ix = 0; ix < PRIME_SIZE; ix++) {
-    /* what is a mod LBL_prime_tab[ix] */
-    if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) {
-      return err;
-    }
-
-    /* is the residue zero? */
-    if (res == 0) {
-      *result = MP_YES;
-      return MP_OKAY;
-    }
-  }
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_prime_is_divisible.c */
-
-/* Start: bn_mp_prime_is_prime.c */
-#include <tommath.h>
-#ifdef BN_MP_PRIME_IS_PRIME_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* performs a variable number of rounds of Miller-Rabin
- *
- * Probability of error after t rounds is no more than
-
- *
- * Sets result to 1 if probably prime, 0 otherwise
- */
-int mp_prime_is_prime (mp_int * a, int t, int *result)
-{
-  mp_int  b;
-  int     ix, err, res;
-
-  /* default to no */
-  *result = MP_NO;
-
-  /* valid value of t? */
-  if (t <= 0 || t > PRIME_SIZE) {
-    return MP_VAL;
-  }
-
-  /* is the input equal to one of the primes in the table? */
-  for (ix = 0; ix < PRIME_SIZE; ix++) {
-      if (mp_cmp_d(a, ltm_prime_tab[ix]) == MP_EQ) {
-         *result = 1;
-         return MP_OKAY;
-      }
-  }
-
-  /* first perform trial division */
-  if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
-    return err;
-  }
-
-  /* return if it was trivially divisible */
-  if (res == MP_YES) {
-    return MP_OKAY;
-  }
-
-  /* now perform the miller-rabin rounds */
-  if ((err = mp_init (&b)) != MP_OKAY) {
-    return err;
-  }
-
-  for (ix = 0; ix < t; ix++) {
-    /* set the prime */
-    mp_set (&b, ltm_prime_tab[ix]);
-
-    if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
-      goto LBL_B;
-    }
-
-    if (res == MP_NO) {
-      goto LBL_B;
-    }
-  }
-
-  /* passed the test */
-  *result = MP_YES;
-LBL_B:mp_clear (&b);
-  return err;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_prime_is_prime.c */
-
-/* Start: bn_mp_prime_miller_rabin.c */
-#include <tommath.h>
-#ifdef BN_MP_PRIME_MILLER_RABIN_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* Miller-Rabin test of "a" to the base of "b" as described in
- * HAC pp. 139 Algorithm 4.24
- *
- * Sets result to 0 if definitely composite or 1 if probably prime.
- * Randomly the chance of error is no more than 1/4 and often
- * very much lower.
- */
-int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
-{
-  mp_int  n1, y, r;
-  int     s, j, err;
-
-  /* default */
-  *result = MP_NO;
-
-  /* ensure b > 1 */
-  if (mp_cmp_d(b, 1) != MP_GT) {
-     return MP_VAL;
-  }
-
-  /* get n1 = a - 1 */
-  if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
-    return err;
-  }
-  if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
-    goto LBL_N1;
-  }
-
-  /* set 2**s * r = n1 */
-  if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
-    goto LBL_N1;
-  }
-
-  /* count the number of least significant bits
-   * which are zero
-   */
-  s = mp_cnt_lsb(&r);
-
-  /* now divide n - 1 by 2**s */
-  if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
-    goto LBL_R;
-  }
-
-  /* compute y = b**r mod a */
-  if ((err = mp_init (&y)) != MP_OKAY) {
-    goto LBL_R;
-  }
-  if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
-    goto LBL_Y;
-  }
-
-  /* if y != 1 and y != n1 do */
-  if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
-    j = 1;
-    /* while j <= s-1 and y != n1 */
-    while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
-      if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-         goto LBL_Y;
-      }
-
-      /* if y == 1 then composite */
-      if (mp_cmp_d (&y, 1) == MP_EQ) {
-         goto LBL_Y;
-      }
-
-      ++j;
-    }
-
-    /* if y != n1 then composite */
-    if (mp_cmp (&y, &n1) != MP_EQ) {
-      goto LBL_Y;
-    }
-  }
-
-  /* probably prime now */
-  *result = MP_YES;
-LBL_Y:mp_clear (&y);
-LBL_R:mp_clear (&r);
-LBL_N1:mp_clear (&n1);
-  return err;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_prime_miller_rabin.c */
-
-/* Start: bn_mp_prime_next_prime.c */
-#include <tommath.h>
-#ifdef BN_MP_PRIME_NEXT_PRIME_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* finds the next prime after the number "a" using "t" trials
- * of Miller-Rabin.
- *
- * bbs_style = 1 means the prime must be congruent to 3 mod 4
- */
-int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
-{
-   int      err, res, x, y;
-   mp_digit res_tab[PRIME_SIZE], step, kstep;
-   mp_int   b;
-
-   /* ensure t is valid */
-   if (t <= 0 || t > PRIME_SIZE) {
-      return MP_VAL;
-   }
-
-   /* force positive */
-   a->sign = MP_ZPOS;
-
-   /* simple algo if a is less than the largest prime in the table */
-   if (mp_cmp_d(a, ltm_prime_tab[PRIME_SIZE-1]) == MP_LT) {
-      /* find which prime it is bigger than */
-      for (x = PRIME_SIZE - 2; x >= 0; x--) {
-          if (mp_cmp_d(a, ltm_prime_tab[x]) != MP_LT) {
-             if (bbs_style == 1) {
-                /* ok we found a prime smaller or
-                 * equal [so the next is larger]
-                 *
-                 * however, the prime must be
-                 * congruent to 3 mod 4
-                 */
-                if ((ltm_prime_tab[x + 1] & 3) != 3) {
-                   /* scan upwards for a prime congruent to 3 mod 4 */
-                   for (y = x + 1; y < PRIME_SIZE; y++) {
-                       if ((ltm_prime_tab[y] & 3) == 3) {
-                          mp_set(a, ltm_prime_tab[y]);
-                          return MP_OKAY;
-                       }
-                   }
-                }
-             } else {
-                mp_set(a, ltm_prime_tab[x + 1]);
-                return MP_OKAY;
-             }
-          }
-      }
-      /* at this point a maybe 1 */
-      if (mp_cmp_d(a, 1) == MP_EQ) {
-         mp_set(a, 2);
-         return MP_OKAY;
-      }
-      /* fall through to the sieve */
-   }
-
-   /* generate a prime congruent to 3 mod 4 or 1/3 mod 4? */
-   if (bbs_style == 1) {
-      kstep   = 4;
-   } else {
-      kstep   = 2;
-   }
-
-   /* at this point we will use a combination of a sieve and Miller-Rabin */
-
-   if (bbs_style == 1) {
-      /* if a mod 4 != 3 subtract the correct value to make it so */
-      if ((a->dp[0] & 3) != 3) {
-         if ((err = mp_sub_d(a, (a->dp[0] & 3) + 1, a)) != MP_OKAY) { return err; };
-      }
-   } else {
-      if (mp_iseven(a) == 1) {
-         /* force odd */
-         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) {
-            return err;
-         }
-      }
-   }
-
-   /* generate the restable */
-   for (x = 1; x < PRIME_SIZE; x++) {
-      if ((err = mp_mod_d(a, ltm_prime_tab[x], res_tab + x)) != MP_OKAY) {
-         return err;
-      }
-   }
-
-   /* init temp used for Miller-Rabin Testing */
-   if ((err = mp_init(&b)) != MP_OKAY) {
-      return err;
-   }
-
-   for (;;) {
-      /* skip to the next non-trivially divisible candidate */
-      step = 0;
-      do {
-         /* y == 1 if any residue was zero [e.g. cannot be prime] */
-         y     =  0;
-
-         /* increase step to next candidate */
-         step += kstep;
-
-         /* compute the new residue without using division */
-         for (x = 1; x < PRIME_SIZE; x++) {
-             /* add the step to each residue */
-             res_tab[x] += kstep;
-
-             /* subtract the modulus [instead of using division] */
-             if (res_tab[x] >= ltm_prime_tab[x]) {
-                res_tab[x]  -= ltm_prime_tab[x];
-             }
-
-             /* set flag if zero */
-             if (res_tab[x] == 0) {
-                y = 1;
-             }
-         }
-      } while (y == 1 && step < ((((mp_digit)1)<<DIGIT_BIT) - kstep));
-
-      /* add the step */
-      if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
-         goto LBL_ERR;
-      }
-
-      /* if didn't pass sieve and step == MAX then skip test */
-      if (y == 1 && step >= ((((mp_digit)1)<<DIGIT_BIT) - kstep)) {
-         continue;
-      }
-
-      /* is this prime? */
-      for (x = 0; x < t; x++) {
-          mp_set(&b, ltm_prime_tab[x]);
-          if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
-             goto LBL_ERR;
-          }
-          if (res == MP_NO) {
-             break;
-          }
-      }
-
-      if (res == MP_YES) {
-         break;
-      }
-   }
-
-   err = MP_OKAY;
-LBL_ERR:
-   mp_clear(&b);
-   return err;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_prime_next_prime.c */
-
-/* Start: bn_mp_prime_rabin_miller_trials.c */
-#include <tommath.h>
-#ifdef BN_MP_PRIME_RABIN_MILLER_TRIALS_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-
-static const struct {
-   int k, t;
-} sizes[] = {
-{   128,    28 },
-{   256,    16 },
-{   384,    10 },
-{   512,     7 },
-{   640,     6 },
-{   768,     5 },
-{   896,     4 },
-{  1024,     4 }
-};
-
-/* returns # of RM trials required for a given bit size */
-int mp_prime_rabin_miller_trials(int size)
-{
-   int x;
-
-   for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
-       if (sizes[x].k == size) {
-          return sizes[x].t;
-       } else if (sizes[x].k > size) {
-          return (x == 0) ? sizes[0].t : sizes[x - 1].t;
-       }
-   }
-   return sizes[x-1].t + 1;
-}
-
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_prime_rabin_miller_trials.c */
-
-/* Start: bn_mp_prime_random_ex.c */
-#include <tommath.h>
-#ifdef BN_MP_PRIME_RANDOM_EX_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* makes a truly random prime of a given size (bits),
- *
- * Flags are as follows:
- *
- *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
- *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
- *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
- *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
- *
- * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
- * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
- * so it can be NULL
- *
- */
-
-/* This is possibly the mother of all prime generation functions, muahahahahaha! */
-int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
-{
-   unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
-   int res, err, bsize, maskOR_msb_offset;
-
-   /* sanity check the input */
-   if (size <= 1 || t <= 0) {
-      return MP_VAL;
-   }
-
-   /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
-   if (flags & LTM_PRIME_SAFE) {
-      flags |= LTM_PRIME_BBS;
-   }
-
-   /* calc the byte size */
-   bsize = (size>>3) + ((size&7)?1:0);
-
-   /* we need a buffer of bsize bytes */
-   tmp = OPT_CAST(unsigned char) XMALLOC(bsize);
-   if (tmp == NULL) {
-      return MP_MEM;
-   }
-
-   /* calc the maskAND value for the MSbyte*/
-   maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
-
-   /* calc the maskOR_msb */
-   maskOR_msb        = 0;
-   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
-   if (flags & LTM_PRIME_2MSB_ON) {
-      maskOR_msb       |= 0x80 >> ((9 - size) & 7);
-   }
-
-   /* get the maskOR_lsb */
-   maskOR_lsb         = 1;
-   if (flags & LTM_PRIME_BBS) {
-      maskOR_lsb     |= 3;
-   }
-
-   do {
-      /* read the bytes */
-      if (cb(tmp, bsize, dat) != bsize) {
-         err = MP_VAL;
-         goto error;
-      }
-
-      /* work over the MSbyte */
-      tmp[0]    &= maskAND;
-      tmp[0]    |= 1 << ((size - 1) & 7);
-
-      /* mix in the maskORs */
-      tmp[maskOR_msb_offset]   |= maskOR_msb;
-      tmp[bsize-1]             |= maskOR_lsb;
-
-      /* read it in */
-      if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY)     { goto error; }
-
-      /* is it prime? */
-      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
-      if (res == MP_NO) {
-         continue;
-      }
-
-      if (flags & LTM_PRIME_SAFE) {
-         /* see if (a-1)/2 is prime */
-         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY)                    { goto error; }
-         if ((err = mp_div_2(a, a)) != MP_OKAY)                       { goto error; }
-
-         /* is it prime? */
-         if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)        { goto error; }
-      }
-   } while (res == MP_NO);
-
-   if (flags & LTM_PRIME_SAFE) {
-      /* restore a to the original value */
-      if ((err = mp_mul_2(a, a)) != MP_OKAY)                          { goto error; }
-      if ((err = mp_add_d(a, 1, a)) != MP_OKAY)                       { goto error; }
-   }
-
-   err = MP_OKAY;
-error:
-   XFREE(tmp);
-   return err;
-}
-
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_prime_random_ex.c */
-
-/* Start: bn_mp_radix_size.c */
-#include <tommath.h>
-#ifdef BN_MP_RADIX_SIZE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* returns size of ASCII reprensentation */
-int mp_radix_size (mp_int * a, int radix, int *size)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-
-  *size = 0;
-
-  /* special case for binary */
-  if (radix == 2) {
-    *size = mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
-    return MP_OKAY;
-  }
-
-  /* make sure the radix is in range */
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  if (mp_iszero(a) == MP_YES) {
-    *size = 2;
-    return MP_OKAY;
-  }
-
-  /* digs is the digit count */
-  digs = 0;
-
-  /* if it's negative add one for the sign */
-  if (a->sign == MP_NEG) {
-    ++digs;
-  }
-
-  /* init a copy of the input */
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  /* force temp to positive */
-  t.sign = MP_ZPOS;
-
-  /* fetch out all of the digits */
-  while (mp_iszero (&t) == MP_NO) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-    ++digs;
-  }
-  mp_clear (&t);
-
-  /* return digs + 1, the 1 is for the NULL byte that would be required. */
-  *size = digs + 1;
-  return MP_OKAY;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_radix_size.c */
-
-/* Start: bn_mp_radix_smap.c */
-#include <tommath.h>
-#ifdef BN_MP_RADIX_SMAP_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* chars used in radix conversions */
-const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_radix_smap.c */
-
-/* Start: bn_mp_rand.c */
-#include <tommath.h>
-#ifdef BN_MP_RAND_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* makes a pseudo-random int of a given size */
-int
-mp_rand (mp_int * a, int digits)
-{
-  int     res;
-  mp_digit d;
-
-  mp_zero (a);
-  if (digits <= 0) {
-    return MP_OKAY;
-  }
-
-  /* first place a random non-zero digit */
-  do {
-    d = ((mp_digit) abs (rand ())) & MP_MASK;
-  } while (d == 0);
-
-  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
-    return res;
-  }
-
-  while (--digits > 0) {
-    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
-      return res;
-    }
-
-    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_rand.c */
-
-/* Start: bn_mp_read_radix.c */
-#include <tommath.h>
-#ifdef BN_MP_READ_RADIX_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* read a string [ASCII] in a given radix */
-int mp_read_radix (mp_int * a, const char *str, int radix)
-{
-  int     y, res, neg;
-  char    ch;
-
-  /* zero the digit bignum */
-  mp_zero(a);
-
-  /* make sure the radix is ok */
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  /* if the leading digit is a
-   * minus set the sign to negative.
-   */
-  if (*str == '-') {
-    ++str;
-    neg = MP_NEG;
-  } else {
-    neg = MP_ZPOS;
-  }
-
-  /* set the integer to the default of zero */
-  mp_zero (a);
-
-  /* process each digit of the string */
-  while (*str) {
-    /* if the radix < 36 the conversion is case insensitive
-     * this allows numbers like 1AB and 1ab to represent the same  value
-     * [e.g. in hex]
-     */
-    ch = (char) ((radix < 36) ? toupper ((int)*str) : *str);
-    for (y = 0; y < 64; y++) {
-      if (ch == mp_s_rmap[y]) {
-         break;
-      }
-    }
-
-    /* if the char was found in the map
-     * and is less than the given radix add it
-     * to the number, otherwise exit the loop.
-     */
-    if (y < radix) {
-      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
-         return res;
-      }
-      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
-         return res;
-      }
-    } else {
-      break;
-    }
-    ++str;
-  }
-
-  /* set the sign only if a != 0 */
-  if (mp_iszero(a) != 1) {
-     a->sign = neg;
-  }
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_read_radix.c */
-
-/* Start: bn_mp_read_signed_bin.c */
-#include <tommath.h>
-#ifdef BN_MP_READ_SIGNED_BIN_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* read signed bin, big endian, first byte is 0==positive or 1==negative */
-int mp_read_signed_bin (mp_int * a, const unsigned char *b, int c)
-{
-  int     res;
-
-  /* read magnitude */
-  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
-    return res;
-  }
-
-  /* first byte is 0 for positive, non-zero for negative */
-  if (b[0] == 0) {
-     a->sign = MP_ZPOS;
-  } else {
-     a->sign = MP_NEG;
-  }
-
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_read_signed_bin.c */
-
-/* Start: bn_mp_read_unsigned_bin.c */
-#include <tommath.h>
-#ifdef BN_MP_READ_UNSIGNED_BIN_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* reads a unsigned char array, assumes the msb is stored first [big endian] */
-int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
-{
-  int     res;
-
-  /* make sure there are at least two digits */
-  if (a->alloc < 2) {
-     if ((res = mp_grow(a, 2)) != MP_OKAY) {
-        return res;
-     }
-  }
-
-  /* zero the int */
-  mp_zero (a);
-
-  /* read the bytes in */
-  while (c-- > 0) {
-    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
-      return res;
-    }
-
-#ifndef MP_8BIT
-      a->dp[0] |= *b++;
-      a->used += 1;
-#else
-      a->dp[0] = (*b & MP_MASK);
-      a->dp[1] |= ((*b++ >> 7U) & 1);
-      a->used += 2;
-#endif
-  }
-  mp_clamp (a);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_read_unsigned_bin.c */
-
-/* Start: bn_mp_reduce.c */
-#include <tommath.h>
-#ifdef BN_MP_REDUCE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* reduces x mod m, assumes 0 < x < m**2, mu is
- * precomputed via mp_reduce_setup.
- * From HAC pp.604 Algorithm 14.42
- */
-int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
-{
-  mp_int  q;
-  int     res, um = m->used;
-
-  /* q = x */
-  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
-    return res;
-  }
-
-  /* q1 = x / b**(k-1)  */
-  mp_rshd (&q, um - 1);
-
-  /* according to HAC this optimization is ok */
-  if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
-    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
-      goto CLEANUP;
-    }
-  } else {
-#ifdef BN_S_MP_MUL_HIGH_DIGS_C
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
-      goto CLEANUP;
-    }
-#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
-      goto CLEANUP;
-    }
-#else
-    {
-      res = MP_VAL;
-      goto CLEANUP;
-    }
-#endif
-  }
-
-  /* q3 = q2 / b**(k+1) */
-  mp_rshd (&q, um + 1);
-
-  /* x = x mod b**(k+1), quick (no division) */
-  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
-    goto CLEANUP;
-  }
-
-  /* q = q * m mod b**(k+1), quick (no division) */
-  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
-    goto CLEANUP;
-  }
-
-  /* x = x - q */
-  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
-    goto CLEANUP;
-  }
-
-  /* If x < 0, add b**(k+1) to it */
-  if (mp_cmp_d (x, 0) == MP_LT) {
-    mp_set (&q, 1);
-    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
-      goto CLEANUP;
-    if ((res = mp_add (x, &q, x)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  /* Back off if it's too big */
-  while (mp_cmp (x, m) != MP_LT) {
-    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
-      goto CLEANUP;
-    }
-  }
-
-CLEANUP:
-  mp_clear (&q);
-
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_reduce.c */
-
-/* Start: bn_mp_reduce_2k.c */
-#include <tommath.h>
-#ifdef BN_MP_REDUCE_2K_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* reduces a modulo n where n is of the form 2**p - d */
-int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
-{
-   mp_int q;
-   int    p, res;
-
-   if ((res = mp_init(&q)) != MP_OKAY) {
-      return res;
-   }
-
-   p = mp_count_bits(n);
-top:
-   /* q = a/2**p, a = a mod 2**p */
-   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
-      goto ERR;
-   }
-
-   if (d != 1) {
-      /* q = q * d */
-      if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
-         goto ERR;
-      }
-   }
-
-   /* a = a + q */
-   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
-      goto ERR;
-   }
-
-   if (mp_cmp_mag(a, n) != MP_LT) {
-      s_mp_sub(a, n, a);
-      goto top;
-   }
-
-ERR:
-   mp_clear(&q);
-   return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_reduce_2k.c */
-
-/* Start: bn_mp_reduce_2k_l.c */
-#include <tommath.h>
-#ifdef BN_MP_REDUCE_2K_L_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* reduces a modulo n where n is of the form 2**p - d
-   This differs from reduce_2k since "d" can be larger
-   than a single digit.
-*/
-int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
-{
-   mp_int q;
-   int    p, res;
-
-   if ((res = mp_init(&q)) != MP_OKAY) {
-      return res;
-   }
-
-   p = mp_count_bits(n);
-top:
-   /* q = a/2**p, a = a mod 2**p */
-   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
-      goto ERR;
-   }
-
-   /* q = q * d */
-   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
-      goto ERR;
-   }
-
-   /* a = a + q */
-   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
-      goto ERR;
-   }
-
-   if (mp_cmp_mag(a, n) != MP_LT) {
-      s_mp_sub(a, n, a);
-      goto top;
-   }
-
-ERR:
-   mp_clear(&q);
-   return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_reduce_2k_l.c */
-
-/* Start: bn_mp_reduce_2k_setup.c */
-#include <tommath.h>
-#ifdef BN_MP_REDUCE_2K_SETUP_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* determines the setup value */
-int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
-{
-   int res, p;
-   mp_int tmp;
-
-   if ((res = mp_init(&tmp)) != MP_OKAY) {
-      return res;
-   }
-
-   p = mp_count_bits(a);
-   if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
-      mp_clear(&tmp);
-      return res;
-   }
-
-   if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
-      mp_clear(&tmp);
-      return res;
-   }
-
-   *d = tmp.dp[0];
-   mp_clear(&tmp);
-   return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_reduce_2k_setup.c */
-
-/* Start: bn_mp_reduce_2k_setup_l.c */
-#include <tommath.h>
-#ifdef BN_MP_REDUCE_2K_SETUP_L_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* determines the setup value */
-int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
-{
-   int    res;
-   mp_int tmp;
-
-   if ((res = mp_init(&tmp)) != MP_OKAY) {
-      return res;
-   }
-
-   if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
-      goto ERR;
-   }
-
-   if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
-      goto ERR;
-   }
-
-ERR:
-   mp_clear(&tmp);
-   return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_reduce_2k_setup_l.c */
-
-/* Start: bn_mp_reduce_is_2k.c */
-#include <tommath.h>
-#ifdef BN_MP_REDUCE_IS_2K_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* determines if mp_reduce_2k can be used */
-int mp_reduce_is_2k(mp_int *a)
-{
-   int ix, iy, iw;
-   mp_digit iz;
-
-   if (a->used == 0) {
-      return MP_NO;
-   } else if (a->used == 1) {
-      return MP_YES;
-   } else if (a->used > 1) {
-      iy = mp_count_bits(a);
-      iz = 1;
-      iw = 1;
-
-      /* Test every bit from the second digit up, must be 1 */
-      for (ix = DIGIT_BIT; ix < iy; ix++) {
-          if ((a->dp[iw] & iz) == 0) {
-             return MP_NO;
-          }
-          iz <<= 1;
-          if (iz > (mp_digit)MP_MASK) {
-             ++iw;
-             iz = 1;
-          }
-      }
-   }
-   return MP_YES;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_reduce_is_2k.c */
-
-/* Start: bn_mp_reduce_is_2k_l.c */
-#include <tommath.h>
-#ifdef BN_MP_REDUCE_IS_2K_L_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* determines if reduce_2k_l can be used */
-int mp_reduce_is_2k_l(mp_int *a)
-{
-   int ix, iy;
-
-   if (a->used == 0) {
-      return MP_NO;
-   } else if (a->used == 1) {
-      return MP_YES;
-   } else if (a->used > 1) {
-      /* if more than half of the digits are -1 we're sold */
-      for (iy = ix = 0; ix < a->used; ix++) {
-          if (a->dp[ix] == MP_MASK) {
-              ++iy;
-          }
-      }
-      return (iy >= (a->used/2)) ? MP_YES : MP_NO;
-
-   }
-   return MP_NO;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_reduce_is_2k_l.c */
-
-/* Start: bn_mp_reduce_setup.c */
-#include <tommath.h>
-#ifdef BN_MP_REDUCE_SETUP_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* pre-calculate the value required for Barrett reduction
- * For a given modulus "b" it calulates the value required in "a"
- */
-int mp_reduce_setup (mp_int * a, mp_int * b)
-{
-  int     res;
-
-  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
-    return res;
-  }
-  return mp_div (a, b, a, NULL);
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_reduce_setup.c */
-
-/* Start: bn_mp_rshd.c */
-#include <tommath.h>
-#ifdef BN_MP_RSHD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* shift right a certain amount of digits */
-void mp_rshd (mp_int * a, int b)
-{
-  int     x;
-
-  /* if b <= 0 then ignore it */
-  if (b <= 0) {
-    return;
-  }
-
-  /* if b > used then simply zero it and return */
-  if (a->used <= b) {
-    mp_zero (a);
-    return;
-  }
-
-  {
-    register mp_digit *bottom, *top;
-
-    /* shift the digits down */
-
-    /* bottom */
-    bottom = a->dp;
-
-    /* top [offset into digits] */
-    top = a->dp + b;
-
-    /* this is implemented as a sliding window where
-     * the window is b-digits long and digits from
-     * the top of the window are copied to the bottom
-     *
-     * e.g.
-
-     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
-                 /\                   |      ---->
-                  \-------------------/      ---->
-     */
-    for (x = 0; x < (a->used - b); x++) {
-      *bottom++ = *top++;
-    }
-
-    /* zero the top digits */
-    for (; x < a->used; x++) {
-      *bottom++ = 0;
-    }
-  }
-
-  /* remove excess digits */
-  a->used -= b;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_rshd.c */
-
-/* Start: bn_mp_set.c */
-#include <tommath.h>
-#ifdef BN_MP_SET_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* set to a digit */
-void mp_set (mp_int * a, mp_digit b)
-{
-  mp_zero (a);
-  a->dp[0] = b & MP_MASK;
-  a->used  = (a->dp[0] != 0) ? 1 : 0;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_set.c */
-
-/* Start: bn_mp_set_int.c */
-#include <tommath.h>
-#ifdef BN_MP_SET_INT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* set a 32-bit const */
-int mp_set_int (mp_int * a, unsigned long b)
-{
-  int     x, res;
-
-  mp_zero (a);
-
-  /* set four bits at a time */
-  for (x = 0; x < 8; x++) {
-    /* shift the number up four bits */
-    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
-      return res;
-    }
-
-    /* OR in the top four bits of the source */
-    a->dp[0] |= (b >> 28) & 15;
-
-    /* shift the source up to the next four bits */
-    b <<= 4;
-
-    /* ensure that digits are not clamped off */
-    a->used += 1;
-  }
-  mp_clamp (a);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_set_int.c */
-
-/* Start: bn_mp_shrink.c */
-#include <tommath.h>
-#ifdef BN_MP_SHRINK_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* shrink a bignum */
-int mp_shrink (mp_int * a)
-{
-  mp_digit *tmp;
-  int used = 1;
-
-  if(a->used > 0)
-    used = a->used;
-
-  if (a->alloc != used) {
-    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * used)) == NULL) {
-      return MP_MEM;
-    }
-    a->dp    = tmp;
-    a->alloc = used;
-  }
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_shrink.c */
-
-/* Start: bn_mp_signed_bin_size.c */
-#include <tommath.h>
-#ifdef BN_MP_SIGNED_BIN_SIZE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* get the size for an signed equivalent */
-int mp_signed_bin_size (mp_int * a)
-{
-  return 1 + mp_unsigned_bin_size (a);
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_signed_bin_size.c */
-
-/* Start: bn_mp_sqr.c */
-#include <tommath.h>
-#ifdef BN_MP_SQR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* computes b = a*a */
-int
-mp_sqr (mp_int * a, mp_int * b)
-{
-  int     res;
-
-#ifdef BN_MP_TOOM_SQR_C
-  /* use Toom-Cook? */
-  if (a->used >= TOOM_SQR_CUTOFF) {
-    res = mp_toom_sqr(a, b);
-  /* Karatsuba? */
-  } else
-#endif
-#ifdef BN_MP_KARATSUBA_SQR_C
-if (a->used >= KARATSUBA_SQR_CUTOFF) {
-    res = mp_karatsuba_sqr (a, b);
-  } else
-#endif
-  {
-#ifdef BN_FAST_S_MP_SQR_C
-    /* can we use the fast comba multiplier? */
-    if ((a->used * 2 + 1) < MP_WARRAY &&
-         a->used <
-         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
-      res = fast_s_mp_sqr (a, b);
-    } else
-#endif
-#ifdef BN_S_MP_SQR_C
-      res = s_mp_sqr (a, b);
-#else
-      res = MP_VAL;
-#endif
-  }
-  b->sign = MP_ZPOS;
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_sqr.c */
-
-/* Start: bn_mp_sqrmod.c */
-#include <tommath.h>
-#ifdef BN_MP_SQRMOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* c = a * a (mod b) */
-int
-mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res;
-  mp_int  t;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_sqr (a, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, b, c);
-  mp_clear (&t);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_sqrmod.c */
-
-/* Start: bn_mp_sqrt.c */
-#include <tommath.h>
-
-#ifdef BN_MP_SQRT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* this function is less generic than mp_n_root, simpler and faster */
-int mp_sqrt(mp_int *arg, mp_int *ret)
-{
-  int res;
-  mp_int t1,t2;
-
-  /* must be positive */
-  if (arg->sign == MP_NEG) {
-    return MP_VAL;
-  }
-
-  /* easy out */
-  if (mp_iszero(arg) == MP_YES) {
-    mp_zero(ret);
-    return MP_OKAY;
-  }
-
-  if ((res = mp_init_copy(&t1, arg)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init(&t2)) != MP_OKAY) {
-    goto E2;
-  }
-
-  /* First approx. (not very bad for large arg) */
-  mp_rshd (&t1,t1.used/2);
-
-  /* t1 > 0  */
-  if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
-    goto E1;
-  }
-  if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
-    goto E1;
-  }
-  if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
-    goto E1;
-  }
-  /* And now t1 > sqrt(arg) */
-  do {
-    if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
-      goto E1;
-    }
-    if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
-      goto E1;
-    }
-    if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
-      goto E1;
-    }
-    /* t1 >= sqrt(arg) >= t2 at this point */
-  } while (mp_cmp_mag(&t1,&t2) == MP_GT);
-
-  mp_exch(&t1,ret);
-
-E1: mp_clear(&t2);
-E2: mp_clear(&t1);
-  return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_sqrt.c */
-
-/* Start: bn_mp_sub.c */
-#include <tommath.h>
-#ifdef BN_MP_SUB_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* high level subtraction (handles signs) */
-int
-mp_sub (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     sa, sb, res;
-
-  sa = a->sign;
-  sb = b->sign;
-
-  if (sa != sb) {
-    /* subtract a negative from a positive, OR */
-    /* subtract a positive from a negative. */
-    /* In either case, ADD their magnitudes, */
-    /* and use the sign of the first number. */
-    c->sign = sa;
-    res = s_mp_add (a, b, c);
-  } else {
-    /* subtract a positive from a positive, OR */
-    /* subtract a negative from a negative. */
-    /* First, take the difference between their */
-    /* magnitudes, then... */
-    if (mp_cmp_mag (a, b) != MP_LT) {
-      /* Copy the sign from the first */
-      c->sign = sa;
-      /* The first has a larger or equal magnitude */
-      res = s_mp_sub (a, b, c);
-    } else {
-      /* The result has the *opposite* sign from */
-      /* the first number. */
-      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-      /* The second has a larger magnitude */
-      res = s_mp_sub (b, a, c);
-    }
-  }
-  return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_sub.c */
-
-/* Start: bn_mp_sub_d.c */
-#include <tommath.h>
-#ifdef BN_MP_SUB_D_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* single digit subtraction */
-int
-mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_digit *tmpa, *tmpc, mu;
-  int       res, ix, oldused;
-
-  /* grow c as required */
-  if (c->alloc < a->used + 1) {
-     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
-        return res;
-     }
-  }
-
-  /* if a is negative just do an unsigned
-   * addition [with fudged signs]
-   */
-  if (a->sign == MP_NEG) {
-     a->sign = MP_ZPOS;
-     res     = mp_add_d(a, b, c);
-     a->sign = c->sign = MP_NEG;
-
-     /* clamp */
-     mp_clamp(c);
-
-     return res;
-  }
-
-  /* setup regs */
-  oldused = c->used;
-  tmpa    = a->dp;
-  tmpc    = c->dp;
-
-  /* if a <= b simply fix the single digit */
-  if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
-     if (a->used == 1) {
-        *tmpc++ = b - *tmpa;
-     } else {
-        *tmpc++ = b;
-     }
-     ix      = 1;
-
-     /* negative/1digit */
-     c->sign = MP_NEG;
-     c->used = 1;
-  } else {
-     /* positive/size */
-     c->sign = MP_ZPOS;
-     c->used = a->used;
-
-     /* subtract first digit */
-     *tmpc    = *tmpa++ - b;
-     mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
-     *tmpc++ &= MP_MASK;
-
-     /* handle rest of the digits */
-     for (ix = 1; ix < a->used; ix++) {
-        *tmpc    = *tmpa++ - mu;
-        mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
-        *tmpc++ &= MP_MASK;
-     }
-  }
-
-  /* zero excess digits */
-  while (ix++ < oldused) {
-     *tmpc++ = 0;
-  }
-  mp_clamp(c);
-  return MP_OKAY;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_sub_d.c */
-
-/* Start: bn_mp_submod.c */
-#include <tommath.h>
-#ifdef BN_MP_SUBMOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* d = a - b (mod c) */
-int
-mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_submod.c */
-
-/* Start: bn_mp_to_signed_bin.c */
-#include <tommath.h>
-#ifdef BN_MP_TO_SIGNED_BIN_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* store in signed [big endian] format */
-int mp_to_signed_bin (mp_int * a, unsigned char *b)
-{
-  int     res;
-
-  if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
-    return res;
-  }
-  b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_to_signed_bin.c */
-
-/* Start: bn_mp_to_signed_bin_n.c */
-#include <tommath.h>
-#ifdef BN_MP_TO_SIGNED_BIN_N_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* store in signed [big endian] format */
-int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
-{
-   if (*outlen < (unsigned long)mp_signed_bin_size(a)) {
-      return MP_VAL;
-   }
-   *outlen = mp_signed_bin_size(a);
-   return mp_to_signed_bin(a, b);
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_to_signed_bin_n.c */
-
-/* Start: bn_mp_to_unsigned_bin.c */
-#include <tommath.h>
-#ifdef BN_MP_TO_UNSIGNED_BIN_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* store in unsigned [big endian] format */
-int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
-{
-  int     x, res;
-  mp_int  t;
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  x = 0;
-  while (mp_iszero (&t) == 0) {
-#ifndef MP_8BIT
-      b[x++] = (unsigned char) (t.dp[0] & 255);
-#else
-      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
-#endif
-    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-  }
-  bn_reverse (b, x);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_to_unsigned_bin.c */
-
-/* Start: bn_mp_to_unsigned_bin_n.c */
-#include <tommath.h>
-#ifdef BN_MP_TO_UNSIGNED_BIN_N_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* store in unsigned [big endian] format */
-int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
-{
-   if (*outlen < (unsigned long)mp_unsigned_bin_size(a)) {
-      return MP_VAL;
-   }
-   *outlen = mp_unsigned_bin_size(a);
-   return mp_to_unsigned_bin(a, b);
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_to_unsigned_bin_n.c */
-
-/* Start: bn_mp_toom_mul.c */
-#include <tommath.h>
-#ifdef BN_MP_TOOM_MUL_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* multiplication using the Toom-Cook 3-way algorithm
- *
- * Much more complicated than Karatsuba but has a lower
- * asymptotic running time of O(N**1.464).  This algorithm is
- * only particularly useful on VERY large inputs
- * (we're talking 1000s of digits here...).
-*/
-int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
-{
-    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
-    int res, B;
-
-    /* init temps */
-    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4,
-                             &a0, &a1, &a2, &b0, &b1,
-                             &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
-       return res;
-    }
-
-    /* B */
-    B = MIN(a->used, b->used) / 3;
-
-    /* a = a2 * B**2 + a1 * B + a0 */
-    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&a1, B);
-    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
-
-    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&a2, B*2);
-
-    /* b = b2 * B**2 + b1 * B + b0 */
-    if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_copy(b, &b1)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&b1, B);
-    mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
-
-    if ((res = mp_copy(b, &b2)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&b2, B*2);
-
-    /* w0 = a0*b0 */
-    if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    /* w4 = a2 * b2 */
-    if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
-    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
-    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
-       goto ERR;
-    }
-
-
-    /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
-    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    /* now solve the matrix
-
-       0  0  0  0  1
-       1  2  4  8  16
-       1  1  1  1  1
-       16 8  4  2  1
-       1  0  0  0  0
-
-       using 12 subtractions, 4 shifts,
-              2 small divisions and 1 small multiplication
-     */
-
-     /* r1 - r4 */
-     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r0 */
-     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1/2 */
-     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3/2 */
-     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r2 - r0 - r4 */
-     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - r2 */
-     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r2 */
-     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - 8r0 */
-     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - 8r4 */
-     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* 3r2 - r1 - r3 */
-     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - r2 */
-     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r2 */
-     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1/3 */
-     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3/3 */
-     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
-        goto ERR;
-     }
-
-     /* at this point shift W[n] by B*n */
-     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
-        goto ERR;
-     }
-
-     if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
-        goto ERR;
-     }
-
-ERR:
-     mp_clear_multi(&w0, &w1, &w2, &w3, &w4,
-                    &a0, &a1, &a2, &b0, &b1,
-                    &b2, &tmp1, &tmp2, NULL);
-     return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_toom_mul.c */
-
-/* Start: bn_mp_toom_sqr.c */
-#include <tommath.h>
-#ifdef BN_MP_TOOM_SQR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* squaring using Toom-Cook 3-way algorithm */
-int
-mp_toom_sqr(mp_int *a, mp_int *b)
-{
-    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
-    int res, B;
-
-    /* init temps */
-    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
-       return res;
-    }
-
-    /* B */
-    B = a->used / 3;
-
-    /* a = a2 * B**2 + a1 * B + a0 */
-    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&a1, B);
-    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
-
-    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&a2, B*2);
-
-    /* w0 = a0*a0 */
-    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    /* w4 = a2 * a2 */
-    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    /* w1 = (a2 + 2(a1 + 2a0))**2 */
-    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    /* w3 = (a0 + 2(a1 + 2a2))**2 */
-    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
-       goto ERR;
-    }
-
-
-    /* w2 = (a2 + a1 + a0)**2 */
-    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    /* now solve the matrix
-
-       0  0  0  0  1
-       1  2  4  8  16
-       1  1  1  1  1
-       16 8  4  2  1
-       1  0  0  0  0
-
-       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
-     */
-
-     /* r1 - r4 */
-     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r0 */
-     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1/2 */
-     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3/2 */
-     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r2 - r0 - r4 */
-     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - r2 */
-     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r2 */
-     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - 8r0 */
-     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - 8r4 */
-     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* 3r2 - r1 - r3 */
-     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - r2 */
-     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r2 */
-     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1/3 */
-     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3/3 */
-     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
-        goto ERR;
-     }
-
-     /* at this point shift W[n] by B*n */
-     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
-        goto ERR;
-     }
-
-     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
-        goto ERR;
-     }
-
-ERR:
-     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
-     return res;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_toom_sqr.c */
-
-/* Start: bn_mp_toradix.c */
-#include <tommath.h>
-#ifdef BN_MP_TORADIX_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* stores a bignum as a ASCII string in a given radix (2..64) */
-int mp_toradix (mp_int * a, char *str, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-  char   *_s = str;
-
-  /* check range of the radix */
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  /* quick out if its zero */
-  if (mp_iszero(a) == 1) {
-     *str++ = '0';
-     *str = '\0';
-     return MP_OKAY;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  /* if it is negative output a - */
-  if (t.sign == MP_NEG) {
-    ++_s;
-    *str++ = '-';
-    t.sign = MP_ZPOS;
-  }
-
-  digs = 0;
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-    *str++ = mp_s_rmap[d];
-    ++digs;
-  }
-
-  /* reverse the digits of the string.  In this case _s points
-   * to the first digit [exluding the sign] of the number]
-   */
-  bn_reverse ((unsigned char *)_s, digs);
-
-  /* append a NULL so the string is properly terminated */
-  *str = '\0';
-
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_toradix.c */
-
-/* Start: bn_mp_toradix_n.c */
-#include <tommath.h>
-#ifdef BN_MP_TORADIX_N_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* stores a bignum as a ASCII string in a given radix (2..64)
- *
- * Stores upto maxlen-1 chars and always a NULL byte
- */
-int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-  char   *_s = str;
-
-  /* check range of the maxlen, radix */
-  if (maxlen < 2 || radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  /* quick out if its zero */
-  if (mp_iszero(a) == MP_YES) {
-     *str++ = '0';
-     *str = '\0';
-     return MP_OKAY;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  /* if it is negative output a - */
-  if (t.sign == MP_NEG) {
-    /* we have to reverse our digits later... but not the - sign!! */
-    ++_s;
-
-    /* store the flag and mark the number as positive */
-    *str++ = '-';
-    t.sign = MP_ZPOS;
-
-    /* subtract a char */
-    --maxlen;
-  }
-
-  digs = 0;
-  while (mp_iszero (&t) == 0) {
-    if (--maxlen < 1) {
-       /* no more room */
-       break;
-    }
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-    *str++ = mp_s_rmap[d];
-    ++digs;
-  }
-
-  /* reverse the digits of the string.  In this case _s points
-   * to the first digit [exluding the sign] of the number
-   */
-  bn_reverse ((unsigned char *)_s, digs);
-
-  /* append a NULL so the string is properly terminated */
-  *str = '\0';
-
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_toradix_n.c */
-
-/* Start: bn_mp_unsigned_bin_size.c */
-#include <tommath.h>
-#ifdef BN_MP_UNSIGNED_BIN_SIZE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* get the size for an unsigned equivalent */
-int mp_unsigned_bin_size (mp_int * a)
-{
-  int     size = mp_count_bits (a);
-  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_unsigned_bin_size.c */
-
-/* Start: bn_mp_xor.c */
-#include <tommath.h>
-#ifdef BN_MP_XOR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* XOR two ints together */
-int
-mp_xor (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-     t.dp[ix] ^= x->dp[ix];
-  }
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_xor.c */
-
-/* Start: bn_mp_zero.c */
-#include <tommath.h>
-#ifdef BN_MP_ZERO_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* set to zero */
-void mp_zero (mp_int * a)
-{
-  int       n;
-  mp_digit *tmp;
-
-  a->sign = MP_ZPOS;
-  a->used = 0;
-
-  tmp = a->dp;
-  for (n = 0; n < a->alloc; n++) {
-     *tmp++ = 0;
-  }
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_mp_zero.c */
-
-/* Start: bn_prime_tab.c */
-#include <tommath.h>
-#ifdef BN_PRIME_TAB_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-const mp_digit ltm_prime_tab[] = {
-  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
-  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
-  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
-#ifndef MP_8BIT
-  0x0083,
-  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
-  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
-  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
-  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
-
-  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
-  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
-  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
-  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
-  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
-  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
-  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
-  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
-
-  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
-  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
-  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
-  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
-  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
-  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
-  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
-  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
-
-  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
-  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
-  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
-  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
-  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
-  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
-  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
-  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
-#endif
-};
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_prime_tab.c */
-
-/* Start: bn_reverse.c */
-#include <tommath.h>
-#ifdef BN_REVERSE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* reverse an array, used for radix code */
-void
-bn_reverse (unsigned char *s, int len)
-{
-  int     ix, iy;
-  unsigned char t;
-
-  ix = 0;
-  iy = len - 1;
-  while (ix < iy) {
-    t     = s[ix];
-    s[ix] = s[iy];
-    s[iy] = t;
-    ++ix;
-    --iy;
-  }
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_reverse.c */
-
-/* Start: bn_s_mp_add.c */
-#include <tommath.h>
-#ifdef BN_S_MP_ADD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* low level addition, based on HAC pp.594, Algorithm 14.7 */
-int
-s_mp_add (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int *x;
-  int     olduse, res, min, max;
-
-  /* find sizes, we let |a| <= |b| which means we have to sort
-   * them.  "x" will point to the input with the most digits
-   */
-  if (a->used > b->used) {
-    min = b->used;
-    max = a->used;
-    x = a;
-  } else {
-    min = a->used;
-    max = b->used;
-    x = b;
-  }
-
-  /* init result */
-  if (c->alloc < max + 1) {
-    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* get old used digit count and set new one */
-  olduse = c->used;
-  c->used = max + 1;
-
-  {
-    register mp_digit u, *tmpa, *tmpb, *tmpc;
-    register int i;
-
-    /* alias for digit pointers */
-
-    /* first input */
-    tmpa = a->dp;
-
-    /* second input */
-    tmpb = b->dp;
-
-    /* destination */
-    tmpc = c->dp;
-
-    /* zero the carry */
-    u = 0;
-    for (i = 0; i < min; i++) {
-      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-      *tmpc = *tmpa++ + *tmpb++ + u;
-
-      /* U = carry bit of T[i] */
-      u = *tmpc >> ((mp_digit)DIGIT_BIT);
-
-      /* take away carry bit from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* now copy higher words if any, that is in A+B
-     * if A or B has more digits add those in
-     */
-    if (min != max) {
-      for (; i < max; i++) {
-        /* T[i] = X[i] + U */
-        *tmpc = x->dp[i] + u;
-
-        /* U = carry bit of T[i] */
-        u = *tmpc >> ((mp_digit)DIGIT_BIT);
-
-        /* take away carry bit from T[i] */
-        *tmpc++ &= MP_MASK;
-      }
-    }
-
-    /* add carry */
-    *tmpc++ = u;
-
-    /* clear digits above oldused */
-    for (i = c->used; i < olduse; i++) {
-      *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_s_mp_add.c */
-
-/* Start: bn_s_mp_exptmod.c */
-#include <tommath.h>
-#ifdef BN_S_MP_EXPTMOD_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-#ifdef MP_LOW_MEM
-   #define TAB_SIZE 32
-#else
-   #define TAB_SIZE 256
-#endif
-
-int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
-{
-  mp_int  M[TAB_SIZE], res, mu;
-  mp_digit buf;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-  int (*redux)(mp_int*,mp_int*,mp_int*);
-
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-#ifdef MP_LOW_MEM
-    if (winsize > 5) {
-       winsize = 5;
-    }
-#endif
-
-  /* init M array */
-  /* init first cell */
-  if ((err = mp_init(&M[1])) != MP_OKAY) {
-     return err;
-  }
-
-  /* now init the second half of the array */
-  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
-    if ((err = mp_init(&M[x])) != MP_OKAY) {
-      for (y = 1<<(winsize-1); y < x; y++) {
-        mp_clear (&M[y]);
-      }
-      mp_clear(&M[1]);
-      return err;
-    }
-  }
-
-  /* create mu, used for Barrett reduction */
-  if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto LBL_M;
-  }
-
-  if (redmode == 0) {
-     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-        goto LBL_MU;
-     }
-     redux = mp_reduce;
-  } else {
-     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
-        goto LBL_MU;
-     }
-     redux = mp_reduce_2k_l;
-  }
-
-  /* create M table
-   *
-   * The M table contains powers of the base,
-   * e.g. M[x] = G**x mod P
-   *
-   * The first half of the table is not
-   * computed though accept for M[0] and M[1]
-   */
-  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto LBL_MU;
-  }
-
-  /* compute the value at M[1<<(winsize-1)] by squaring
-   * M[1] (winsize-1) times
-   */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto LBL_MU;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    /* square it */
-    if ((err = mp_sqr (&M[1 << (winsize - 1)],
-                       &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto LBL_MU;
-    }
-
-    /* reduce modulo P */
-    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto LBL_MU;
-    }
-  }
-
-  /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-   * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-   */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto LBL_MU;
-    }
-    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
-      goto LBL_MU;
-    }
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto LBL_MU;
-  }
-  mp_set (&res, 1);
-
-  /* set initial mode and bit cnt */
-  mode   = 0;
-  bitcnt = 1;
-  buf    = 0;
-  digidx = X->used - 1;
-  bitcpy = 0;
-  bitbuf = 0;
-
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      /* if digidx == -1 we are out of digits */
-      if (digidx == -1) {
-        break;
-      }
-      /* read next digit and reset the bitcnt */
-      buf    = X->dp[digidx--];
-      bitcnt = (int) DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-    buf <<= (mp_digit)1;
-
-    /* if the bit is zero and mode == 0 then we ignore it
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0) {
-      continue;
-    }
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode    = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto LBL_RES;
-        }
-        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
-          goto LBL_RES;
-        }
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-
-      /* empty window and reset */
-      bitcpy = 0;
-      bitbuf = 0;
-      mode   = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
-        goto LBL_RES;
-      }
-
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-        /* then multiply */
-        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto LBL_RES;
-        }
-        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
-          goto LBL_RES;
-        }
-      }
-    }
-  }
-
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-LBL_RES:mp_clear (&res);
-LBL_MU:mp_clear (&mu);
-LBL_M:
-  mp_clear(&M[1]);
-  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_s_mp_exptmod.c */
-
-/* Start: bn_s_mp_mul_digs.c */
-#include <tommath.h>
-#ifdef BN_S_MP_MUL_DIGS_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* multiplies |a| * |b| and only computes upto digs digits of result
- * HAC pp. 595, Algorithm 14.12  Modified so you can control how
- * many digits of output are created.
- */
-int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  mp_int  t;
-  int     res, pa, pb, ix, iy;
-  mp_digit u;
-  mp_word r;
-  mp_digit tmpx, *tmpt, *tmpy;
-
-  /* can we use the fast multiplier? */
-  if (((digs) < MP_WARRAY) &&
-      MIN (a->used, b->used) <
-          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_s_mp_mul_digs (a, b, c, digs);
-  }
-
-  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
-    return res;
-  }
-  t.used = digs;
-
-  /* compute the digits of the product directly */
-  pa = a->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* set the carry to zero */
-    u = 0;
-
-    /* limit ourselves to making digs digits of output */
-    pb = MIN (b->used, digs - ix);
-
-    /* setup some aliases */
-    /* copy of the digit from a used within the nested loop */
-    tmpx = a->dp[ix];
-
-    /* an alias for the destination shifted ix places */
-    tmpt = t.dp + ix;
-
-    /* an alias for the digits of b */
-    tmpy = b->dp;
-
-    /* compute the columns of the output and propagate the carry */
-    for (iy = 0; iy < pb; iy++) {
-      /* compute the column as a mp_word */
-      r       = ((mp_word)*tmpt) +
-                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
-                ((mp_word) u);
-
-      /* the new column is the lower part of the result */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* get the carry word from the result */
-      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    /* set carry if it is placed below digs */
-    if (ix + iy < digs) {
-      *tmpt = u;
-    }
-  }
-
-  mp_clamp (&t);
-  mp_exch (&t, c);
-
-  mp_clear (&t);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_s_mp_mul_digs.c */
-
-/* Start: bn_s_mp_mul_high_digs.c */
-#include <tommath.h>
-#ifdef BN_S_MP_MUL_HIGH_DIGS_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* multiplies |a| * |b| and does not compute the lower digs digits
- * [meant to get the higher part of the product]
- */
-int
-s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  mp_int  t;
-  int     res, pa, pb, ix, iy;
-  mp_digit u;
-  mp_word r;
-  mp_digit tmpx, *tmpt, *tmpy;
-
-  /* can we use the fast multiplier? */
-#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
-  if (((a->used + b->used + 1) < MP_WARRAY)
-      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_s_mp_mul_high_digs (a, b, c, digs);
-  }
-#endif
-
-  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
-    return res;
-  }
-  t.used = a->used + b->used + 1;
-
-  pa = a->used;
-  pb = b->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* clear the carry */
-    u = 0;
-
-    /* left hand side of A[ix] * B[iy] */
-    tmpx = a->dp[ix];
-
-    /* alias to the address of where the digits will be stored */
-    tmpt = &(t.dp[digs]);
-
-    /* alias for where to read the right hand side from */
-    tmpy = b->dp + (digs - ix);
-
-    for (iy = digs - ix; iy < pb; iy++) {
-      /* calculate the double precision result */
-      r       = ((mp_word)*tmpt) +
-                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
-                ((mp_word) u);
-
-      /* get the lower part */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* carry the carry */
-      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    *tmpt = u;
-  }
-  mp_clamp (&t);
-  mp_exch (&t, c);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_s_mp_mul_high_digs.c */
-
-/* Start: bn_s_mp_sqr.c */
-#include <tommath.h>
-#ifdef BN_S_MP_SQR_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int s_mp_sqr (mp_int * a, mp_int * b)
-{
-  mp_int  t;
-  int     res, ix, iy, pa;
-  mp_word r;
-  mp_digit u, tmpx, *tmpt;
-
-  pa = a->used;
-  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
-    return res;
-  }
-
-  /* default used is maximum possible size */
-  t.used = 2*pa + 1;
-
-  for (ix = 0; ix < pa; ix++) {
-    /* first calculate the digit at 2*ix */
-    /* calculate double precision result */
-    r = ((mp_word) t.dp[2*ix]) +
-        ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
-
-    /* store lower part in result */
-    t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-
-    /* get the carry */
-    u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-
-    /* left hand side of A[ix] * A[iy] */
-    tmpx        = a->dp[ix];
-
-    /* alias for where to store the results */
-    tmpt        = t.dp + (2*ix + 1);
-
-    for (iy = ix + 1; iy < pa; iy++) {
-      /* first calculate the product */
-      r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
-
-      /* now calculate the double precision result, note we use
-       * addition instead of *2 since it's easier to optimize
-       */
-      r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
-
-      /* store lower part */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* get carry */
-      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-    }
-    /* propagate upwards */
-    while (u != ((mp_digit) 0)) {
-      r       = ((mp_word) *tmpt) + ((mp_word) u);
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-    }
-  }
-
-  mp_clamp (&t);
-  mp_exch (&t, b);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_s_mp_sqr.c */
-
-/* Start: bn_s_mp_sub.c */
-#include <tommath.h>
-#ifdef BN_S_MP_SUB_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
-int
-s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     olduse, res, min, max;
-
-  /* find sizes */
-  min = b->used;
-  max = a->used;
-
-  /* init result */
-  if (c->alloc < max) {
-    if ((res = mp_grow (c, max)) != MP_OKAY) {
-      return res;
-    }
-  }
-  olduse = c->used;
-  c->used = max;
-
-  {
-    register mp_digit u, *tmpa, *tmpb, *tmpc;
-    register int i;
-
-    /* alias for digit pointers */
-    tmpa = a->dp;
-    tmpb = b->dp;
-    tmpc = c->dp;
-
-    /* set carry to zero */
-    u = 0;
-    for (i = 0; i < min; i++) {
-      /* T[i] = A[i] - B[i] - U */
-      *tmpc = *tmpa++ - *tmpb++ - u;
-
-      /* U = carry bit of T[i]
-       * Note this saves performing an AND operation since
-       * if a carry does occur it will propagate all the way to the
-       * MSB.  As a result a single shift is enough to get the carry
-       */
-      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
-
-      /* Clear carry from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* now copy higher words if any, e.g. if A has more digits than B  */
-    for (; i < max; i++) {
-      /* T[i] = A[i] - U */
-      *tmpc = *tmpa++ - u;
-
-      /* U = carry bit of T[i] */
-      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
-
-      /* Clear carry from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* clear digits above used (since we may not have grown result above) */
-    for (i = c->used; i < olduse; i++) {
-      *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bn_s_mp_sub.c */
-
-/* Start: bncore.c */
-#include <tommath.h>
-#ifdef BNCORE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://math.libtomcrypt.com
- */
-
-/* Known optimal configurations
-
- CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
--------------------------------------------------------------
- Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
- AMD Athlon64           /GCC v3.4.4   /        80/       120/LTM 0.35
-
-*/
-
-int     KARATSUBA_MUL_CUTOFF = 80,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 120,     /* Min. number of digits before Karatsuba squaring is used. */
-
-        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
-        TOOM_SQR_CUTOFF      = 400;
-#endif
-
-/* $Source$ */
-/* $Revision$ */
-/* $Date$ */
-
-/* End: bncore.c */
-
-
-/* EOF */
diff --git a/libtommath/pretty.build b/libtommath/pretty.build
deleted file mode 100644
index a708b8a..0000000
--- a/libtommath/pretty.build
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/perl -w
-#
-# Cute little builder for perl 
-# Total waste of development time...
-#
-# This will build all the object files and then the archive .a file
-# requires GCC, GNU make and a sense of humour.
-#
-# Tom St Denis
-use strict;
-
-my $count = 0;
-my $starttime = time;
-my $rate  = 0;
-print "Scanning for source files...\n";
-foreach my $filename (glob "*.c") {
-       ++$count;
-}
-print "Source files to build: $count\nBuilding...\n";
-my $i = 0;
-my $lines = 0;
-my $filesbuilt = 0;
-foreach my $filename (glob "*.c") {
-       printf("Building %3.2f%%, ", (++$i/$count)*100.0);
-       if ($i % 4 == 0) { print "/, "; }
-       if ($i % 4 == 1) { print "-, "; }
-       if ($i % 4 == 2) { print "\\, "; }
-       if ($i % 4 == 3) { print "|, "; }
-       if ($rate > 0) {
-           my $tleft = ($count - $i) / $rate;
-           my $tsec  = $tleft%60;
-           my $tmin  = ($tleft/60)%60;
-           my $thour = ($tleft/3600)%60;
-           printf("%2d:%02d:%02d left, ", $thour, $tmin, $tsec);
-       }
-       my $cnt = ($i/$count)*30.0;
-       my $x   = 0;
-       print "[";
-       for (; $x < $cnt; $x++) { print "#"; }
-       for (; $x < 30; $x++)   { print " "; }
-       print "]\r";
-       my $tmp = $filename;
-       $tmp =~ s/\.c/".o"/ge;
-       if (open(SRC, "<$tmp")) {
-          close SRC;
-       } else {
-          !system("make $tmp > /dev/null 2>/dev/null") or die "\nERROR: Failed to make $tmp!!!\n";
-          open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
-          ++$lines while (<SRC>);
-          close SRC or die "Error closing $filename after reading: $!";
-          ++$filesbuilt;
-       }      
-
-       # update timer 
-       if (time != $starttime) {
-          my $delay = time - $starttime;
-          $rate = $i/$delay;
-       }
-}
-
-# finish building the library 
-printf("\nFinished building source (%d seconds, %3.2f files per second).\n", time - $starttime, $rate);
-print "Compiled approximately $filesbuilt files and $lines lines of code.\n";
-print "Doing final make (building archive...)\n";
-!system("make > /dev/null 2>/dev/null") or die "\nERROR: Failed to perform last make command!!!\n";
-print "done.\n";
-\ No newline at end of file
diff --git a/libtommath/tombc/grammar.txt b/libtommath/tombc/grammar.txt
deleted file mode 100644
index a780e75..0000000
--- a/libtommath/tombc/grammar.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-program       := program statement | statement | empty
-statement     := { statement }                                                                              | 
-                 identifier = numexpression;                                                                | 
-                 identifier[numexpression] = numexpression;                                                 |
-                 function(expressionlist);                                                                  | 
-                 for (identifier = numexpression; numexpression; identifier = numexpression) { statement }  |
-                 while (numexpression) { statement }                                                        | 
-                 if (numexpression) { statement } elif                                                      | 
-                 break;                                                                                     | 
-                 continue;                                                                                  
-                 
-elif          := else statement | empty
-function      := abs | countbits | exptmod | jacobi | print | isprime | nextprime | issquare | readinteger | exit
-expressionlist := expressionlist, expression | expression
-
-// LR(1) !!!?
-expression    := string | numexpression
-numexpression := cmpexpr && cmpexpr | cmpexpr \|\| cmpexpr | cmpexpr
-cmpexpr       := boolexpr  < boolexpr | boolexpr  > boolexpr | boolexpr == boolexpr | 
-                 boolexpr <= boolexpr | boolexpr >= boolexpr | boolexpr
-boolexpr      := shiftexpr & shiftexpr | shiftexpr ^ shiftexpr | shiftexpr \| shiftexpr | shiftexpr
-shiftexpr     := addsubexpr << addsubexpr | addsubexpr >> addsubexpr | addsubexpr
-addsubexpr    := mulexpr + mulexpr | mulexpr - mulexpr | mulexpr
-mulexpr       := expr * expr       | expr / expr | expr % expr | expr
-expr          := -nexpr | nexpr 
-nexpr         := integer | identifier | ( numexpression ) | identifier[numexpression] 
-
-identifier    := identifier digits | identifier alpha | alpha
-alpha         := a ... z | A ... Z
-integer       := hexnumber | digits 
-hexnumber     := 0xhexdigits
-hexdigits     := hexdigits hexdigit | hexdigit
-hexdigit      := 0 ... 9 | a ... f | A ... F
-digits        := digits digit | digit 
-digit         := 0 ... 9
diff --git a/libtommath/tommath.h b/libtommath/tommath.h
index da0e473..3865949 100644
--- a/libtommath/tommath.h
+++ b/libtommath/tommath.h
@@ -62,7 +62,7 @@ extern "C" {
    typedef signed long long     long64;
 #endif
 
-   typedef uint64_t mp_digit;
+   typedef ulong64 mp_digit;
 #if defined(_WIN32)
    typedef unsigned __int128    mp_word;
 #elif defined(__GNUC__)
@@ -84,7 +84,7 @@ extern "C" {
 #endif
 
    typedef uint32_t             mp_digit;
-   typedef uint64_t             mp_word;
+   typedef ulong64              mp_word;
 
 #ifdef MP_31BIT
    /* this is an extension that uses 31-bit digits */
@@ -564,7 +564,7 @@ int mp_fwrite(mp_int *a, int radix, FILE *stream);
 #define mp_tohex(M, S)     mp_toradix((M), (S), 16)
 
 #ifdef __cplusplus
-}
+   }
 #endif
 
 #endif
diff --git a/libtommath/tommath.out b/libtommath/tommath.out
deleted file mode 100644
index de4aada..0000000
--- a/libtommath/tommath.out
+++ /dev/null
@@ -1,139 +0,0 @@
-\BOOKMARK [0][-]{chapter.1}{Introduction}{}% 1
-\BOOKMARK [1][-]{section.1.1}{Multiple Precision Arithmetic}{chapter.1}% 2
-\BOOKMARK [2][-]{subsection.1.1.1}{What is Multiple Precision Arithmetic?}{section.1.1}% 3
-\BOOKMARK [2][-]{subsection.1.1.2}{The Need for Multiple Precision Arithmetic}{section.1.1}% 4
-\BOOKMARK [2][-]{subsection.1.1.3}{Benefits of Multiple Precision Arithmetic}{section.1.1}% 5
-\BOOKMARK [1][-]{section.1.2}{Purpose of This Text}{chapter.1}% 6
-\BOOKMARK [1][-]{section.1.3}{Discussion and Notation}{chapter.1}% 7
-\BOOKMARK [2][-]{subsection.1.3.1}{Notation}{section.1.3}% 8
-\BOOKMARK [2][-]{subsection.1.3.2}{Precision Notation}{section.1.3}% 9
-\BOOKMARK [2][-]{subsection.1.3.3}{Algorithm Inputs and Outputs}{section.1.3}% 10
-\BOOKMARK [2][-]{subsection.1.3.4}{Mathematical Expressions}{section.1.3}% 11
-\BOOKMARK [2][-]{subsection.1.3.5}{Work Effort}{section.1.3}% 12
-\BOOKMARK [1][-]{section.1.4}{Exercises}{chapter.1}% 13
-\BOOKMARK [1][-]{section.1.5}{Introduction to LibTomMath}{chapter.1}% 14
-\BOOKMARK [2][-]{subsection.1.5.1}{What is LibTomMath?}{section.1.5}% 15
-\BOOKMARK [2][-]{subsection.1.5.2}{Goals of LibTomMath}{section.1.5}% 16
-\BOOKMARK [1][-]{section.1.6}{Choice of LibTomMath}{chapter.1}% 17
-\BOOKMARK [2][-]{subsection.1.6.1}{Code Base}{section.1.6}% 18
-\BOOKMARK [2][-]{subsection.1.6.2}{API Simplicity}{section.1.6}% 19
-\BOOKMARK [2][-]{subsection.1.6.3}{Optimizations}{section.1.6}% 20
-\BOOKMARK [2][-]{subsection.1.6.4}{Portability and Stability}{section.1.6}% 21
-\BOOKMARK [2][-]{subsection.1.6.5}{Choice}{section.1.6}% 22
-\BOOKMARK [0][-]{chapter.2}{Getting Started}{}% 23
-\BOOKMARK [1][-]{section.2.1}{Library Basics}{chapter.2}% 24
-\BOOKMARK [1][-]{section.2.2}{What is a Multiple Precision Integer?}{chapter.2}% 25
-\BOOKMARK [2][-]{subsection.2.2.1}{The mp\137int Structure}{section.2.2}% 26
-\BOOKMARK [1][-]{section.2.3}{Argument Passing}{chapter.2}% 27
-\BOOKMARK [1][-]{section.2.4}{Return Values}{chapter.2}% 28
-\BOOKMARK [1][-]{section.2.5}{Initialization and Clearing}{chapter.2}% 29
-\BOOKMARK [2][-]{subsection.2.5.1}{Initializing an mp\137int}{section.2.5}% 30
-\BOOKMARK [2][-]{subsection.2.5.2}{Clearing an mp\137int}{section.2.5}% 31
-\BOOKMARK [1][-]{section.2.6}{Maintenance Algorithms}{chapter.2}% 32
-\BOOKMARK [2][-]{subsection.2.6.1}{Augmenting an mp\137int's Precision}{section.2.6}% 33
-\BOOKMARK [2][-]{subsection.2.6.2}{Initializing Variable Precision mp\137ints}{section.2.6}% 34
-\BOOKMARK [2][-]{subsection.2.6.3}{Multiple Integer Initializations and Clearings}{section.2.6}% 35
-\BOOKMARK [2][-]{subsection.2.6.4}{Clamping Excess Digits}{section.2.6}% 36
-\BOOKMARK [0][-]{chapter.3}{Basic Operations}{}% 37
-\BOOKMARK [1][-]{section.3.1}{Introduction}{chapter.3}% 38
-\BOOKMARK [1][-]{section.3.2}{Assigning Values to mp\137int Structures}{chapter.3}% 39
-\BOOKMARK [2][-]{subsection.3.2.1}{Copying an mp\137int}{section.3.2}% 40
-\BOOKMARK [2][-]{subsection.3.2.2}{Creating a Clone}{section.3.2}% 41
-\BOOKMARK [1][-]{section.3.3}{Zeroing an Integer}{chapter.3}% 42
-\BOOKMARK [1][-]{section.3.4}{Sign Manipulation}{chapter.3}% 43
-\BOOKMARK [2][-]{subsection.3.4.1}{Absolute Value}{section.3.4}% 44
-\BOOKMARK [2][-]{subsection.3.4.2}{Integer Negation}{section.3.4}% 45
-\BOOKMARK [1][-]{section.3.5}{Small Constants}{chapter.3}% 46
-\BOOKMARK [2][-]{subsection.3.5.1}{Setting Small Constants}{section.3.5}% 47
-\BOOKMARK [2][-]{subsection.3.5.2}{Setting Large Constants}{section.3.5}% 48
-\BOOKMARK [1][-]{section.3.6}{Comparisons}{chapter.3}% 49
-\BOOKMARK [2][-]{subsection.3.6.1}{Unsigned Comparisions}{section.3.6}% 50
-\BOOKMARK [2][-]{subsection.3.6.2}{Signed Comparisons}{section.3.6}% 51
-\BOOKMARK [0][-]{chapter.4}{Basic Arithmetic}{}% 52
-\BOOKMARK [1][-]{section.4.1}{Introduction}{chapter.4}% 53
-\BOOKMARK [1][-]{section.4.2}{Addition and Subtraction}{chapter.4}% 54
-\BOOKMARK [2][-]{subsection.4.2.1}{Low Level Addition}{section.4.2}% 55
-\BOOKMARK [2][-]{subsection.4.2.2}{Low Level Subtraction}{section.4.2}% 56
-\BOOKMARK [2][-]{subsection.4.2.3}{High Level Addition}{section.4.2}% 57
-\BOOKMARK [2][-]{subsection.4.2.4}{High Level Subtraction}{section.4.2}% 58
-\BOOKMARK [1][-]{section.4.3}{Bit and Digit Shifting}{chapter.4}% 59
-\BOOKMARK [2][-]{subsection.4.3.1}{Multiplication by Two}{section.4.3}% 60
-\BOOKMARK [2][-]{subsection.4.3.2}{Division by Two}{section.4.3}% 61
-\BOOKMARK [1][-]{section.4.4}{Polynomial Basis Operations}{chapter.4}% 62
-\BOOKMARK [2][-]{subsection.4.4.1}{Multiplication by x}{section.4.4}% 63
-\BOOKMARK [2][-]{subsection.4.4.2}{Division by x}{section.4.4}% 64
-\BOOKMARK [1][-]{section.4.5}{Powers of Two}{chapter.4}% 65
-\BOOKMARK [2][-]{subsection.4.5.1}{Multiplication by Power of Two}{section.4.5}% 66
-\BOOKMARK [2][-]{subsection.4.5.2}{Division by Power of Two}{section.4.5}% 67
-\BOOKMARK [2][-]{subsection.4.5.3}{Remainder of Division by Power of Two}{section.4.5}% 68
-\BOOKMARK [0][-]{chapter.5}{Multiplication and Squaring}{}% 69
-\BOOKMARK [1][-]{section.5.1}{The Multipliers}{chapter.5}% 70
-\BOOKMARK [1][-]{section.5.2}{Multiplication}{chapter.5}% 71
-\BOOKMARK [2][-]{subsection.5.2.1}{The Baseline Multiplication}{section.5.2}% 72
-\BOOKMARK [2][-]{subsection.5.2.2}{Faster Multiplication by the ``Comba'' Method}{section.5.2}% 73
-\BOOKMARK [2][-]{subsection.5.2.3}{Polynomial Basis Multiplication}{section.5.2}% 74
-\BOOKMARK [2][-]{subsection.5.2.4}{Karatsuba Multiplication}{section.5.2}% 75
-\BOOKMARK [2][-]{subsection.5.2.5}{Toom-Cook 3-Way Multiplication}{section.5.2}% 76
-\BOOKMARK [2][-]{subsection.5.2.6}{Signed Multiplication}{section.5.2}% 77
-\BOOKMARK [1][-]{section.5.3}{Squaring}{chapter.5}% 78
-\BOOKMARK [2][-]{subsection.5.3.1}{The Baseline Squaring Algorithm}{section.5.3}% 79
-\BOOKMARK [2][-]{subsection.5.3.2}{Faster Squaring by the ``Comba'' Method}{section.5.3}% 80
-\BOOKMARK [2][-]{subsection.5.3.3}{Polynomial Basis Squaring}{section.5.3}% 81
-\BOOKMARK [2][-]{subsection.5.3.4}{Karatsuba Squaring}{section.5.3}% 82
-\BOOKMARK [2][-]{subsection.5.3.5}{Toom-Cook Squaring}{section.5.3}% 83
-\BOOKMARK [2][-]{subsection.5.3.6}{High Level Squaring}{section.5.3}% 84
-\BOOKMARK [0][-]{chapter.6}{Modular Reduction}{}% 85
-\BOOKMARK [1][-]{section.6.1}{Basics of Modular Reduction}{chapter.6}% 86
-\BOOKMARK [1][-]{section.6.2}{The Barrett Reduction}{chapter.6}% 87
-\BOOKMARK [2][-]{subsection.6.2.1}{Fixed Point Arithmetic}{section.6.2}% 88
-\BOOKMARK [2][-]{subsection.6.2.2}{Choosing a Radix Point}{section.6.2}% 89
-\BOOKMARK [2][-]{subsection.6.2.3}{Trimming the Quotient}{section.6.2}% 90
-\BOOKMARK [2][-]{subsection.6.2.4}{Trimming the Residue}{section.6.2}% 91
-\BOOKMARK [2][-]{subsection.6.2.5}{The Barrett Algorithm}{section.6.2}% 92
-\BOOKMARK [2][-]{subsection.6.2.6}{The Barrett Setup Algorithm}{section.6.2}% 93
-\BOOKMARK [1][-]{section.6.3}{The Montgomery Reduction}{chapter.6}% 94
-\BOOKMARK [2][-]{subsection.6.3.1}{Digit Based Montgomery Reduction}{section.6.3}% 95
-\BOOKMARK [2][-]{subsection.6.3.2}{Baseline Montgomery Reduction}{section.6.3}% 96
-\BOOKMARK [2][-]{subsection.6.3.3}{Faster ``Comba'' Montgomery Reduction}{section.6.3}% 97
-\BOOKMARK [2][-]{subsection.6.3.4}{Montgomery Setup}{section.6.3}% 98
-\BOOKMARK [1][-]{section.6.4}{The Diminished Radix Algorithm}{chapter.6}% 99
-\BOOKMARK [2][-]{subsection.6.4.1}{Choice of Moduli}{section.6.4}% 100
-\BOOKMARK [2][-]{subsection.6.4.2}{Choice of k}{section.6.4}% 101
-\BOOKMARK [2][-]{subsection.6.4.3}{Restricted Diminished Radix Reduction}{section.6.4}% 102
-\BOOKMARK [2][-]{subsection.6.4.4}{Unrestricted Diminished Radix Reduction}{section.6.4}% 103
-\BOOKMARK [1][-]{section.6.5}{Algorithm Comparison}{chapter.6}% 104
-\BOOKMARK [0][-]{chapter.7}{Exponentiation}{}% 105
-\BOOKMARK [1][-]{section.7.1}{Exponentiation Basics}{chapter.7}% 106
-\BOOKMARK [2][-]{subsection.7.1.1}{Single Digit Exponentiation}{section.7.1}% 107
-\BOOKMARK [1][-]{section.7.2}{k-ary Exponentiation}{chapter.7}% 108
-\BOOKMARK [2][-]{subsection.7.2.1}{Optimal Values of k}{section.7.2}% 109
-\BOOKMARK [2][-]{subsection.7.2.2}{Sliding-Window Exponentiation}{section.7.2}% 110
-\BOOKMARK [1][-]{section.7.3}{Modular Exponentiation}{chapter.7}% 111
-\BOOKMARK [2][-]{subsection.7.3.1}{Barrett Modular Exponentiation}{section.7.3}% 112
-\BOOKMARK [1][-]{section.7.4}{Quick Power of Two}{chapter.7}% 113
-\BOOKMARK [0][-]{chapter.8}{Higher Level Algorithms}{}% 114
-\BOOKMARK [1][-]{section.8.1}{Integer Division with Remainder}{chapter.8}% 115
-\BOOKMARK [2][-]{subsection.8.1.1}{Quotient Estimation}{section.8.1}% 116
-\BOOKMARK [2][-]{subsection.8.1.2}{Normalized Integers}{section.8.1}% 117
-\BOOKMARK [2][-]{subsection.8.1.3}{Radix- Division with Remainder}{section.8.1}% 118
-\BOOKMARK [1][-]{section.8.2}{Single Digit Helpers}{chapter.8}% 119
-\BOOKMARK [2][-]{subsection.8.2.1}{Single Digit Addition and Subtraction}{section.8.2}% 120
-\BOOKMARK [2][-]{subsection.8.2.2}{Single Digit Multiplication}{section.8.2}% 121
-\BOOKMARK [2][-]{subsection.8.2.3}{Single Digit Division}{section.8.2}% 122
-\BOOKMARK [2][-]{subsection.8.2.4}{Single Digit Root Extraction}{section.8.2}% 123
-\BOOKMARK [1][-]{section.8.3}{Random Number Generation}{chapter.8}% 124
-\BOOKMARK [1][-]{section.8.4}{Formatted Representations}{chapter.8}% 125
-\BOOKMARK [2][-]{subsection.8.4.1}{Reading Radix-n Input}{section.8.4}% 126
-\BOOKMARK [2][-]{subsection.8.4.2}{Generating Radix-n Output}{section.8.4}% 127
-\BOOKMARK [0][-]{chapter.9}{Number Theoretic Algorithms}{}% 128
-\BOOKMARK [1][-]{section.9.1}{Greatest Common Divisor}{chapter.9}% 129
-\BOOKMARK [2][-]{subsection.9.1.1}{Complete Greatest Common Divisor}{section.9.1}% 130
-\BOOKMARK [1][-]{section.9.2}{Least Common Multiple}{chapter.9}% 131
-\BOOKMARK [1][-]{section.9.3}{Jacobi Symbol Computation}{chapter.9}% 132
-\BOOKMARK [2][-]{subsection.9.3.1}{Jacobi Symbol}{section.9.3}% 133
-\BOOKMARK [1][-]{section.9.4}{Modular Inverse}{chapter.9}% 134
-\BOOKMARK [2][-]{subsection.9.4.1}{General Case}{section.9.4}% 135
-\BOOKMARK [1][-]{section.9.5}{Primality Tests}{chapter.9}% 136
-\BOOKMARK [2][-]{subsection.9.5.1}{Trial Division}{section.9.5}% 137
-\BOOKMARK [2][-]{subsection.9.5.2}{The Fermat Test}{section.9.5}% 138
-\BOOKMARK [2][-]{subsection.9.5.3}{The Miller-Rabin Test}{section.9.5}% 139
diff --git a/libtommath/tommath.pdf b/libtommath/tommath.pdf
deleted file mode 100644
index c9571d8..0000000
--- a/libtommath/tommath.pdf
+++ /dev/null
diff --git a/libtommath/tommath.src b/libtommath/tommath.src
deleted file mode 100644
index 768ed10..0000000
--- a/libtommath/tommath.src
+++ /dev/null
@@ -1,6339 +0,0 @@
-\documentclass[b5paper]{book}
-\usepackage{hyperref}
-\usepackage{makeidx}
-\usepackage{amssymb}
-\usepackage{color}
-\usepackage{alltt}
-\usepackage{graphicx}
-\usepackage{layout}
-\def\union{\cup}
-\def\intersect{\cap}
-\def\getsrandom{\stackrel{\rm R}{\gets}}
-\def\cross{\times}
-\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
-\def\catn{$\|$}
-\def\divides{\hspace{0.3em} | \hspace{0.3em}}
-\def\nequiv{\not\equiv}
-\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
-\def\lcm{{\rm lcm}}
-\def\gcd{{\rm gcd}}
-\def\log{{\rm log}}
-\def\ord{{\rm ord}}
-\def\abs{{\mathit abs}}
-\def\rep{{\mathit rep}}
-\def\mod{{\mathit\ mod\ }}
-\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
-\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
-\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
-\def\Or{{\rm\ or\ }}
-\def\And{{\rm\ and\ }}
-\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
-\def\implies{\Rightarrow}
-\def\undefined{{\rm ``undefined"}}
-\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
-\let\oldphi\phi
-\def\phi{\varphi}
-\def\Pr{{\rm Pr}}
-\newcommand{\str}[1]{{\mathbf{#1}}}
-\def\F{{\mathbb F}}
-\def\N{{\mathbb N}}
-\def\Z{{\mathbb Z}}
-\def\R{{\mathbb R}}
-\def\C{{\mathbb C}}
-\def\Q{{\mathbb Q}}
-\definecolor{DGray}{gray}{0.5}
-\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
-\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
-\def\gap{\vspace{0.5ex}}
-\makeindex
-\begin{document}
-\frontmatter
-\pagestyle{empty}
-\title{Multi--Precision Math}
-\author{\mbox{
-%\begin{small}
-\begin{tabular}{c}
-Tom St Denis \\
-Algonquin College \\
-\\
-Mads Rasmussen \\
-Open Communications Security \\
-\\
-Greg Rose \\
-QUALCOMM Australia \\
-\end{tabular}
-%\end{small}
-}
-}
-\maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.39 release of the
-LibTomMath project.
-
-This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{}
-{\em book} macro package and the Perl {\em booker} package.
-
-\tableofcontents
-\listoffigures
-\chapter*{Prefaces}
-When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.
-They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''
-Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which
-perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
-others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give
-back to society in the form of tools and knowledge that can help others in their endeavours.
-
-I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
-code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
-explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
-itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
-of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
-from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
-
-This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
-of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
-length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
-comprises literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg
-were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to
-continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.
-
-To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
-honour your kind gestures with this project.
-
-Open Source.  Open Academia.  Open Minds.
-
-\begin{flushright} Tom St Denis \end{flushright}
-
-\newpage
-I found the opportunity to work with Tom appealing for several reasons, not only could I broaden my own horizons, but also
-contribute to educate others facing the problem of having to handle big number mathematical calculations.
-
-This book is Tom's child and he has been caring and fostering the project ever since the beginning with a clear mind of
-how he wanted the project to turn out. I have helped by proofreading the text and we have had several discussions about
-the layout and language used.
-
-I hold a masters degree in cryptography from the University of Southern Denmark and have always been interested in the
-practical aspects of cryptography.
-
-Having worked in the security consultancy business for several years in S\~{a}o Paulo, Brazil, I have been in touch with a
-great deal of work in which multiple precision mathematics was needed. Understanding the possibilities for speeding up
-multiple precision calculations is often very important since we deal with outdated machine architecture where modular
-reductions, for example, become painfully slow.
-
-This text is for people who stop and wonder when examining algorithms such as RSA for the first time and ask
-themselves, ``You tell me this is only secure for large numbers, fine; but how do you implement these numbers?''
-
-\begin{flushright}
-Mads Rasmussen
-
-S\~{a}o Paulo - SP
-
-Brazil
-\end{flushright}
-
-\newpage
-It's all because I broke my leg. That just happened to be at about the same time that Tom asked for someone to review the section of the book about
-Karatsuba multiplication. I was laid up, alone and immobile, and thought ``Why not?'' I vaguely knew what Karatsuba multiplication was, but not
-really, so I thought I could help, learn, and stop myself from watching daytime cable TV, all at once.
-
-At the time of writing this, I've still not met Tom or Mads in meatspace. I've been following Tom's progress since his first splash on the
-sci.crypt Usenet news group. I watched him go from a clueless newbie, to the cryptographic equivalent of a reformed smoker, to a real
-contributor to the field, over a period of about two years. I've been impressed with his obvious intelligence, and astounded by his productivity.
-Of course, he's young enough to be my own child, so he doesn't have my problems with staying awake.
-
-When I reviewed that single section of the book, in its very earliest form, I was very pleasantly surprised. So I decided to collaborate more fully,
-and at least review all of it, and perhaps write some bits too. There's still a long way to go with it, and I have watched a number of close
-friends go through the mill of publication, so I think that the way to go is longer than Tom thinks it is. Nevertheless, it's a good effort,
-and I'm pleased to be involved with it.
-
-\begin{flushright}
-Greg Rose, Sydney, Australia, June 2003.
-\end{flushright}
-
-\mainmatter
-\pagestyle{headings}
-\chapter{Introduction}
-\section{Multiple Precision Arithmetic}
-
-\subsection{What is Multiple Precision Arithmetic?}
-When we think of long-hand arithmetic such as addition or multiplication we rarely consider the fact that we instinctively
-raise or lower the precision of the numbers we are dealing with.  For example, in decimal we can almost immediately
-reason that $7$ times $6$ is $42$.  However, $42$ has two digits of precision as opposed to one digit we started with.
-Further multiplications of say $3$ result in a larger precision result $126$.  In these few examples we have multiple
-precisions for the numbers we are working with.  Despite the various levels of precision a single subset\footnote{With the occasional optimization.}
- of algorithms can be designed to accommodate them.
-
-By way of comparison a fixed or single precision operation would lose precision on various operations.  For example, in
-the decimal system with fixed precision $6 \cdot 7 = 2$.
-
-Essentially at the heart of computer based multiple precision arithmetic are the same long-hand algorithms taught in
-schools to manually add, subtract, multiply and divide.
-
-\subsection{The Need for Multiple Precision Arithmetic}
-The most prevalent need for multiple precision arithmetic, often referred to as ``bignum'' math, is within the implementation
-of public-key cryptography algorithms.   Algorithms such as RSA \cite{RSAREF} and Diffie-Hellman \cite{DHREF} require
-integers of significant magnitude to resist known cryptanalytic attacks.  For example, at the time of this writing a
-typical RSA modulus would be at least greater than $10^{309}$.  However, modern programming languages such as ISO C \cite{ISOC} and
-Java \cite{JAVA} only provide intrinsic support for integers which are relatively small and single precision.
-
-\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{|r|c|}
-\hline \textbf{Data Type} & \textbf{Range} \\
-\hline char  & $-128 \ldots 127$ \\
-\hline short & $-32768 \ldots 32767$ \\
-\hline long  & $-2147483648 \ldots 2147483647$ \\
-\hline long long & $-9223372036854775808 \ldots 9223372036854775807$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Typical Data Types for the C Programming Language}
-\label{fig:ISOC}
-\end{figure}
-
-The largest data type guaranteed to be provided by the ISO C programming
-language\footnote{As per the ISO C standard.  However, each compiler vendor is allowed to augment the precision as they
-see fit.}  can only represent values up to $10^{19}$ as shown in figure \ref{fig:ISOC}. On its own the C language is
-insufficient to accommodate the magnitude required for the problem at hand.  An RSA modulus of magnitude $10^{19}$ could be
-trivially factored\footnote{A Pollard-Rho factoring would take only $2^{16}$ time.} on the average desktop computer,
-rendering any protocol based on the algorithm insecure.  Multiple precision algorithms solve this very problem by
-extending the range of representable integers while using single precision data types.
-
-Most advancements in fast multiple precision arithmetic stem from the need for faster and more efficient cryptographic
-primitives.  Faster modular reduction and exponentiation algorithms such as Barrett's algorithm, which have appeared in
-various cryptographic journals, can render algorithms such as RSA and Diffie-Hellman more efficient.  In fact, several
-major companies such as RSA Security, Certicom and Entrust have built entire product lines on the implementation and
-deployment of efficient algorithms.
-
-However, cryptography is not the only field of study that can benefit from fast multiple precision integer routines.
-Another auxiliary use of multiple precision integers is high precision floating point data types.
-The basic IEEE \cite{IEEE} standard floating point type is made up of an integer mantissa $q$, an exponent $e$ and a sign bit $s$.
-Numbers are given in the form $n = q \cdot b^e \cdot (-1)^s$ where $b = 2$ is the most common base for IEEE.  Since IEEE
-floating point is meant to be implemented in hardware the precision of the mantissa is often fairly small
-(\textit{23, 52 and 64 bits}).  The mantissa is merely an integer and a multiple precision integer could be used to create
-a mantissa of much larger precision than hardware alone can efficiently support.  This approach could be useful where
-scientific applications must minimize the total output error over long calculations.
-
-Yet another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
-In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.
-
-\subsection{Benefits of Multiple Precision Arithmetic}
-\index{precision}
-The benefit of multiple precision representations over single or fixed precision representations is that
-no precision is lost while representing the result of an operation which requires excess precision.  For example,
-the product of two $n$-bit integers requires at least $2n$ bits of precision to be represented faithfully.  A multiple
-precision algorithm would augment the precision of the destination to accommodate the result while a single precision system
-would truncate excess bits to maintain a fixed level of precision.
-
-It is possible to implement algorithms which require large integers with fixed precision algorithms.  For example, elliptic
-curve cryptography (\textit{ECC}) is often implemented on smartcards by fixing the precision of the integers to the maximum
-size the system will ever need.  Such an approach can lead to vastly simpler algorithms which can accommodate the
-integers required even if the host platform cannot natively accommodate them\footnote{For example, the average smartcard
-processor has an 8 bit accumulator.}.  However, as efficient as such an approach may be, the resulting source code is not
-normally very flexible.  It cannot, at runtime, accommodate inputs of higher magnitude than the designer anticipated.
-
-Multiple precision algorithms have the most overhead of any style of arithmetic.  For the most part the
-overhead can be kept to a minimum with careful planning, but overall, it is not well suited for most memory starved
-platforms.  However, multiple precision algorithms do offer the most flexibility in terms of the magnitude of the
-inputs.  That is, the same algorithms based on multiple precision integers can accommodate any reasonable size input
-without the designer's explicit forethought.  This leads to lower cost of ownership for the code as it only has to
-be written and tested once.
-
-\section{Purpose of This Text}
-The purpose of this text is to instruct the reader regarding how to implement efficient multiple precision algorithms.
-That is, to explain not only a limited subset of the core theory behind the algorithms but also the various ``housekeeping''
-elements that are neglected by authors of other texts on the subject.  Several renowned texts \cite{TAOCPV2,HAC}
-give considerably detailed explanations of the theoretical aspects of algorithms and often very little information
-regarding the practical implementation aspects.
-
-In most cases how an algorithm is explained and how it is actually implemented are two very different concepts.  For
-example, the Handbook of Applied Cryptography (\textit{HAC}), algorithm 14.7 on page 594, gives a relatively simple
-algorithm for performing multiple precision integer addition.  However, the description lacks any discussion concerning
-the fact that the two integer inputs may be of differing magnitudes.  As a result the implementation is not as simple
-as the text would lead people to believe.  Similarly the division routine (\textit{algorithm 14.20, pp. 598}) does not
-discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{step \#3}).
-
-Both texts also do not discuss several key optimal algorithms required such as ``Comba'' and Karatsuba multipliers
-and fast modular inversion, which we consider practical oversights.  These optimal algorithms are vital to achieve
-any form of useful performance in non-trivial applications.
-
-To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
-package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.com}} package is used
-to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field
-tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text
-discusses a very large portion of the inner workings of the library.
-
-The algorithms that are presented will always include at least one ``pseudo-code'' description followed
-by the actual C source code that implements the algorithm.  The pseudo-code can be used to implement the same
-algorithm in other programming languages as the reader sees fit.
-
-This text shall also serve as a walkthrough of the creation of multiple precision algorithms from scratch, showing
-the reader how the algorithms fit together as well as where to start on various tasks.
-
-\section{Discussion and Notation}
-\subsection{Notation}
-A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1}, \ldots, x_1, x_0)_{ \beta }$ and represent
-the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are said to be the radix $\beta$ digits
-of the integer.  For example, $x = (1,2,3)_{10}$ would represent the integer
-$1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.
-
-\index{mp\_int}
-The term ``mp\_int'' shall refer to a composite structure which contains the digits of the integer it represents, as well
-as auxiliary data required to manipulate the data.  These additional members are discussed further in section
-\ref{sec:MPINT}.  For the purposes of this text a ``multiple precision integer'' and an ``mp\_int'' are assumed to be
-synonymous.  When an algorithm is specified to accept an mp\_int variable it is assumed the various auxiliary data members
-are present as well.  An expression of the type \textit{variablename.item} implies that it should evaluate to the
-member named ``item'' of the variable.  For example, a string of characters may have a member ``length'' which would
-evaluate to the number of characters in the string.  If the string $a$ equals ``hello'' then it follows that
-$a.length = 5$.
-
-For certain discussions more generic algorithms are presented to help the reader understand the final algorithm used
-to solve a given problem.  When an algorithm is described as accepting an integer input it is assumed the input is
-a plain integer with no additional multiple-precision members.  That is, algorithms that use integers as opposed to
-mp\_ints as inputs do not concern themselves with the housekeeping operations required such as memory management.  These
-algorithms will be used to establish the relevant theory which will subsequently be used to describe a multiple
-precision algorithm to solve the same problem.
-
-\subsection{Precision Notation}
-The variable $\beta$ represents the radix of a single digit of a multiple precision integer and
-must be of the form $q^p$ for $q, p \in \Z^+$.  A single precision variable must be able to represent integers in
-the range $0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range
-$0 \le x < q \beta^2$.  The extra radix-$q$ factor allows additions and subtractions to proceed without truncation of the
-carry.  Since all modern computers are binary, it is assumed that $q$ is two.
-
-\index{mp\_digit} \index{mp\_word}
-Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent
-a single precision integer type, while the data type \textbf{mp\_word} will represent a double precision integer type.  In
-several algorithms (notably the Comba routines) temporary results will be stored in arrays of double precision mp\_words.
-For the purposes of this text $x_j$ will refer to the $j$'th digit of a single precision array and $\hat x_j$ will refer to
-the $j$'th digit of a double precision array.  Whenever an expression is to be assigned to a double precision
-variable it is assumed that all single precision variables are promoted to double precision during the evaluation.
-Expressions that are assigned to a single precision variable are truncated to fit within the precision of a single
-precision data type.
-
-For example, if $\beta = 10^2$ a single precision data type may represent a value in the
-range $0 \le x < 10^3$, while a double precision data type may represent a value in the range $0 \le x < 10^5$.  Let
-$a = 23$ and $b = 49$ represent two single precision variables.  The single precision product shall be written
-as $c \leftarrow a \cdot b$ while the double precision product shall be written as $\hat c \leftarrow a \cdot b$.
-In this particular case, $\hat c = 1127$ and $c = 127$.  The most significant digit of the product would not fit
-in a single precision data type and as a result $c \ne \hat c$.
-
-\subsection{Algorithm Inputs and Outputs}
-Within the algorithm descriptions all variables are assumed to be scalars of either single or double precision
-as indicated.  The only exception to this rule is when variables have been indicated to be of type mp\_int.  This
-distinction is important as scalars are often used as array indices and various other counters.
-
-\subsection{Mathematical Expressions}
-The $\lfloor \mbox{ } \rfloor$ brackets imply an expression truncated to an integer not greater than the expression
-itself.  For example, $\lfloor 5.7 \rfloor = 5$.  Similarly the $\lceil \mbox{ } \rceil$ brackets imply an expression
-rounded to an integer not less than the expression itself.  For example, $\lceil 5.1 \rceil = 6$.  Typically when
-the $/$ division symbol is used the intention is to perform an integer division with truncation.  For example,
-$5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When an expression is written as a
-fraction a real value division is implied, for example ${5 \over 2} = 2.5$.
-
-The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
-of the integer.  For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.
-
-\subsection{Work Effort}
-\index{big-Oh}
-To measure the efficiency of the specified algorithms, a modified big-Oh notation is used.  In this system all
-single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.
-That is, a single precision addition, multiplication and division are assumed to take the same time to
-complete.  While this is generally not true in practice, it will simplify the discussions considerably.
-
-Some algorithms have slight advantages over others which is why some constants will not be removed in
-the notation.  For example, a normal baseline multiplication (section \ref{sec:basemult}) requires $O(n^2)$ work while a
-baseline squaring (section \ref{sec:basesquare}) requires $O({{n^2 + n}\over 2})$ work.  In standard big-Oh notation these
-would both be said to be equivalent to $O(n^2)$.  However,
-in the context of this text this is not the case as the magnitude of the inputs will typically be rather small.  As a
-result small constant factors in the work effort will make an observable difference in algorithm efficiency.
-
-All of the algorithms presented in this text have a polynomial time work level.  That is, of the form
-$O(n^k)$ for $n, k \in \Z^{+}$.  This will help make useful comparisons in terms of the speed of the algorithms and how
-various optimizations will help pay off in the long run.
-
-\section{Exercises}
-Within the more advanced chapters a section will be set aside to give the reader some challenging exercises related to
-the discussion at hand.  These exercises are not designed to be prize winning problems, but instead to be thought
-provoking.  Wherever possible the problems are forward minded, stating problems that will be answered in subsequent
-chapters.  The reader is encouraged to finish the exercises as they appear to get a better understanding of the
-subject material.
-
-That being said, the problems are designed to affirm knowledge of a particular subject matter.  Students in particular
-are encouraged to verify they can answer the problems correctly before moving on.
-
-Similar to the exercises of \cite[pp. ix]{TAOCPV2} these exercises are given a scoring system based on the difficulty of
-the problem.  However, unlike \cite{TAOCPV2} the problems do not get nearly as hard.  The scoring of these
-exercises ranges from one (the easiest) to five (the hardest).  The following table summarizes the
-scoring system used.
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|l|}
-\hline $\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
-                            & minutes to solve.  Usually does not involve much computer time \\
-                            & to solve. \\
-\hline $\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
-                     & time usage.  Usually requires a program to be written to \\
-                     & solve the problem. \\
-\hline $\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
-                     & of work.  Usually involves trivial research and development of \\
-                     & new theory from the perspective of a student. \\
-\hline $\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
-                     & of work and research, the solution to which will demonstrate \\
-                     & a higher mastery of the subject matter. \\
-\hline $\left [ 5 \right ]$ & A hard problem that involves concepts that are difficult for a \\
-                     & novice to solve.  Solutions to these problems will demonstrate a \\
-                     & complete mastery of the given subject. \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Exercise Scoring System}
-\end{figure}
-
-Problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
-devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level
-are also designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  These
-two levels are essentially entry level questions.
-
-Problems at the third level are meant to be a bit more difficult than the first two levels.  The answer is often
-fairly obvious but arriving at an exacting solution requires some thought and skill.  These problems will almost always
-involve devising a new algorithm or implementing a variation of another algorithm previously presented.  Readers who can
-answer these questions will feel comfortable with the concepts behind the topic at hand.
-
-Problems at the fourth level are meant to be similar to those of the level three questions except they will require
-additional research to be completed.  The reader will most likely not know the answer right away, nor will the text provide
-the exact details of the answer until a subsequent chapter.
-
-Problems at the fifth level are meant to be the hardest
-problems relative to all the other problems in the chapter.  People who can correctly answer fifth level problems have a
-mastery of the subject matter at hand.
-
-Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
-is encouraged to answer the follow-up problems and try to draw the relevance of problems.
-
-\section{Introduction to LibTomMath}
-
-\subsection{What is LibTomMath?}
-LibTomMath is a free and open source multiple precision integer library written entirely in portable ISO C.  By portable it
-is meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on
-any given platform.
-
-The library has been successfully tested under numerous operating systems including Unix\footnote{All of these
-trademarks belong to their respective rightful owners.}, MacOS, Windows, Linux, PalmOS and on standalone hardware such
-as the Gameboy Advance.  The library is designed to contain enough functionality to be able to develop applications such
-as public key cryptosystems and still maintain a relatively small footprint.
-
-\subsection{Goals of LibTomMath}
-
-Libraries which obtain the most efficiency are rarely written in a high level programming language such as C.  However,
-even though this library is written entirely in ISO C, considerable care has been taken to optimize the algorithm implementations within the
-library.  Specifically the code has been written to work well with the GNU C Compiler (\textit{GCC}) on both x86 and ARM
-processors.  Wherever possible, highly efficient algorithms, such as Karatsuba multiplication, sliding window
-exponentiation and Montgomery reduction have been provided to make the library more efficient.
-
-Even with the nearly optimal and specialized algorithms that have been included the Application Programming Interface
-(\textit{API}) has been kept as simple as possible.  Often generic place holder routines will make use of specialized
-algorithms automatically without the developer's specific attention.  One such example is the generic multiplication
-algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication
-based on the magnitude of the inputs and the configuration of the library.
-
-Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should
-be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
-MPI library was used as an API template for all the basic functions.  MPI was chosen because it is another library that fits
-in the same niche as LibTomMath.  Even though LibTomMath uses MPI as the template for the function names and argument
-passing conventions, it has been written from scratch by Tom St Denis.
-
-The project is also meant to act as a learning tool for students, the logic being that no easy-to-follow ``bignum''
-library exists which can be used to teach computer science students how to perform fast and reliable multiple precision
-integer arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.
-
-\section{Choice of LibTomMath}
-LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
-for more worthy reasons.  Other libraries such as GMP \cite{GMP}, MPI \cite{MPI}, LIP \cite{LIP} and OpenSSL
-\cite{OPENSSL} have multiple precision integer arithmetic routines but would not be ideal for this text for
-reasons that will be explained in the following sub-sections.
-
-\subsection{Code Base}
-The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
-segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
-developer can more readily discern the true intent of a given section of source code without trying to keep track of
-what conditional code will be used.
-
-The code base of LibTomMath is well organized.  Each function is in its own separate source code file
-which allows the reader to find a given function very quickly.  On average there are $76$ lines of code per source
-file which makes the source very easy to follow.  By comparison MPI and LIP are single file projects making code tracing
-very hard.  GMP has many conditional code segments which also hinder tracing.
-
-When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
- which is fairly small compared to GMP (over $250$KiB).  LibTomMath is slightly larger than MPI (which compiles to about
-$50$KiB) but LibTomMath is also much faster and more complete than MPI.
-
-\subsection{API Simplicity}
-LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build
-with LibTomMath without change. The function names correlate directly to the action they perform.  Almost all of the
-functions share the same parameter passing convention.  The learning curve is fairly shallow with the API provided
-which is an extremely valuable benefit for the student and developer alike.
-
-The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to
-illegible short hand.  LibTomMath does not share this characteristic.
-
-The GMP library also does not return error codes.  Instead it uses a POSIX.1 \cite{POSIX1} signal system where errors
-are signaled to the host application.  This happens to be the fastest approach but definitely not the most versatile.  In
-effect a math error (e.g. invalid input, heap error, etc.) can cause a program to stop functioning which is definitely
-undesirable in many situations.
-
-\subsection{Optimizations}
-While LibTomMath is certainly not the fastest library (GMP often beats LibTomMath by a factor of two) it does
-feature a set of optimal algorithms for tasks such as modular reduction, exponentiation, multiplication and squaring.  GMP
-and LIP also feature such optimizations while MPI only uses baseline algorithms with no optimizations.  GMP lacks a few
-of the additional modular reduction optimizations that LibTomMath features\footnote{At the time of this writing GMP
-only had Barrett and Montgomery modular reduction algorithms.}.
-
-LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
-exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually
-slower than the best libraries such as GMP and OpenSSL by only a small factor.
-
-\subsection{Portability and Stability}
-LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler
-(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any
-variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of
-MPI has recently stopped working on his library and LIP has long since been discontinued.
-
-GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
-development and are very stable across a variety of platforms.
-
-\subsection{Choice}
-LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
-the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However,
-the reader is encouraged to download their own copy of the library to actually be able to work with the library.
-
-\chapter{Getting Started}
-\section{Library Basics}
-The trick to writing any useful library of source code is to build a solid foundation and work outwards from it.  First,
-a problem along with allowable solution parameters should be identified and analyzed.  In this particular case the
-inability to accommodate multiple precision integers is the problem.  Furthermore, the solution must be written
-as portable source code that is reasonably efficient across several different computer platforms.
-
-After a foundation is formed the remainder of the library can be designed and implemented in a hierarchical fashion.
-That is, to implement the lowest level dependencies first and work towards the most abstract functions last.  For example,
-before implementing a modular exponentiation algorithm one would implement a modular reduction algorithm.
-By building outwards from a base foundation instead of using a parallel design methodology the resulting project is
-highly modular.  Being highly modular is a desirable property of any project as it often means the resulting product
-has a small footprint and updates are easy to perform.
-
-Usually when I start a project I will begin with the header files.  I define the data types I think I will need and
-prototype the initial functions that are not dependent on other functions (within the library).  After I
-implement these base functions I prototype more dependent functions and implement them.   The process repeats until
-I implement all of the functions I require.  For example, in the case of LibTomMath I implemented functions such as
-mp\_init() well before I implemented mp\_mul() and even further before I implemented mp\_exptmod().  As an example as to
-why this design works note that the Karatsuba and Toom-Cook multipliers were written \textit{after} the
-dependent function mp\_exptmod() was written.  Adding the new multiplication algorithms did not require changes to the
-mp\_exptmod() function itself and lowered the total cost of ownership (\textit{so to speak}) and of development
-for new algorithms.  This methodology allows new algorithms to be tested in a complete framework with relative ease.
-
-FIGU,design_process,Design Flow of the First Few Original LibTomMath Functions.
-
-Only after the majority of the functions were in place did I pursue a less hierarchical approach to auditing and optimizing
-the source code.  For example, one day I may audit the multipliers and the next day the polynomial basis functions.
-
-It only makes sense to begin the text with the preliminary data types and support algorithms required as well.
-This chapter discusses the core algorithms of the library which are the dependencies for every other algorithm.
-
-\section{What is a Multiple Precision Integer?}
-Recall that most programming languages, in particular ISO C \cite{ISOC}, only have fixed precision data types that on their own cannot
-be used to represent values larger than their precision will allow. The purpose of multiple precision algorithms is
-to use fixed precision data types to create and manipulate multiple precision integers which may represent values
-that are very large.
-
-As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits.  In the decimal system
-the largest single digit value is $9$.  However, by concatenating digits together larger numbers may be represented.  Newly prepended digits
-(\textit{to the left}) are said to be in a different power of ten column.  That is, the number $123$ can be described as having a $1$ in the hundreds
-column, $2$ in the tens column and $3$ in the ones column.  Or more formally $123 = 1 \cdot 10^2 + 2 \cdot 10^1 + 3 \cdot 10^0$.  Computer based
-multiple precision arithmetic is essentially the same concept.  Larger integers are represented by adjoining fixed
-precision computer words with the exception that a different radix is used.
-
-What most people probably do not think about explicitly are the various other attributes that describe a multiple precision
-integer.  For example, the integer $154_{10}$ has two immediately obvious properties.  First, the integer is positive,
-that is the sign of this particular integer is positive as opposed to negative.  Second, the integer has three digits in
-its representation.  There is an additional property that the integer possesses that does not concern pencil-and-paper
-arithmetic.  The third property is how many digit placeholders are available to hold the integer.
-
-The human analogy of this third property is ensuring there is enough space on the paper to write the integer.  For example,
-if one starts writing a large number too far to the right on a piece of paper they will have to erase it and move left.
-Similarly, computer algorithms must maintain strict control over memory usage to ensure that the digits of an integer
-will not exceed the allowed boundaries.  These three properties make up what is known as a multiple precision
-integer or mp\_int for short.
-
-\subsection{The mp\_int Structure}
-\label{sec:MPINT}
-The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer.  The ISO C standard does not provide for
-any such data type but it does provide for making composite data types known as structures.  The following is the structure definition
-used within LibTomMath.
-
-\index{mp\_int}
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-%\begin{verbatim}
-\begin{tabular}{|l|}
-\hline
-typedef struct \{ \\
-\hspace{3mm}int used, alloc, sign;\\
-\hspace{3mm}mp\_digit *dp;\\
-\} \textbf{mp\_int}; \\
-\hline
-\end{tabular}
-%\end{verbatim}
-\end{small}
-\caption{The mp\_int Structure}
-\label{fig:mpint}
-\end{center}
-\end{figure}
-
-The mp\_int structure (fig. \ref{fig:mpint}) can be broken down as follows.
-
-\begin{enumerate}
-\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
-a given integer.  The \textbf{used} count must be positive (or zero) and may not exceed the \textbf{alloc} count.
-
-\item The \textbf{alloc} parameter denotes how
-many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count
-of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the
-array to accommodate the precision of the result.
-
-\item The pointer \textbf{dp} points to a dynamically allocated array of digits that represent the given multiple
-precision integer.  It is padded with $(\textbf{alloc} - \textbf{used})$ zero digits.  The array is maintained in a least
-significant digit order.  As a pencil and paper analogy the array is organized such that the right most digits are stored
-first starting at the location indexed by zero\footnote{In C all arrays begin at zero.} in the array.  For example,
-if \textbf{dp} contains $\lbrace a, b, c, \ldots \rbrace$ where \textbf{dp}$_0 = a$, \textbf{dp}$_1 = b$, \textbf{dp}$_2 = c$, $\ldots$ then
-it would represent the integer $a + b\beta + c\beta^2 + \ldots$
-
-\index{MP\_ZPOS} \index{MP\_NEG}
-\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).
-\end{enumerate}
-
-\subsubsection{Valid mp\_int Structures}
-Several rules are placed on the state of an mp\_int structure and are assumed to be followed for reasons of efficiency.
-The only exceptions are when the structure is passed to initialization functions such as mp\_init() and mp\_init\_copy().
-
-\begin{enumerate}
-\item The value of \textbf{alloc} may not be less than one.  That is \textbf{dp} always points to a previously allocated
-array of digits.
-\item The value of \textbf{used} may not exceed \textbf{alloc} and must be greater than or equal to zero.
-\item The value of \textbf{used} implies the digit at index $(used - 1)$ of the \textbf{dp} array is non-zero.  That is,
-leading zero digits in the most significant positions must be trimmed.
-   \begin{enumerate}
-   \item Digits in the \textbf{dp} array at and above the \textbf{used} location must be zero.
-   \end{enumerate}
-\item The value of \textbf{sign} must be \textbf{MP\_ZPOS} if \textbf{used} is zero;
-this represents the mp\_int value of zero.
-\end{enumerate}
-
-\section{Argument Passing}
-A convention of argument passing must be adopted early on in the development of any library.  Making the function
-prototypes consistent will help eliminate many headaches in the future as the library grows to significant complexity.
-In LibTomMath the multiple precision integer functions accept parameters from left to right as pointers to mp\_int
-structures.  That means that the source (input) operands are placed on the left and the destination (output) on the right.
-Consider the following examples.
-
-\begin{verbatim}
-   mp_mul(&a, &b, &c);   /* c = a * b */
-   mp_add(&a, &b, &a);   /* a = a + b */
-   mp_sqr(&a, &b);       /* b = a * a */
-\end{verbatim}
-
-The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
-functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
-
-Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around, to mimic the order
-of assignment expressions.  That is, the destination (output) is on the left and arguments (inputs) are on the right.  In
-truth, it is entirely a matter of preference.  In the case of LibTomMath the convention from the MPI library has been
-adopted.
-
-Another very useful design consideration, provided for in LibTomMath, is whether to allow argument sources to also be a
-destination.  For example, the second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important
-feature to implement since it allows the calling functions to cut down on the number of variables it must maintain.
-However, to implement this feature specific care has to be given to ensure the destination is not modified before the
-source is fully read.
-
-\section{Return Values}
-A well implemented application, no matter what its purpose, should trap as many runtime errors as possible and return them
-to the caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour.  However, the end
-developer can still manage to cause a library to crash.  For example, by passing an invalid pointer an application may
-fault by dereferencing memory not owned by the application.
-
-In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for
-instance) and memory allocation errors.  It will not check that the mp\_int passed to any function is valid nor
-will it check pointers for validity.  Any function that can cause a runtime error will return an error code as an
-\textbf{int} data type with one of the following values (fig \ref{fig:errcodes}).
-
-\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|l|l|}
-\hline \textbf{Value} & \textbf{Meaning} \\
-\hline \textbf{MP\_OKAY} & The function was successful \\
-\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
-\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
-\hline
-\end{tabular}
-\end{center}
-\caption{LibTomMath Error Codes}
-\label{fig:errcodes}
-\end{figure}
-
-When an error is detected within a function it should free any memory it allocated, often during the initialization of
-temporary mp\_ints, and return as soon as possible.  The goal is to leave the system in the same state it was when the
-function was called.  Error checking with this style of API is fairly simple.
-
-\begin{verbatim}
-   int err;
-   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
-      printf("Error: %s\n", mp_error_to_string(err));
-      exit(EXIT_FAILURE);
-   }
-\end{verbatim}
-
-The GMP \cite{GMP} library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal
-and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
-
-\section{Initialization and Clearing}
-The logical starting point when actually writing multiple precision integer functions is the initialization and
-clearing of the mp\_int structures.  These two algorithms will be used by the majority of the higher level algorithms.
-
-Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
-the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even though
-the initial integer will represent zero.  If only a single digit were allocated quite a few subsequent re-allocations
-would occur when operations are performed on the integers.  There is a tradeoff between how many default digits to allocate
-and how many re-allocations are tolerable.  Obviously allocating an excessive amount of digits initially will waste
-memory and become unmanageable.
-
-If the memory for the digits has been successfully allocated then the rest of the members of the structure must
-be initialized.  Since the initial state of an mp\_int is to represent the zero integer, the allocated digits must be set
-to zero.  The \textbf{used} count is set to zero and \textbf{sign} is set to \textbf{MP\_ZPOS}.
-
-\subsection{Initializing an mp\_int}
-An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
-structure are set to valid values.  The mp\_init algorithm will perform such an action.
-
-\index{mp\_init}
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Allocate memory and initialize $a$ to a known valid mp\_int state.  \\
-\hline \\
-1.  Allocate memory for \textbf{MP\_PREC} digits. \\
-2.  If the allocation failed return(\textit{MP\_MEM}) \\
-3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
-\hspace{3mm}3.1  $a_n \leftarrow 0$\\
-4.  $a.sign \leftarrow MP\_ZPOS$\\
-5.  $a.used \leftarrow 0$\\
-6.  $a.alloc \leftarrow MP\_PREC$\\
-7.  Return(\textit{MP\_OKAY})\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init}
-\end{figure}
-
-\textbf{Algorithm mp\_init.}
-The purpose of this function is to initialize an mp\_int structure so that the rest of the library can properly
-manipulate it.  It is assumed that the input may not have had any of its members previously initialized which is certainly
-a valid assumption if the input resides on the stack.
-
-Before any of the members such as \textbf{sign}, \textbf{used} or \textbf{alloc} are initialized the memory for
-the digits is allocated.  If this fails the function returns before setting any of the other members.  The \textbf{MP\_PREC}
-name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.}
-used to dictate the minimum precision of newly initialized mp\_int integers.  Ideally, it is at least equal to the smallest
-precision number you'll be working with.
-
-Allocating a block of digits at first instead of a single digit has the benefit of lowering the number of usually slow
-heap operations later functions will have to perform in the future.  If \textbf{MP\_PREC} is set correctly the slack
-memory and the number of heap operations will be trivial.
-
-Once the allocation has been made the digits have to be set to zero as well as the \textbf{used}, \textbf{sign} and
-\textbf{alloc} members initialized.  This ensures that the mp\_int will always represent the default state of zero regardless
-of the original condition of the input.
-
-\textbf{Remark.}
-This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
-when the ``to'' keyword is placed between two expressions.  For example, ``for $a$ from $b$ to $c$ do'' means that
-a subsequent expression (or body of expressions) is to be evaluated up to $c - b + 1$ times so long as $b \le c$.  In each
-iteration the variable $a$ is substituted for a new integer that lies inclusively between $b$ and $c$.  If $b > c$ occurred
-the loop would not iterate.  By contrast if the ``downto'' keyword were used in place of ``to'' the loop would iterate
-decrementally.
-
-EXAM,bn_mp_init.c
-
-One immediate observation of this initialization function is that it does not return a pointer to an mp\_int structure.  It
-is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The
-call to mp\_init() is used only to initialize the members of the structure to a known default state.
-
-Here we see (line @23,XMALLOC@) the memory allocation is performed first.  This allows us to exit cleanly and quickly
-if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
-was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
-but a macro defined in ``tommath.h''.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
-memory allocation routine.
-
-In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
-accomplished by using calloc() instead of malloc().  However,  to correctly initialize an integer type to a given value in a
-portable fashion you have to actually assign the value.  The for loop (line @28,for@) performs this required
-operation.
-
-After the memory has been successfully initialized the remainder of the members are initialized
-(lines @29,used@ through @31,sign@) to their respective default states.  At this point the algorithm has succeeded and
-a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the
-mp\_int structure has been properly initialized and is safe to use with other functions within the library.
-
-\subsection{Clearing an mp\_int}
-When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be
-returned to the application's memory pool with the mp\_clear algorithm.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_clear}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  The memory for $a$ shall be deallocated.  \\
-\hline \\
-1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
-2.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}2.1  $a_n \leftarrow 0$ \\
-3.  Free the memory allocated for the digits of $a$. \\
-4.  $a.used \leftarrow 0$ \\
-5.  $a.alloc \leftarrow 0$ \\
-6.  $a.sign \leftarrow MP\_ZPOS$ \\
-7.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_clear}
-\end{figure}
-
-\textbf{Algorithm mp\_clear.}
-This algorithm accomplishes two goals.  First, it clears the digits and the other mp\_int members.  This ensures that
-if a developer accidentally re-uses a cleared structure it is less likely to cause problems.  The second goal
-is to free the allocated memory.
-
-The logic behind the algorithm is extended by marking cleared mp\_int structures so that subsequent calls to this
-algorithm will not try to free the memory multiple times.  Cleared mp\_ints are detectable by having a pre-defined invalid
-digit pointer \textbf{dp} setting.
-
-Once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
-with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.
-
-EXAM,bn_mp_clear.c
-
-The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line @23,a->dp != NULL@)
-checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
-\textbf{NULL} in which case the if statement will evaluate to true.
-
-The digits of the mp\_int are cleared by the for loop (line @25,for@) which assigns a zero to every digit.  Similar to mp\_init()
-the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.
-
-The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
-a standard C library function.  In this case the free() function.  Since free() only deallocates the memory the pointer
-still has to be reset to \textbf{NULL} manually (line @33,NULL@).
-
-Now that the digits have been cleared and deallocated the other members are set to their final values (lines @34,= 0@ and @35,ZPOS@).
-
-\section{Maintenance Algorithms}
-
-The previous sections describe how to initialize and clear an mp\_int structure.  To further support operations
-that are to be performed on mp\_int structures (such as addition and multiplication) the dependent algorithms must be
-able to augment the precision of an mp\_int and
-initialize mp\_ints with differing initial conditions.
-
-These algorithms complete the set of low level algorithms required to work with mp\_int structures in the higher level
-algorithms such as addition, multiplication and modular exponentiation.
-
-\subsection{Augmenting an mp\_int's Precision}
-When storing a value in an mp\_int structure, a sufficient number of digits must be available to accommodate the entire
-result of an operation without loss of precision.  Quite often the size of the array given by the \textbf{alloc} member
-is large enough to simply increase the \textbf{used} digit count.  However, when the size of the array is too small it
-must be re-sized appropriately to accommodate the result.  The mp\_grow algorithm will provide this functionality.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_grow}. \\
-\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
-\textbf{Output}.  $a$ is expanded to accommodate $b$ digits. \\
-\hline \\
-1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
-2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
-3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-4.  Re-allocate the array of digits $a$ to size $v$ \\
-5.  If the allocation failed then return(\textit{MP\_MEM}). \\
-6.  for n from a.alloc to $v - 1$ do  \\
-\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
-7.  $a.alloc \leftarrow v$ \\
-8.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_grow}
-\end{figure}
-
-\textbf{Algorithm mp\_grow.}
-It is ideal to prevent re-allocations from being performed if they are not required (step one).  This is useful to
-prevent mp\_ints from growing excessively in code that erroneously calls mp\_grow.
-
-The requested digit count is padded up to the next multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} (steps two and three).
-This helps prevent many trivial reallocations that would grow an mp\_int by trivially small values.
-
-It is assumed that the reallocation (step four) leaves the lower $a.alloc$ digits of the mp\_int intact.  This is much
-akin to how the \textit{realloc} function from the standard C library works.  Since the newly allocated digits are
-assumed to contain undefined values they are initially set to zero.
-
-EXAM,bn_mp_grow.c
-
-A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @24,alloc@) checks
-if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
-the function skips the re-allocation part thus saving time.
-
-When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
-padded upwards to the 2nd multiple of \textbf{MP\_PREC} larger than \textbf{alloc} (line @25, size@).  The XREALLOC function is used
-to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
-function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
-the re-allocation.  All that is left is to clear the newly allocated digits and return.
-
-Note that the re-allocation result is actually stored in a temporary pointer $tmp$.  This is to allow this function to return
-an error with a valid pointer.  Earlier releases of the library stored the result of XREALLOC into the mp\_int $a$.  That would
-result in a memory leak if XREALLOC ever failed.
-
-\subsection{Initializing Variable Precision mp\_ints}
-Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size
-of input mp\_ints to a given algorithm.  The purpose of algorithm mp\_init\_size is similar to mp\_init except that it
-will allocate \textit{at least} a specified number of digits.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_size}. \\
-\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$. \\
-\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
-\hline \\
-1.  $u \leftarrow b \mbox{ (mod }MP\_PREC\mbox{)}$ \\
-2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-3.  Allocate $v$ digits. \\
-4.  for $n$ from $0$ to $v - 1$ do \\
-\hspace{3mm}4.1  $a_n \leftarrow 0$ \\
-5.  $a.sign \leftarrow MP\_ZPOS$\\
-6.  $a.used \leftarrow 0$\\
-7.  $a.alloc \leftarrow v$\\
-8.  Return(\textit{MP\_OKAY})\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_init\_size}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_size.}
-This algorithm will initialize an mp\_int structure $a$ like algorithm mp\_init with the exception that the number of
-digits allocated can be controlled by the second input argument $b$.  The input size is padded upwards so it is a
-multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} digits.  This padding is used to prevent trivial
-allocations from becoming a bottleneck in the rest of the algorithms.
-
-Like algorithm mp\_init, the mp\_int structure is initialized to a default state representing the integer zero.  This
-particular algorithm is useful if it is known ahead of time the approximate size of the input.  If the approximation is
-correct no further memory re-allocations are required to work with the mp\_int.
-
-EXAM,bn_mp_init_size.c
-
-The number of digits $b$ requested is padded (line @22,MP_PREC@) by first augmenting it to the next multiple of
-\textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result.  If the memory can be successfully allocated the
-mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be
-returned (line @27,return@).
-
-The digits are allocated with the malloc() function (line @27,XMALLOC@) and set to zero afterwards (line @38,for@).  The
-\textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set
-to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines @29,used@, @30,alloc@ and @31,sign@).  If the function
-returns successfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the
-functions to work with.
-
-\subsection{Multiple Integer Initializations and Clearings}
-Occasionally a function will require a series of mp\_int data types to be made available simultaneously.
-The purpose of algorithm mp\_init\_multi is to initialize a variable length array of mp\_int structures in a single
-statement.  It is essentially a shortcut to multiple initializations.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_multi}. \\
-\textbf{Input}.   Variable length array $V_k$ of mp\_int variables of length $k$. \\
-\textbf{Output}.  The array is initialized such that each mp\_int of $V_k$ is ready to use. \\
-\hline \\
-1.  for $n$ from 0 to $k - 1$ do \\
-\hspace{+3mm}1.1.  Initialize the mp\_int $V_n$ (\textit{mp\_init}) \\
-\hspace{+3mm}1.2.  If initialization failed then do \\
-\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n - 1$ do \\
-\hspace{+9mm}1.2.1.1.  Free the mp\_int $V_j$ (\textit{mp\_clear}) \\
-\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
-2.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_multi}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_multi.}
-The algorithm will initialize the array of mp\_int variables one at a time.  If a runtime error has been detected
-(\textit{step 1.2}) all of the previously initialized variables are cleared.  The goal is an ``all or nothing''
-initialization which allows for quick recovery from runtime errors.
-
-EXAM,bn_mp_init_multi.c
-
-This function initializes a variable length list of mp\_int structure pointers.  However, instead of having the mp\_int
-structures in an actual C array they are simply passed as arguments to the function.  This function makes use of the
-``...'' argument syntax of the C programming language.  The list is terminated with a final \textbf{NULL} argument
-appended on the right.
-
-The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function.  A count
-$n$ of successfully initialized mp\_int structures is maintained (line @47,n++@) such that if a failure does occur,
-the algorithm can backtrack and free the previously initialized structures (lines @27,if@ to @46,}@).
-
-
-\subsection{Clamping Excess Digits}
-When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of
-the function instead of checking during the computation.  For example, a multiplication of an $i$ digit number by a
-$j$ digit number produces a result of at most $i + j$ digits.  It is entirely possible that the result is $i + j - 1$
-though, with no final carry into the last position.  However, suppose the destination had to be first expanded
-(\textit{via mp\_grow}) to accommodate $i + j - 1$ digits then further expanded to accommodate the final carry.
-That would be a considerable waste of time since heap operations are relatively slow.
-
-The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function
-terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
-there would be an excess high order zero digit.
-
-For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit
-will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
-accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very
-low the representation is excessively large.
-
-The mp\_clamp algorithm is designed to solve this very problem.  It will trim high-order zeros by decrementing the
-\textbf{used} count until a non-zero most significant digit is found.  Also in this system, zero is considered to be a
-positive number which means that if the \textbf{used} count is decremented to zero, the sign must be set to
-\textbf{MP\_ZPOS}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_clamp}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
-\hline \\
-1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
-\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
-2.  if $a.used = 0$ then do \\
-\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
-\hline \\
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_clamp}
-\end{figure}
-
-\textbf{Algorithm mp\_clamp.}
-As can be expected this algorithm is very simple.  The loop on step one is expected to iterate only once or twice at
-the most.  For example, this will happen in cases where there is not a carry to fill the last position.  Step two fixes the sign for
-when all of the digits are zero to ensure that the mp\_int is valid at all times.
-
-EXAM,bn_mp_clamp.c
-
-Note on line @27,while@ how the test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
-language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is
-important since if the \textbf{used} is zero the test on the right would fetch below the array.  That is obviously
-undesirable.  The parentheses on line @28,a->used@ are used to make sure the \textbf{used} count is decremented and not
-the pointer ``a''.
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
-                     & \\
-$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
-                     & \\
-$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
-                     & encryption when $\beta = 2^{28}$.  \\
-                     & \\
-$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
-                     & \\
-$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
-                     & \\
-\end{tabular}
-
-
-%%%
-% CHAPTER FOUR
-%%%
-
-\chapter{Basic Operations}
-
-\section{Introduction}
-In the previous chapter a series of low level algorithms were established that dealt with initializing and maintaining
-mp\_int structures.  This chapter will discuss another set of seemingly non-algebraic algorithms which will form the low
-level basis of the entire library.  While these algorithms are relatively trivial it is important to understand how they
-work before proceeding since these algorithms will be used almost intrinsically in the following chapters.
-
-The algorithms in this chapter deal primarily with more ``programmer'' related tasks such as creating copies of
-mp\_int structures, assigning small values to mp\_int structures and comparisons of the values mp\_int structures
-represent.
-
-\section{Assigning Values to mp\_int Structures}
-\subsection{Copying an mp\_int}
-Assigning the value that a given mp\_int structure represents to another mp\_int structure shall be known as making
-a copy for the purposes of this text.  The copy of the mp\_int will be a separate entity that represents the same
-value as the mp\_int it was copied from.  The mp\_copy algorithm provides this functionality.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_copy}. \\
-\textbf{Input}.  An mp\_int $a$ and $b$. \\
-\textbf{Output}.  Store a copy of $a$ in $b$. \\
-\hline \\
-1.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{mp\_grow}) \\
-2.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}2.1  $b_{n} \leftarrow a_{n}$ \\
-3.  for $n$ from $a.used$ to $b.used - 1$ do \\
-\hspace{3mm}3.1  $b_{n} \leftarrow 0$ \\
-4.  $b.used \leftarrow a.used$ \\
-5.  $b.sign \leftarrow a.sign$ \\
-6.  return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_copy}
-\end{figure}
-
-\textbf{Algorithm mp\_copy.}
-This algorithm copies the mp\_int $a$ such that upon successful termination of the algorithm the mp\_int $b$ will
-represent the same integer as the mp\_int $a$.  The mp\_int $b$ shall be a complete and distinct copy of the
-mp\_int $a$ meaning that the mp\_int $a$ can be modified and it shall not affect the value of the mp\_int $b$.
-
-If $b$ does not have enough room for the digits of $a$ it must first have its precision augmented via the mp\_grow
-algorithm.  The digits of $a$ are copied over the digits of $b$ and any excess digits of $b$ are set to zero (steps two
-and three).  The \textbf{used} and \textbf{sign} members of $a$ are finally copied over the respective members of
-$b$.
-
-\textbf{Remark.}  This algorithm also introduces a new idiosyncrasy that will be used throughout the rest of the
-text.  The error return codes of other algorithms are not explicitly checked in the pseudo-code presented.  For example, in
-step one of the mp\_copy algorithm the return of mp\_grow is not explicitly checked to ensure it succeeded.  Text space is
-limited so it is assumed that if an algorithm fails it will clear all temporarily allocated mp\_ints and return
-the error code itself.  However, the C code presented will demonstrate all of the error handling logic required to
-implement the pseudo-code.
-
-EXAM,bn_mp_copy.c
-
-Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output
-mp\_int structures passed to a function are one and the same.  For this case it is optimal to return immediately without
-copying digits (line @24,a == b@).
-
-The mp\_int $b$ must have enough digits to accommodate the used digits of the mp\_int $a$.  If $b.alloc$ is less than
-$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines @29,alloc@ to @33,}@).  In order to
-simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits
-of the mp\_ints $a$ and $b$ respectively.  These aliases (lines @42,tmpa@ and @45,tmpb@) allow the compiler to access the digits without first dereferencing the
-mp\_int pointers and then subsequently the pointer to the digits.
-
-After the aliases are established the digits from $a$ are copied into $b$ (lines @48,for@ to @50,}@) and then the excess
-digits of $b$ are set to zero (lines @53,for@ to @55,}@).  Both ``for'' loops make use of the pointer aliases and in
-fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits.  This optimization
-allows the alias to stay in a machine register fairly easily between the two loops.
-
-\textbf{Remarks.}  The use of pointer aliases is an implementation methodology first introduced in this function that will
-be used considerably in other functions.  Technically, a pointer alias is simply a short hand alias used to lower the
-number of pointer dereferencing operations required to access data.  For example, a for loop may resemble
-
-\begin{alltt}
-for (x = 0; x < 100; x++) \{
-    a->num[4]->dp[x] = 0;
-\}
-\end{alltt}
-
-This could be re-written using aliases as
-
-\begin{alltt}
-mp_digit *tmpa;
-tmpa = a->num[4]->dp;
-for (x = 0; x < 100; x++) \{
-    *tmpa++ = 0;
-\}
-\end{alltt}
-
-In this case an alias is used to access the
-array of digits within an mp\_int structure directly.  It may seem that a pointer alias is strictly not required
-as a compiler may optimize out the redundant pointer operations.  However, there are two dominant reasons to use aliases.
-
-The first reason is that most compilers will not effectively optimize pointer arithmetic.  For example, some optimizations
-may work for the Microsoft Visual C++ compiler (MSVC) and not for the GNU C Compiler (GCC).  Also some optimizations may
-work for GCC and not MSVC.  As such it is ideal to find a common ground for as many compilers as possible.  Pointer
-aliases optimize the code considerably before the compiler even reads the source code which means the end compiled code
-stands a better chance of being faster.
-
-The second reason is that pointer aliases often can make an algorithm simpler to read.  Consider the first ``for''
-loop of the function mp\_copy() re-written to not use pointer aliases.
-
-\begin{alltt}
-    /* copy all the digits */
-    for (n = 0; n < a->used; n++) \{
-      b->dp[n] = a->dp[n];
-    \}
-\end{alltt}
-
-Whether this code is harder to read depends strongly on the individual.  However, it is quantifiably slightly more
-complicated as there are four variables within the statement instead of just two.
-
-\subsubsection{Nested Statements}
-Another commonly used technique in the source routines is that certain sections of code are nested.  This is used in
-particular with the pointer aliases to highlight code phases.  For example, a Comba multiplier (discussed in chapter six)
-will typically have three different phases.  First the temporaries are initialized, then the columns calculated and
-finally the carries are propagated.  In this example the middle column production phase will typically be nested as it
-uses temporary variables and aliases the most.
-
-The nesting also simplifies the source code as variables that are nested are only valid for their scope.  As a result
-the various temporary variables required do not propagate into other sections of code.
-
-
-\subsection{Creating a Clone}
-Another common operation is to make a local temporary copy of an mp\_int argument.  To initialize an mp\_int
-and then copy another existing mp\_int into the newly initialized mp\_int will be known as creating a clone.  This is
-useful within functions that need to modify an argument but do not wish to actually modify the original copy.  The
-mp\_init\_copy algorithm has been designed to help perform this task.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_copy}. \\
-\textbf{Input}.   An mp\_int $a$ and $b$\\
-\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
-\hline \\
-1.  Init $a$.  (\textit{mp\_init}) \\
-2.  Copy $b$ to $a$.  (\textit{mp\_copy}) \\
-3.  Return the status of the copy operation. \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_copy}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_copy.}
-This algorithm will initialize an mp\_int variable and copy another previously initialized mp\_int variable into it.  As
-such this algorithm will perform two operations in one step.
-
-EXAM,bn_mp_init_copy.c
-
-This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that
-\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
-and \textbf{a} will be left intact.
-
-\section{Zeroing an Integer}
-Resetting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
-perform this task.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_zero}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Zero the contents of $a$ \\
-\hline \\
-1.  $a.used \leftarrow 0$ \\
-2.  $a.sign \leftarrow$ MP\_ZPOS \\
-3.  for $n$ from 0 to $a.alloc - 1$ do \\
-\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_zero}
-\end{figure}
-
-\textbf{Algorithm mp\_zero.}
-This algorithm simply resets an mp\_int to the default state.
-
-EXAM,bn_mp_zero.c
-
-After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the
-\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
-
-\section{Sign Manipulation}
-\subsection{Absolute Value}
-With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
-the absolute value of an mp\_int.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_abs}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Computes $b = \vert a \vert$ \\
-\hline \\
-1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
-2.  If the copy failed return(\textit{MP\_MEM}). \\
-3.  $b.sign \leftarrow MP\_ZPOS$ \\
-4.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_abs}
-\end{figure}
-
-\textbf{Algorithm mp\_abs.}
-This algorithm computes the absolute value of an mp\_int input.  First it copies $a$ over $b$.  This is an example of an
-algorithm where the check in mp\_copy that determines if the source and destination are equal proves useful.  This allows,
-for instance, the developer to pass the same mp\_int as the source and destination to this function without additional
-logic to handle it.
-
-EXAM,bn_mp_abs.c
-
-This fairly trivial algorithm first eliminates non--required duplications (line @27,a != b@) and then sets the
-\textbf{sign} flag to \textbf{MP\_ZPOS}.
-
-\subsection{Integer Negation}
-With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
-the negative of an mp\_int input.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_neg}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Computes $b = -a$ \\
-\hline \\
-1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
-2.  If the copy failed return(\textit{MP\_MEM}). \\
-3.  If $a.used = 0$ then return(\textit{MP\_OKAY}). \\
-4.  If $a.sign = MP\_ZPOS$ then do \\
-\hspace{3mm}4.1  $b.sign = MP\_NEG$. \\
-5.  else do \\
-\hspace{3mm}5.1  $b.sign = MP\_ZPOS$. \\
-6.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_neg}
-\end{figure}
-
-\textbf{Algorithm mp\_neg.}
-This algorithm computes the negation of an input.  First it copies $a$ over $b$.  If $a$ has no used digits then
-the algorithm returns immediately.  Otherwise it flips the sign flag and stores the result in $b$.  Note that if
-$a$ had no digits then it must be positive by definition.  Had step three been omitted then the algorithm would return
-zero as negative.
-
-EXAM,bn_mp_neg.c
-
-Like mp\_abs() this function avoids non--required duplications (line @21,a != b@) and then sets the sign.  We
-have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
-then the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
-
-\section{Small Constants}
-\subsection{Setting Small Constants}
-Often an mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_set}. \\
-\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
-\textbf{Output}.  Make $a$ equivalent to $b$ \\
-\hline \\
-1.  Zero $a$ (\textit{mp\_zero}). \\
-2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
-3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
-                              1 &  \mbox{if }a_0 > 0 \\
-                              0 &  \mbox{if }a_0 = 0
-                              \end{array} \right .$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_set}
-\end{figure}
-
-\textbf{Algorithm mp\_set.}
-This algorithm sets an mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
-single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
-
-EXAM,bn_mp_set.c
-
-First we zero (line @21,mp_zero@) the mp\_int to make sure that the other members are initialized for a
-small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
-is zero.  Next we set the digit and reduce it modulo $\beta$ (line @22,MP_MASK@).  After this step we have to
-check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
-to zero.
-
-We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with
-$2^k - 1$ will perform the same operation.
-
-One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses
-this function should take that into account.  Only trivially small constants can be set using this function.
-
-\subsection{Setting Large Constants}
-To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is ideal.  It accepts a ``long''
-data type as input and will always treat it as a 32-bit integer.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_set\_int}. \\
-\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
-\textbf{Output}.  Make $a$ equivalent to $b$ \\
-\hline \\
-1.  Zero $a$ (\textit{mp\_zero}) \\
-2.  for $n$ from 0 to 7 do \\
-\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\
-\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
-\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
-\hspace{3mm}2.4  $a.used \leftarrow a.used + 1$ \\
-3.  Clamp excess used digits (\textit{mp\_clamp}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_set\_int}
-\end{figure}
-
-\textbf{Algorithm mp\_set\_int.}
-The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the
-mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions.  In step 2.2 the
-next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is
-incremented to reflect the addition.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
-zero digits used and the newly added four bits would be ignored.
-
-Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
-
-EXAM,bn_mp_set_int.c
-
-This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
-addition on line @38,a->used@ ensures that the newly added in bits are added to the number of digits.  While it may not
-seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line @27,mp_mul_2d@
-as well as the  call to mp\_clamp() on line @40,mp_clamp@.  Both functions will clamp excess leading digits which keeps
-the number of used digits low.
-
-\section{Comparisons}
-\subsection{Unsigned Comparisons}
-Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
-to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
-to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude
-positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.
-
-The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
-mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the
-signs are known to agree in advance.
-
-To facilitate working with the results of the comparison functions three constants are required.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|r|l|}
-\hline \textbf{Constant} & \textbf{Meaning} \\
-\hline \textbf{MP\_GT} & Greater Than \\
-\hline \textbf{MP\_EQ} & Equal To \\
-\hline \textbf{MP\_LT} & Less Than \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Comparison Return Codes}
-\end{figure}
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_cmp\_mag}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
-\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
-\hline \\
-1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
-2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
-3.  for n from $a.used - 1$ to 0 do \\
-\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
-\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
-4.  Return(\textit{MP\_EQ}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_cmp\_mag}
-\end{figure}
-
-\textbf{Algorithm mp\_cmp\_mag.}
-By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
-\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.
-Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.
-If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit.
-
-By step three both inputs must have the same number of digits so it is safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
-the zero'th digit.  If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}.
-
-EXAM,bn_mp_cmp_mag.c
-
-The two if statements (lines @24,if@ and @28,if@) compare the number of digits in the two inputs.  These two are
-performed before all of the digits are compared since it is a very cheap test to perform and can potentially save
-considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be
-smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
-
-
-
-\subsection{Signed Comparisons}
-Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude
-comparison a trivial signed comparison algorithm can be written.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_cmp}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
-\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
-\hline \\
-1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
-2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
-3.  if $a.sign = MP\_NEG$ then \\
-\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
-4.  Otherwise \\
-\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_cmp}
-\end{figure}
-
-\textbf{Algorithm mp\_cmp.}
-The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate
-comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step
-three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then
-$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
-
-EXAM,bn_mp_cmp.c
-
-The two if statements (lines @22,if@ and @26,if@) perform the initial sign comparison.  If the signs are not equal then whichever
-has the positive sign is larger.   The inputs are compared (line @30,if@) based on magnitudes.  If the signs were both
-negative then the unsigned comparison is performed in the opposite direction (line @31,mp_cmp_mag@).  Otherwise, the signs are assumed to
-be both positive and a forward direction unsigned comparison is performed.
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
-                     & \\
-$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
-                     & of two random integers (of equal magnitude) before a difference is found. \\
-                     & \\
-$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
-                     & on the observations made in the previous problem. \\
-                     &
-\end{tabular}
-
-\chapter{Basic Arithmetic}
-\section{Introduction}
-At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been
-established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These
-algorithms make use of the lower level algorithms and are the crucial building blocks for the multiplication algorithms.  It is very important
-that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms
-which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.
-
-MARK,SHIFTS
-All of the algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right
-logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real
-number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal two places to the right (\textit{multiplying by $\beta^2 = 10^2$}).
-Algebraically a binary logical shift is equivalent to a division or multiplication by a power of two.
-For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
-
-One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
-from the number.  For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$.  However, with a logical shift the
-result is $110_2$.
-
-\section{Addition and Subtraction}
-In common two's complement fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
-$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.
-As a result subtraction can be performed with a trivial series of logical operations and an addition.
-
-However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
-sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or
-subtraction algorithms with the sign fixed up appropriately.
-
-The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
-the integers respectively.
-
-\subsection{Low Level Addition}
-An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the
-trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.
-Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
-
-\newpage
-\begin{figure}[!here]
-\begin{center}
-\begin{small}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_add}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
-\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
-\hline \\
-1.  if $a.used > b.used$ then \\
-\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
-\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
-\hspace{+3mm}1.3  $x   \leftarrow a$ \\
-2.  else  \\
-\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
-\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
-\hspace{+3mm}2.3  $x   \leftarrow b$ \\
-3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
-4.  $oldused \leftarrow c.used$ \\
-5.  $c.used \leftarrow max + 1$ \\
-6.  $u \leftarrow 0$ \\
-7.  for $n$ from $0$ to $min - 1$ do \\
-\hspace{+3mm}7.1  $c_n \leftarrow a_n + b_n + u$ \\
-\hspace{+3mm}7.2  $u \leftarrow c_n >> lg(\beta)$ \\
-\hspace{+3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-8.  if $min \ne max$ then do \\
-\hspace{+3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
-\hspace{+6mm}8.1.1  $c_n \leftarrow x_n + u$ \\
-\hspace{+6mm}8.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
-\hspace{+6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $c_{max} \leftarrow u$ \\
-10.  if $oldused > max$ then \\
-\hspace{+3mm}10.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
-\hspace{+6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Algorithm s\_mp\_add}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_add.}
-This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.
-Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the
-MIX pseudo  machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
-
-The first thing that has to be accomplished is to sort out which of the two inputs is the largest.  The addition logic
-will simply add all of the smallest input to the largest input and store that first part of the result in the
-destination.  Then it will apply a simpler addition loop to excess digits of the larger input.
-
-The first two steps will handle sorting the inputs such that $min$ and $max$ hold the digit counts of the two
-inputs.  The variable $x$ will be an mp\_int alias for the largest input or the second input $b$ if they have the
-same number of digits.  After the inputs are sorted the destination $c$ is grown as required to accommodate the sum
-of the two inputs.  The original \textbf{used} count of $c$ is copied and set to the new used count.
-
-At this point the first addition loop will go through as many digit positions as both inputs have.  The carry
-variable $u$ is set to zero outside the loop.  Inside the loop an ``addition'' step requires three statements to produce
-one digit of the sum.  First
-two digits from $a$ and $b$ are added together along with the carry $u$.  The carry of this step is extracted and stored
-in $u$ and finally the digit of the result $c_n$ is truncated within the range $0 \le c_n < \beta$.
-
-Now all of the digit positions that both inputs have in common have been exhausted.  If $min \ne max$ then $x$ is an alias
-for one of the inputs that has more digits.  A simplified addition loop is then used to essentially copy the remaining digits
-and the carry to the destination.
-
-The final carry is stored in $c_{max}$ and digits above $max$ up to $oldused$ are zeroed which completes the addition.
-
-
-EXAM,bn_s_mp_add.c
-
-We first sort (lines @27,if@ to @35,}@) the inputs based on magnitude and determine the $min$ and $max$ variables.
-Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
-grow the destination (@37,init@ to @42,}@) to ensure that it can accommodate the result of the addition.
-
-Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on
-lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively.  These aliases are used to ensure the
-compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
-
-The initial carry $u$ will be cleared (line @65,u = 0@), note that $u$ is of type mp\_digit which ensures type
-compatibility within the implementation.  The initial addition (line @66,for@ to @75,}@) adds digits from
-both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
-(line @81,for@ to @90,}@) adds the remaining digits from the larger of the two inputs.  The addition is finished
-with the final carry being stored in $tmpc$ (line @94,tmpc++@).  Note the ``++'' operator within the same expression.
-After line @94,tmpc++@, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop (line @97,for@ to @99,}@) which sets any old upper digits to zero.
-
-\subsection{Low Level Subtraction}
-The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principal difference is that the
-unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must
-be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.
-This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
-
-MARK,GAMMA
-
-For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
-the range $0 \le x < 2\beta$ for the algorithms to work correctly.  However, it is allowable that a mp\_digit represent a larger range of values.  For
-this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a
-mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).
-
-For example, the default for LibTomMath is to use an ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
-
-\newpage\begin{figure}[!here]
-\begin{center}
-\begin{small}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_sub}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
-\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
-\hline \\
-1.  $min \leftarrow b.used$ \\
-2.  $max \leftarrow a.used$ \\
-3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{mp\_grow}) \\
-4.  $oldused \leftarrow c.used$ \\
-5.  $c.used \leftarrow max$ \\
-6.  $u \leftarrow 0$ \\
-7.  for $n$ from $0$ to $min - 1$ do \\
-\hspace{3mm}7.1  $c_n \leftarrow a_n - b_n - u$ \\
-\hspace{3mm}7.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
-\hspace{3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-8.  if $min < max$ then do \\
-\hspace{3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
-\hspace{6mm}8.1.1  $c_n \leftarrow a_n - u$ \\
-\hspace{6mm}8.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
-\hspace{6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-9. if $oldused > max$ then do \\
-\hspace{3mm}9.1  for $n$ from $max$ to $oldused - 1$ do \\
-\hspace{6mm}9.1.1  $c_n \leftarrow 0$ \\
-10. Clamp excess digits of $c$.  (\textit{mp\_clamp}). \\
-11. Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Algorithm s\_mp\_sub}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_sub.}
-This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
-passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
-algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
-of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
-
-The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$.  Steps 1 and 2
-set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at
-most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and
-set to the maximal count for the operation.
-
-The subtraction loop that begins on step seven is essentially the same as the addition loop of algorithm s\_mp\_add except single precision
-subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction
-loops.  Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry.
-
-For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$.  The least significant bit will force a carry upwards to
-the third bit which will be set to zero after the borrow.  After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain.  When the
-third bit of $0101_2$ is subtracted from the result it will cause another carry.  In this case though the carry will be forced to propagate all the
-way to the most significant bit.
-
-Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most
-significant bit.  Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
-is needed is a single zero or one bit for the carry.  Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the
-carry.  This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.
-
-If $b$ has a smaller magnitude than $a$ then step 8 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
-9 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
-
-EXAM,bn_s_mp_sub.c
-
-Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded
-(lines @24,min@ and @25,max@).  In reality the $min$ and $max$ variables are only aliases and are only
-used to make the source code easier to read.  Again the pointer alias optimization is used
-within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
-(lines @42,tmpa@, @43,tmpb@ and @44,tmpc@) for $a$, $b$ and $c$ respectively.
-
-The first subtraction loop (lines @47,u = 0@ through @61,}@) subtracts digits from both inputs until the smaller of
-the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward''
-method of extracting the carry (line @57, >>@).  The traditional method for extracting the carry would be to shift
-by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of
-the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry
-extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the
-most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This
-optimization only works on two's complement machines which is a safe assumption to make.
-
-If $a$ has a larger magnitude than $b$ an additional loop (lines @64,for@ through @73,}@) is required to propagate
-the carry through $a$ and copy the result to $c$.
-
-\subsection{High Level Addition}
-Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
-established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data
-types.
-
-Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign}
-flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
-
-\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_add}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
-\textbf{Output}.  The signed addition $c = a + b$. \\
-\hline \\
-1.  if $a.sign = b.sign$ then do \\
-\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
-\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
-2.  else do \\
-\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag})  \\
-\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
-\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
-\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_add}
-\end{figure}
-
-\textbf{Algorithm mp\_add.}
-This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from
-either \cite{TAOCPV2} or \cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly
-straightforward but restricted since subtraction can only produce positive results.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|}
-\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
-\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
-\hline &&&&\\
-
-\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
-\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
-
-\hline &&&&\\
-
-\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
-
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Addition Guide Chart}
-\label{fig:AddChart}
-\end{figure}
-
-Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three
-specific cases need to be handled.  The return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are
-forwarded to step three to check for errors.  This simplifies the description of the algorithm considerably and best
-follows how the implementation actually was achieved.
-
-Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
-s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
-to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.
-
-For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
-produce a result of $-0$.  However, since the sign is set first and then the unsigned subtraction is performed, the subsequent usage of algorithm mp\_clamp
-within algorithm s\_mp\_sub will force $-0$ to become $0$.
-
-EXAM,bn_mp_add.c
-
-The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
-is used to pass result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
-explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is this algorithm will succeed or fail only if the lower
-level functions do so.  Returning their return code is sufficient.
-
-\subsection{High Level Subtraction}
-The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.
-
-\newpage\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_sub}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
-\textbf{Output}.  The signed subtraction $c = a - b$. \\
-\hline \\
-1.  if $a.sign \ne b.sign$ then do \\
-\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
-\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
-2.  else do \\
-\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
-\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
-\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{s\_mp\_sub}) \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
-                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
-                              MP\_NEG  &  \mbox{otherwise} \\
-                              \end{array} \right .$ \\
-\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_sub}
-\end{figure}
-
-\textbf{Algorithm mp\_sub.}
-This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or
-\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  Chart \ref{fig:SubChart} lists the eight possible inputs and
-the operations required.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|}
-\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
-\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
-\hline &&&& \\
-\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline &&&& \\
-\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
-\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Subtraction Guide Chart}
-\label{fig:SubChart}
-\end{figure}
-
-Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the
-algorithm from producing $-a - -a = -0$ as a result.
-
-EXAM,bn_mp_sub.c
-
-Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
-and forward it to the end of the function.  On line @38, != MP_LT@ the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a
-``greater than or equal to'' comparison.
-
-\section{Bit and Digit Shifting}
-MARK,POLY
-It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.
-This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.
-
-In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established.  That is to shift
-the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
-are on radix-$\beta$ digits.
-
-\subsection{Multiplication by Two}
-
-In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient
-operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_2}. \\
-\textbf{Input}.   One mp\_int $a$ \\
-\textbf{Output}.  $b = 2a$. \\
-\hline \\
-1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
-2.  $oldused \leftarrow b.used$ \\
-3.  $b.used \leftarrow a.used$ \\
-4.  $r \leftarrow 0$ \\
-5.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}5.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
-\hspace{3mm}5.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}5.3  $r \leftarrow rr$ \\
-6.  If $r \ne 0$ then do \\
-\hspace{3mm}6.1  $b_{n + 1} \leftarrow r$ \\
-\hspace{3mm}6.2  $b.used \leftarrow b.used + 1$ \\
-7.  If $b.used < oldused - 1$ then do \\
-\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
-\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
-8.  $b.sign \leftarrow a.sign$ \\
-9.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_2}
-\end{figure}
-
-\textbf{Algorithm mp\_mul\_2.}
-This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such
-an algorithm despite the fact it arises often in other algorithms.  The algorithm is set up much like the lower level algorithm s\_mp\_add since
-it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.
-
-Step 1 grows the destination $b$ as required to accommodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
-is set to $a.used$ at step 3.  Only if there is a final carry will the \textbf{used} count require adjustment.
-
-Step 5 is an optimized implementation of the addition loop for this specific case.  That is since the two values being added together
-are the same there is no need to perform two reads from the digits of $a$.  Step 5.1 performs a single precision shift on the current digit $a_n$ to
-obtain what will be the carry for the next iteration.  Step 5.2 calculates the $n$'th digit of the result as a single precision shift of $a_n$ plus
-the previous carry.  Recall from ~SHIFTS~ that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with
-forwarding the carry to the next iteration.
-
-Step 6 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.
-Step 7 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
-
-EXAM,bn_mp_mul_2.c
-
-This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
-is the use of the logical shift operator on line @52,<<@ to perform a single precision doubling.
-
-\subsection{Division by Two}
-A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_2}. \\
-\textbf{Input}.   One mp\_int $a$ \\
-\textbf{Output}.  $b = a/2$. \\
-\hline \\
-1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
-2.  If the reallocation failed return(\textit{MP\_MEM}). \\
-3.  $oldused \leftarrow b.used$ \\
-4.  $b.used \leftarrow a.used$ \\
-5.  $r \leftarrow 0$ \\
-6.  for $n$ from $b.used - 1$ to $0$ do \\
-\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
-\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}6.3  $r \leftarrow rr$ \\
-7.  If $b.used < oldused - 1$ then do \\
-\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
-\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
-8.  $b.sign \leftarrow a.sign$ \\
-9.  Clamp excess digits of $b$.  (\textit{mp\_clamp}) \\
-10.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_2}
-\end{figure}
-
-\textbf{Algorithm mp\_div\_2.}
-This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
-core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
-could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent
-reading past the end of the array of digits.
-
-Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the
-least significant bit not the most significant bit.
-
-EXAM,bn_mp_div_2.c
-
-\section{Polynomial Basis Operations}
-Recall from ~POLY~ that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
-the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single
-place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
-division and Karatsuba multiplication.
-
-Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
-$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
-polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.
-
-\subsection{Multiplication by $x$}
-
-Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one
-degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
-multiplying by the integer $\beta$.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_lshd}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
-\hline \\
-1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
-2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{mp\_grow}). \\
-3.  If the reallocation failed return(\textit{MP\_MEM}). \\
-4.  $a.used \leftarrow a.used + b$ \\
-5.  $i \leftarrow a.used - 1$ \\
-6.  $j \leftarrow a.used - 1 - b$ \\
-7.  for $n$ from $a.used - 1$ to $b$ do \\
-\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
-\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
-\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
-8.  for $n$ from 0 to $b - 1$ do \\
-\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_lshd}
-\end{figure}
-
-\textbf{Algorithm mp\_lshd.}
-This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs
-from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location.  The
-motivation behind this change is due to the way this function is typically used.  Algorithms such as mp\_add store the result in an optionally
-different third mp\_int because the original inputs are often still required.  Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
-typically used on values where the original value is no longer required.  The algorithm will return success immediately if
-$b \le 0$ since the rest of the algorithm is only valid when $b > 0$.
-
-First the destination $a$ is grown as required to accommodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
-the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).
-The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on
-step 8 sets the lower $b$ digits to zero.
-
-\newpage
-FIGU,sliding_window,Sliding Window Movement
-
-EXAM,bn_mp_lshd.c
-
-The if statement (line @24,if@) ensures that the $b$ variable is greater than zero since we do not interpret negative
-shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This eliminates
-the need for an additional variable in the for loop.  The variable $top$ (line @42,top@) is an alias
-for the leading digit while $bottom$ (line @45,bottom@) is an alias for the trailing edge.  The aliases form a
-window of exactly $b$ digits over the input.
-
-\subsection{Division by $x$}
-
-Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_rshd}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
-\hline \\
-1.  If $b \le 0$ then return. \\
-2.  If $a.used \le b$ then do \\
-\hspace{3mm}2.1  Zero $a$.  (\textit{mp\_zero}). \\
-\hspace{3mm}2.2  Return. \\
-3.  $i \leftarrow 0$ \\
-4.  $j \leftarrow b$ \\
-5.  for $n$ from 0 to $a.used - b - 1$ do \\
-\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
-\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
-\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
-6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
-\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
-7.  $a.used \leftarrow a.used - b$ \\
-8.  Return. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_rshd}
-\end{figure}
-
-\textbf{Algorithm mp\_rshd.}
-This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by $\beta^b$ but much quicker since
-it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.
-
-If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
-to the shift count $b$ then it will simply zero the input and return.
-
-After the trivial cases of inputs have been handled the sliding window is set up.  Much like the case of algorithm mp\_lshd a sliding window that
-is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.
-Also the digits are copied from the leading to the trailing edge.
-
-Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
-
-EXAM,bn_mp_rshd.c
-
-The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
-form a sliding window except we copy in the other direction.  After the window (line @59,for (;@) we then zero
-the upper digits of the input to make sure the result is correct.
-
-\section{Powers of Two}
-
-Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For
-example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
-shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.
-
-\subsection{Multiplication by Power of Two}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
-\hline \\
-1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
-2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
-3.  If the reallocation failed return(\textit{MP\_MEM}). \\
-4.  If $b \ge lg(\beta)$ then \\
-\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
-\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
-5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-6.  If $d \ne 0$ then do \\
-\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
-\hspace{3mm}6.2  $r \leftarrow 0$ \\
-\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
-\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
-\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
-\hspace{3mm}6.4  If $r > 0$ then do \\
-\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
-\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
-7.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_mul\_2d.}
-This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
-quickly compute the product.
-
-First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than
-$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$
-left.
-
-After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts
-required.  If it is non-zero a modified shift loop is used to calculate the remaining product.
-Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le d < lg(\beta)$.  The $mask$
-variable is used to extract the upper $d$ bits to form the carry for the next iteration.
-
-This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to
-complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
-
-EXAM,bn_mp_mul_2d.c
-
-The shifting is performed in--place which means the first step (line @24,a != c@) is to copy the input to the
-destination.  We avoid calling mp\_copy() by making sure the mp\_ints are different.  The destination then
-has to be grown (line @31,grow@) to accommodate the result.
-
-If the shift count $b$ is at least $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples
-of $lg(\beta)$, leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits.  Inside the actual shift
-loop (lines @45,if@ to @76,}@) we make use of pre--computed values $shift$ and $mask$.   These are used to
-extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a
-chain between consecutive iterations to propagate the carry.
-
-\subsection{Division by Power of Two}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then do \\
-\hspace{3mm}1.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
-\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{mp\_zero}) \\
-\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
-2.  $c \leftarrow a$ \\
-3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-4.  If $b \ge lg(\beta)$ then do \\
-\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
-5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-6.  If $k \ne 0$ then do \\
-\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
-\hspace{3mm}6.2  $r \leftarrow 0$ \\
-\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
-\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
-\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
-\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
-7.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_div\_2d.}
-This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm
-mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
-by using algorithm mp\_mod\_2d.
-
-EXAM,bn_mp_div_2d.c
-
-The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally
-ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the
-result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
-the quotient is obtained.
-
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
-the direction of the shifts.
-
-\subsection{Remainder of Division by Power of Two}
-
-The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
-algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mod\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then do \\
-\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{mp\_zero}) \\
-\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $b > a.used \cdot lg(\beta)$ then do \\
-\hspace{3mm}2.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
-\hspace{3mm}2.2  Return the result of step 2.1. \\
-3.  $c \leftarrow a$ \\
-4.  If step 3 failed return(\textit{MP\_MEM}). \\
-5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\
-\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
-6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
-8.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mod\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_mod\_2d.}
-This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the
-result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$
-is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
-
-EXAM,bn_mp_mod_2d.c
-
-We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
-than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
-perform some work to produce the remainder.
-
-Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce
-the number.  First we zero any digits above the last digit in $2^b$ (line @41,for@).  Next we reduce the
-leading digit of both (line @45,&=@) and then mp\_clamp().
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
-                      & in $O(n)$ time. \\
-                      &\\
-$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming  \\
-                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
-                      & up to $64$ with a hamming weight less than three. \\
-                      &\\
-$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
-                      & $2^k - 1$ as well. \\
-                      &\\
-$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
-                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
-                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
-                      & calculation.  \\
-                      & \\
-$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
-                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
-                      & the cost of addition. \\
-                      & \\
-$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
-                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
-                      & \\
-$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
-                      & calculating the result of a signed comparison. \\
-                      &
-\end{tabular}
-
-\chapter{Multiplication and Squaring}
-\section{The Multipliers}
-For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of
-algorithms of any multiple precision integer package.  The set of multiplier algorithms include integer multiplication, squaring and modular reduction
-where in each of the algorithms single precision multiplication is the dominant operation performed.  This chapter will discuss integer multiplication
-and squaring, leaving modular reductions for the subsequent chapter.
-
-The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular
-exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  During a modular
-exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions,
-35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision
-multiplications.
-
-For centuries general purpose multiplication has required a lengthy $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied
-against every digit of the other multiplicand.  Traditional long-hand multiplication is based on this process;  while the techniques can differ the
-overall algorithm used is essentially the same.  Only ``recently'' have faster algorithms been studied.  First Karatsuba multiplication was discovered in
-1962.  This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.
-This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subsequently Fourier Transform based solutions.
-
-\section{Multiplication}
-\subsection{The Baseline Multiplication}
-\label{sec:basemult}
-\index{baseline multiplication}
-Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
-algorithm that school children are taught.  The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision
-multiplications are required.  More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required.  To
-simplify most discussions, it will be assumed that the inputs have comparable number of digits.
-
-The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be
-used.  This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible.    One important
-facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution.  The importance of this
-modification will become evident during the discussion of Barrett modular reduction.  Recall that for a $n$ and $m$ digit input the product
-will be at most $n + m$ digits.  Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.
-
-Recall from ~GAMMA~ the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend the variable set to
-include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The
-constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see ~COMBA~ for more information}).
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
-\hline \\
-1.  If min$(a.used, b.used) < \delta$ then do \\
-\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}).  \\
-\hspace{3mm}1.2  Return the result of step 1.1 \\
-\\
-Allocate and initialize a temporary mp\_int. \\
-2.  Init $t$ to be of size $digs$ \\
-3.  If step 2 failed return(\textit{MP\_MEM}). \\
-4.  $t.used \leftarrow digs$ \\
-\\
-Compute the product. \\
-5.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}5.1  $u \leftarrow 0$ \\
-\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
-\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
-\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}5.5  if $ix + pb < digs$ then do \\
-\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
-6.  Clamp excess digits of $t$. \\
-7.  Swap $c$ with $t$ \\
-8.  Clear $t$ \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_mul\_digs}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_mul\_digs.}
-This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits.  While it may seem
-a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent
-algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.
-Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the
-inputs.
-
-The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   If the minimum digit count of either
-input is less than $\delta$, then the Comba method may be used instead.    After the Comba method is ruled out, the baseline algorithm begins.  A
-temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to
-compute products when either $a = c$ or $b = c$ without overwriting the inputs.
-
-All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output.  The $pb$ variable
-is given the count of digits to read from $b$ inside the nested loop.  If $pb < 1$ then no more output digits can be produced and the algorithm
-will exit the loop.  The best way to think of the loops are as a series of $pb \times 1$ multiplications.    That is, in each pass of the
-innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.
-
-For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
-visualized in the following table.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|l|}
-\hline   &&          & 5 & 7 & 6 & \\
-\hline   $\times$&&  & 2 & 4 & 1 & \\
-\hline &&&&&&\\
-  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
-  &2 &   3    & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
-  1 & 3 & 8 & 8 & 1 & 6 &   $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Long-Hand Multiplication Diagram}
-\end{figure}
-
-Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate
-count.  That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
-
-Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable.  The multiplication on that step
-is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
-double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
-5.4.1 is propagated through the nested loop.  If the carry was not propagated immediately it would overflow the single precision digit
-$t_{ix+iy}$ and the result would be lost.
-
-At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  The carry does not have to be added to the $ix+pb$'th
-digit since that digit is assumed to be zero at this point.  However, if $ix + pb \ge digs$ the carry is not set as it would make the result
-exceed the precision requested.
-
-EXAM,bn_s_mp_mul_digs.c
-
-First we determine (line @30,if@) if the Comba method can be used first since it's faster.  The conditions for
-using the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than
-\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is
-set to $\delta$ but can be reduced when memory is at a premium.
-
-If we cannot use the Comba method we proceed to set up the baseline routine.  We allocate the destination mp\_int
-$t$ (line @36,init@) to the exact size of the output to avoid further re--allocations.  At this point we now
-begin the $O(n^2)$ loop.
-
-This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
-digits as output.  In each iteration of the outer loop the $pb$ variable is set (line @48,MIN@) to the maximum
-number of inner loop iterations.
-
-Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
-carry from the previous iteration.  A particularly important observation is that most modern optimizing
-C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that
-is required for the product.  In x86 terms for example, this means using the MUL instruction.
-
-Each digit of the product is stored in turn (line @68,tmpt@) and the carry propagated (line @71,>>@) to the
-next iteration.
-
-\subsection{Faster Multiplication by the ``Comba'' Method}
-MARK,COMBA
-
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be
-computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement
-in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G.
-Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an
-interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written
-five years before.
-
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight
-twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products
-are produced then added together to form the final result.  In the baseline algorithm the columns are added together
-after each iteration to get the result instantaneously.
-
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at
-the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated
-after the nested loop to reduce the amount of work required. Succinctly the first step of the algorithm is to compute
-the product vector $\vec x$ as follows.
-
-\begin{equation}
-\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
-\end{equation}
-
-Where $\vec x_n$ is the $n'th$ column of the output vector.  Consider the following example which computes the vector $\vec x$ for the multiplication
-of $576$ and $241$.
-
-\newpage\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|}
-  \hline &          & 5 & 7 & 6 & First Input\\
-  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
-\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
-                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
-   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
-\hline 10 & 34 & 45 & 31 & 6 & Final Result \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Comba Multiplication Diagram}
-\end{figure}
-
-At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.
-Now the columns must be fixed by propagating the carry upwards.  The resultant vector will have one extra dimension over the input vector which is
-congruent to adding a leading zero digit.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Comba Fixup}. \\
-\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
-\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
-\hline \\
-1.  for $n$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
-\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
-2.  Return($\vec x$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Comba Fixup}
-\end{figure}
-
-With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case
-$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
-efficient than the baseline algorithm why not simply always use this algorithm?
-
-\subsubsection{Column Weight.}
-At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output
-independently.  A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix
-the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
-three single precision multiplications.  If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
-an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit inputs the maximum weight of any column is
-min$(m, n)$ which is fairly obvious.
-
-The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
-from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
-two quantities we must not violate the following
-
-\begin{equation}
-k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
-\end{equation}
-
-Which reduces to
-
-\begin{equation}
-k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
-\end{equation}
-
-Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
-found.
-
-\begin{equation}
-k  < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
-\end{equation}
-
-The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 64$ which means that $k$ is bounded by $k < 257$.  In this configuration
-the smaller input may not have more than $256$ digits if the Comba method is to be used.  This is quite satisfactory for most applications since
-$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which is much larger than most public key cryptographic algorithms require.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
-1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
-2.  If step 1 failed return(\textit{MP\_MEM}).\\
-\\
-3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
-\\
-4.  $\_ \hat W \leftarrow 0$ \\
-5.  for $ix$ from 0 to $pa - 1$ do \\
-\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
-\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
-\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
-\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
-\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iz}b_{ty-iz}$ \\
-\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
-\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
-\\
-6.  $oldused \leftarrow c.used$ \\
-7.  $c.used \leftarrow digs$ \\
-8.  for $ix$ from $0$ to $pa$ do \\
-\hspace{3mm}8.1  $c_{ix} \leftarrow W_{ix}$ \\
-9.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
-\hspace{3mm}9.1 $c_{ix} \leftarrow 0$ \\
-\\
-10.  Clamp $c$. \\
-11.  Return MP\_OKAY. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_s\_mp\_mul\_digs}
-\label{fig:COMBAMULT}
-\end{figure}
-
-\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
-
-The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the
-loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
-reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
-
-The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
-$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the distance past $b.used$ the variable
-$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.
-
-The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
-means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
-pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to
-move $ty$ downwards so the equality remains valid.  The $iy$ variable is the number of iterations until
-$tx \ge a.used$ or $ty < 0$ occurs.
-
-After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
-into the next round by dividing $\_ \hat W$ by $\beta$.
-
-To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the
-cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require
-$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method requires only $O(pn^2 + qn)$ time, however in practice,
-the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
-and addition operations in the nested loop in parallel.
-
-EXAM,bn_fast_s_mp_mul_digs.c
-
-As per the pseudo--code we first calculate $pa$ (line @47,MIN@) as the number of digits to output.  Next we begin the outer loop
-to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines @61,tmpx@, @62,tmpy@) to point
-inside the two multiplicands quickly.
-
-The inner loop (lines @70,for@ to @72,}@) of this implementation is where the tradeoff comes into play.  Originally this comba
-implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix
-the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write
-one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth
-is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often
-slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the
-compiler has aliased $\_ \hat W$ to a CPU register.
-
-After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines @75,W[ix]@, @78,>>@) to forward it as
-a carry for the next pass.  After the outer loop we use the final carry (line @82,W[ix]@) as the last digit of the product.
-
-\subsection{Polynomial Basis Multiplication}
-To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
-the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and
-$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required.  In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
-
-The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$.  The coefficients $w_i$ will
-directly yield the desired product when $\beta$ is substituted for $x$.  The direct solution to solve for the $2n + 1$ coefficients
-requires $O(n^2)$ time and would in practice be slower than the Comba technique.
-
-However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown
-coefficients.   This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with
-Gaussian elimination.  This technique is also occasionally referred to as the \textit{interpolation technique} (\textit{references please...}) since in
-effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.
-
-The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible.  However, since
-$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place.  The benefit of this technique stems from the
-fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively.  As a result finding the $2n + 1$ relations required
-by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
-
-When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$.  The $\zeta_0$ term
-is simply the product $W(0) = w_0 = a_0 \cdot b_0$.  The $\zeta_1$ term is the product
-$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$.  The third point $\zeta_{\infty}$ is less obvious but rather
-simple to explain.  The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.
-The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$.  Note that the
-points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
-
-If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points}
-$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ for small values of $q$.  The term ``mirror point'' stems from the fact that
-$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$.  For
-example, when $n = 2$ and $q = 1$ the following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
-
-\begin{eqnarray}
-\zeta_{2}                  = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
-16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
-\end{eqnarray}
-
-Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts.  For example, when $n = 2$ the
-polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$.  This technique of polynomial representation is known as Horner's method.
-
-As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications.  Each multiplication is of
-multiplicands that have $n$ times fewer digits than the inputs.  The asymptotic running time of this algorithm is
-$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}).  Figure~\ref{fig:exponent}
-summarizes the exponents for various values of $n$.
-
-\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Split into $n$ Parts} & \textbf{Exponent}  & \textbf{Notes}\\
-\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
-\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
-\hline $4$ & $1.403677461$ &\\
-\hline $5$ & $1.365212389$ &\\
-\hline $10$ & $1.278753601$ &\\
-\hline $100$ & $1.149426538$ &\\
-\hline $1000$ & $1.100270931$ &\\
-\hline $10000$ & $1.075252070$ &\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
-\label{fig:exponent}
-\end{figure}
-
-At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$.  However, the overhead
-of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
-numbers.
-
-\subsubsection{Cutoff Point}
-The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach.  However,
-the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved.  This makes the
-polynomial basis approach more costly to use with small inputs.
-
-Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}).  There exists a
-point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and
-when $m > y$ the Comba methods are slower than the polynomial basis algorithms.
-
-The exact location of $y$ depends on several key architectural elements of the computer platform in question.
-
-\begin{enumerate}
-\item  The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc.  For example
-on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$.  The higher the ratio in favour of multiplication the lower
-the cutoff point $y$ will be.
-
-\item  The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}).  Generally speaking as the number of splits
-grows the complexity grows substantially.  Ideally solving the system will only involve addition, subtraction and shifting of integers.  This
-directly reflects on the ratio previously mentioned.
-
-\item  To a lesser extent memory bandwidth and function call overheads.  Provided the values are in the processor cache this is less of an
-influence over the cutoff point.
-
-\end{enumerate}
-
-A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met.  For example, if the point
-is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster.  Finding the cutoff points is fairly simple when
-a high resolution timer is available.
-
-\subsection{Karatsuba Multiplication}
-Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for
-general purpose multiplication.  Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with
-light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
-
-\begin{equation}
-f(x) \cdot g(x) = acx^2 + ((a + b)(c + d) - (ac + bd))x + bd
-\end{equation}
-
-Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
-this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns
-out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points
-$\zeta_0$, $\zeta_{\infty}$ and $\zeta_{1}$.  Consider the resultant system of equations.
-
-\begin{center}
-\begin{tabular}{rcrcrcrc}
-$\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$\zeta_{1}$ &      $=$ & $w_2$ & $+$ & $w_1$ & $+$ & $w_0$ \\
-$\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
-\end{tabular}
-\end{center}
-
-By subtracting the first and last equations from the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
-of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
-\hline \\
-1.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
-2.  If step 1 failed then return(\textit{MP\_MEM}). \\
-\\
-Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
-3.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
-4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
-6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
-7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
-\\
-Calculate the three products. \\
-8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
-9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 + x0$ (\textit{mp\_add}) \\
-11.  $x0 \leftarrow y1 + y0$ \\
-12.  $t1 \leftarrow t1 \cdot x0$ \\
-\\
-Calculate the middle term. \\
-13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow t1 - x0$ (\textit{s\_mp\_sub}) \\
-\\
-Calculate the final product. \\
-15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
-16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
-17.  $t1 \leftarrow x0y0 + t1$ \\
-18.  $c \leftarrow t1 + x1y1$ \\
-19.  Clear all of the temporary variables. \\
-20.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_karatsuba\_mul}
-\end{figure}
-
-\textbf{Algorithm mp\_karatsuba\_mul.}
-This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm.  It is loosely based on the description
-from Knuth \cite[pp. 294-295]{TAOCPV2}.
-
-\index{radix point}
-In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen.  The radix point chosen must
-be used for both of the inputs meaning that it must be smaller than the smallest input.  Step 3 chooses the radix point $B$ as half of the
-smallest input \textbf{used} count.  After the radix point is chosen the inputs are split into lower and upper halves.  Step 4 and 5
-compute the lower halves.  Step 6 and 7 compute the upper halves.
-
-After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 + x0$ has been computed.  By using $x0$ instead
-of an additional temporary variable, the algorithm can avoid an additional memory allocation operation.
-
-The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
-
-EXAM,bn_mp_karatsuba_mul.c
-
-The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
-wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
-to handle error recovery with a single piece of code.  Lines @61,if@ to @75,if@ handle initializing all of the temporary variables
-required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
-the temporaries that have been successfully allocated so far.
-
-The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large.  This saves the
-additional reallocation that would have been necessary.  Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
-number of digits for the next section of code.
-
-The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
-to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and
-\textbf{sign} members are copied first.  The first for loop on line @98,for@ copies the lower halves.  Since they are both the same magnitude it
-is simpler to calculate both lower halves in a single loop.  The for loops on lines @104,for@ and @109,for@ calculate the upper halves $x1$ and
-$y1$ respectively.
-
-By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
-
-When line @152,err@ is reached, the algorithm has completed successfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
-the same code that handles errors can be used to clear the temporary variables and return.
-
-\subsection{Toom-Cook $3$-Way Multiplication}
-Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 2$ except that the points  are
-chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce.  Here, the points $\zeta_{0}$,
-$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients
-of the $W(x)$.
-
-With the five relations that Toom-Cook specifies, the following system of equations is formed.
-
-\begin{center}
-\begin{tabular}{rcrcrcrcrcr}
-$\zeta_0$                    & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$  \\
-$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$  \\
-$\zeta_1$                    & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$  \\
-$\zeta_2$                    & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$  \\
-$\zeta_{\infty}$             & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$  \\
-\end{tabular}
-\end{center}
-
-A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
-of two, two divisions by three and one multiplication by three.  All of these $19$ sub-operations require less than quadratic time, meaning that
-the algorithm can be faster than a baseline multiplication.  However, the greater complexity of this algorithm places the cutoff point
-(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toom\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow  a  \cdot  b $ \\
-\hline \\
-Split $a$ and $b$ into three pieces.  E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
-1.  $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
-2.  $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-3.  $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-4.  $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-5.  $b_0 \leftarrow b \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-6.  $b_1 \leftarrow \lfloor b / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-7.  $b_2 \leftarrow \lfloor b / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-\\
-Find the five equations for $w_0, w_1, ..., w_4$. \\
-8.  $w_0 \leftarrow a_0 \cdot b_0$ \\
-9.  $w_4 \leftarrow a_2 \cdot b_2$ \\
-10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
-11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
-12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
-13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
-14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
-15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
-16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
-17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
-\\
-Continued on the next page.\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toom\_mul}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot  b $ \\
-\hline \\
-Now solve the system of equations. \\
-18. $w_1 \leftarrow w_4 - w_1$, $w_3 \leftarrow w_3 - w_0$ \\
-19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
-20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
-21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
-22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
-23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
-24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
-25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
-\\
-Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
-26. for $n$ from $1$ to $4$ do \\
-\hspace{3mm}26.1  $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
-27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
-28. Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toom\_mul (continued)}
-\end{figure}
-
-\textbf{Algorithm mp\_toom\_mul.}
-This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach.  Compared to the Karatsuba multiplication, this
-algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead.  In this
-description, several statements have been compounded to save space.  The intention is that the statements are executed from left to right across
-any given step.
-
-The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively.  From these smaller
-integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
-
-The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively.  The relations $w_1, w_2$ and $w_3$ correspond
-to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{1}$ and $\zeta_{2}$ respectively.  These are found using logical shifts to independently find
-$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
-
-After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients
-$w_1, w_2$ and $w_3$ to be isolated.  The steps 18 through 25 perform the system reduction required as previously described.  Each step of
-the reduction represents the comparable matrix operation that would be performed had this been performed by pencil.  For example, step 18 indicates
-that row $1$ must be subtracted from row $4$ and simultaneously row $0$ subtracted from row $3$.
-
-Once the coefficients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known.  By substituting $\beta^{k}$ for $x$, the integer
-result $a \cdot b$ is produced.
-
-EXAM,bn_mp_toom_mul.c
-
-The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very
-large numbers.  For example, a 10,000 digit multiplication takes approximately 99,282,205 fewer single precision multiplications with
-Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
-algorithm is not practical as Karatsuba has a much lower cutoff point.
-
-First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines @40,mod@ to @69,rshd@) with
-combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similarly
-for $b$.
-
-Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
-we get those out of the way first (lines @72,mul@ and @77,mul@).  Next we compute $w1, w2$ and $w3$ using Horner's method.
-
-After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
-straight forward.
-
-\subsection{Signed Multiplication}
-Now that algorithms to handle multiplications of every useful dimension have been developed, a rather simple finishing touch is required.  So far all
-of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot b$ \\
-\hline \\
-1.  If $a.sign = b.sign$ then \\
-\hspace{3mm}1.1  $sign = MP\_ZPOS$ \\
-2.  else \\
-\hspace{3mm}2.1  $sign = MP\_ZNEG$ \\
-3.  If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then  \\
-\hspace{3mm}3.1  $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
-4.  else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
-\hspace{3mm}4.1  $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
-5.  else \\
-\hspace{3mm}5.1  $digs \leftarrow a.used + b.used + 1$ \\
-\hspace{3mm}5.2  If $digs < MP\_WARRAY$ and min$(a.used, b.used) \le \delta$ then \\
-\hspace{6mm}5.2.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs.  \\
-\hspace{3mm}5.3  else \\
-\hspace{6mm}5.3.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs.  \\
-6.  $c.sign \leftarrow sign$ \\
-7.  Return the result of the unsigned multiplication performed. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul}
-\end{figure}
-
-\textbf{Algorithm mp\_mul.}
-This algorithm performs the signed multiplication of two inputs.  It will make use of any of the three unsigned multiplication algorithms
-available when the input is of appropriate size.  The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
-s\_mp\_mul\_digs will clear it.
-
-EXAM,bn_mp_mul.c
-
-The implementation is rather simplistic and is not particularly noteworthy.  Line @22,?@ computes the sign of the result using the ``?''
-operator from the C programming language.  Line @37,<<@ computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.
-
-\section{Squaring}
-\label{sec:basesquare}
-
-Squaring is a special case of multiplication where both multiplicands are equal.  At first it may seem like there is no significant optimization
-available but in fact there is.  Consider the multiplication of $576$ against $241$.  In total there will be nine single precision multiplications
-performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot  6$, $2 \cdot 7$ and $2 \cdot 5$.  Now consider
-the multiplication of $123$ against $123$.  The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$,
-$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$.  On closer inspection some of the products are equivalent.  For example, $3 \cdot 2 = 2 \cdot 3$
-and $3 \cdot 1 = 1 \cdot 3$.
-
-For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
-required for multiplication.  The following diagram gives an example of the operations required.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{ccccc|c}
-&&1&2&3&\\
-$\times$ &&1&2&3&\\
-\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
-       & $2 \cdot 1$  & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
-         $1 \cdot 1$  & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
-\end{tabular}
-\end{center}
-\caption{Squaring Optimization Diagram}
-\end{figure}
-
-MARK,SQUARE
-Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious.  For the purposes of this discussion let $x$
-represent the number being squared.  The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.
-
-The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product.  Every non-square term of a column will
-appear twice hence the name ``double product''.  Every odd column is made up entirely of double products.  In fact every column is made up of double
-products and at most one square (\textit{see the exercise section}).
-
-The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row,
-occurs at column $2k + 1$.  For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero.
-Column two of row one is a square and column three is the first unique column.
-
-\subsection{The Baseline Squaring Algorithm}
-The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
-will not handle.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits.  (\textit{mp\_init\_size}) \\
-2.  If step 1 failed return(\textit{MP\_MEM}) \\
-3.  $t.used \leftarrow 2 \cdot a.used + 1$ \\
-4.  For $ix$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}Calculate the square. \\
-\hspace{3mm}4.1  $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\
-\hspace{3mm}4.2  $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}Calculate the double products after the square. \\
-\hspace{3mm}4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}4.4  For $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.4.1  $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\
-\hspace{6mm}4.4.2  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}4.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}Set the last carry. \\
-\hspace{3mm}4.5  While $u > 0$ do \\
-\hspace{6mm}4.5.1  $iy \leftarrow iy + 1$ \\
-\hspace{6mm}4.5.2  $\hat r \leftarrow t_{ix + iy} + u$ \\
-\hspace{6mm}4.5.3  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}4.5.4  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-5.  Clamp excess digits of $t$.  (\textit{mp\_clamp}) \\
-6.  Exchange $b$ and $t$. \\
-7.  Clear $t$ (\textit{mp\_clear}) \\
-8.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_sqr.}
-This algorithm computes the square of an input using the three observations on squaring.  It is based fairly faithfully on  algorithm 14.16 of HAC
-\cite[pp.596-597]{HAC}.  Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring.  This allows the
-destination mp\_int to be the same as the source mp\_int.
-
-The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while
-the inner loop computes the columns of the partial result.  Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate
-the carry and compute the double products.
-
-The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this
-very algorithm.  The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that
-when it is multiplied by two, it can be properly represented by a mp\_word.
-
-Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial
-results calculated so far.  This involves expensive carry propagation which will be eliminated in the next algorithm.
-
-EXAM,bn_s_mp_sqr.c
-
-Inside the outer loop (line @32,for@) the square term is calculated on line @35,r =@.  The carry (line @42,>>@) has been
-extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized
-(lines @45,tmpx@ and @48,tmpt@) to simplify the inner loop.  The doubling is performed using two
-additions (line @57,r + r@) since it is usually faster than shifting, if not at least as fast.
-
-The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
-get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
-square a number.
-
-\subsection{Faster Squaring by the ``Comba'' Method}
-A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
-drawback that it must double the product inside the inner loop as well.  As for multiplication, the Comba technique can be used to eliminate these
-performance hazards.
-
-The first obvious solution is to make an array of mp\_words which will hold all of the columns.  This will indeed eliminate all of the carry
-propagation operations from the inner loop.  However, the inner product must still be doubled $O(n^2)$ times.  The solution stems from the simple fact
-that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
-$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.
-
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two
-mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and
-carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
-1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
-2.  If step 1 failed return(\textit{MP\_MEM}). \\
-\\
-3.  $pa \leftarrow 2 \cdot a.used$ \\
-4.  $\hat W1 \leftarrow 0$ \\
-5.  for $ix$ from $0$ to $pa - 1$ do \\
-\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
-\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
-\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
-\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
-\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
-\hspace{3mm}5.6  for $iz$ from $0$ to $iy - 1$ do \\
-\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
-\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
-\hspace{3mm}5.8  if $ix$ is even then \\
-\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
-\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
-\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
-\\
-6.  $oldused \leftarrow b.used$ \\
-7.  $b.used \leftarrow 2 \cdot a.used$ \\
-8.  for $ix$ from $0$ to $pa - 1$ do \\
-\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
-9.  for $ix$ from $pa$ to $oldused - 1$ do \\
-\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
-10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_s\_mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm
-s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.
-This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
-
-First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
-products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
-additional MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal to
-$a_5 \cdot a_3$.  In the multiplication case both products would be computed, as $5 < a.used$ and $3 \ge 0$ is maintained.  Since we double the sum
-of the products just outside the inner loop we have to avoid doing this.  This is also a good thing since we perform
-fewer multiplications and the routine ends up being faster.
-
-Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
-only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
-
-EXAM,bn_fast_s_mp_sqr.c
-
-This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for
-the special case of squaring.
-
-\subsection{Polynomial Basis Squaring}
-The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
-is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$.  Instead of performing $2n + 1$
-multiplications to find the $\zeta$ relations, squaring operations are performed instead.
-
-\subsection{Karatsuba Squaring}
-Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.
-Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  The Karatsuba equation can be modified to square a
-number with the following equation.
-
-\begin{equation}
-h(x) = a^2x^2 + \left ((a + b)^2 - (a^2 + b^2) \right )x + b^2
-\end{equation}
-
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a + b)^2$.  As in
-Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of
-$O \left ( n^{lg(3)} \right )$.
-
-If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm
-instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the
-time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff
-point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.
-
-Consider squaring a 200 digit number with this technique.  It will be split into two 100 digit halves which are subsequently squared.
-The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm.  If Karatsuba multiplication
-were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  Initialize the following temporary mp\_ints:  $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
-2.  If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
-\\
-Split the input.  e.g. $a = x1\beta^B + x0$ \\
-3.  $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
-4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-5.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
-\\
-Calculate the three squares. \\
-6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
-7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
-9.  $t1 \leftarrow t1^2$ \\
-\\
-Compute the middle term. \\
-10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t1 - t2$ \\
-\\
-Compute final product. \\
-12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
-13.  $x1x1 \leftarrow x1x1\beta^{2B}$ \\
-14.  $t1 \leftarrow t1 + x0x0$ \\
-15.  $b \leftarrow t1 + x1x1$ \\
-16.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_karatsuba\_sqr}
-\end{figure}
-
-\textbf{Algorithm mp\_karatsuba\_sqr.}
-This algorithm computes the square of an input $a$ using the Karatsuba technique.  This algorithm is very similar to the Karatsuba based
-multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
-
-The radix point for squaring is simply placed exactly in the middle of the digits when the input has an even number of digits, otherwise it is
-placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
-as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
-
-By expanding $\left (x1 + x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $(x0 + x1)^2 - (x1^2 + x0^2)  = 2 \cdot x0 \cdot x1$.
-Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
-this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
-
-Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
-machine clock cycles.}.
-
-\begin{equation}
-5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
-\end{equation}
-
-For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$.  This implies that the following inequality should hold.
-\begin{center}
-\begin{tabular}{rcl}
-${5n \over 3} + 3n^2 + 3n$     & $<$ & ${n \over 3} + 6n^2$ \\
-${5 \over 3} + 3n + 3$     & $<$ & ${1 \over 3} + 6n$ \\
-${13 \over 9}$     & $<$ & $n$ \\
-\end{tabular}
-\end{center}
-
-This results in a cutoff point around $n = 2$.  As a consequence it is actually faster to compute the middle term the ``long way'' on processors
-where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication.  On
-the Intel P4 processor this ratio is 1:29 making this method even more beneficial.  The only common exception is the ARMv4 processor which has a
-ratio of 1:7.  } than simpler operations such as addition.
-
-EXAM,bn_mp_karatsuba_sqr.c
-
-This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and
-shift the input into the two halves.  The loop from line @54,{@ to line @70,}@ has been modified since only one input exists.  The \textbf{used}
-count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
-to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.
-
-By inlining the copy and shift operations the cutoff point for Karatsuba multiplication can be lowered.  On the Athlon the cutoff point
-is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
-it is actually below the Comba limit (\textit{at 110 digits}).
-
-This routine uses the same error trap coding style as mp\_karatsuba\_mul.  As the temporary variables are initialized errors are
-redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and
-mp\_clears are executed normally.
-
-\subsection{Toom-Cook Squaring}
-The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to
-derive their own Toom-Cook squaring algorithm.
-
-\subsection{High Level Squaring}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  If $a.used \ge TOOM\_SQR\_CUTOFF$ then  \\
-\hspace{3mm}1.1  $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
-2.  else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
-\hspace{3mm}2.1  $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
-3.  else \\
-\hspace{3mm}3.1  $digs \leftarrow 2 \cdot a.used + 1$ \\
-\hspace{3mm}3.2  If $digs < MP\_WARRAY$ and $a.used \le \delta$ then \\
-\hspace{6mm}3.2.1  $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr.  \\
-\hspace{3mm}3.3  else \\
-\hspace{6mm}3.3.1  $b \leftarrow a^2$ using algorithm s\_mp\_sqr.  \\
-4.  $b.sign \leftarrow MP\_ZPOS$ \\
-5.  Return the result of the unsigned squaring performed. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm mp\_sqr.}
-This algorithm computes the square of the input using one of four different algorithms.  If the input is very large and has at least
-\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used.  If
-neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used.
-
-EXAM,bn_mp_sqr.c
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
-                      & that have different number of digits in Karatsuba multiplication. \\
-                      & \\
-$\left [ 2 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
-                      & of double products and at most one square is stated.  Prove this statement. \\
-                      & \\
-$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
-                      & \\
-$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
-                      & \\
-$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
-                      & required for equation $6.7$ to be true.  \\
-                      & \\
-$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
-                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
-                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
-                      &\\
-$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
-                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
-                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
-                      & \\
-\end{tabular}
-
-\chapter{Modular Reduction}
-MARK,REDUCTION
-\section{Basics of Modular Reduction}
-\index{modular residue}
-Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms,
-such as factoring.  Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set.  A number $a$ is said to be \textit{reduced}
-modulo another number $b$ by finding the remainder of the division $a/b$.  Full integer division with remainder is a topic to be covered
-in~\ref{sec:division}.
-
-Modular reduction is equivalent to solving for $r$ in the following equation.  $a = bq + r$ where $q = \lfloor a/b \rfloor$.  The result
-$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$.  In other vernacular $r$ is known as the
-``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
-other forms of residues.
-
-Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions
-is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the
-RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in
-elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular
-exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the
-range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check
-algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoretic problems.
-
-\section{The Barrett Reduction}
-The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
-division.  Barrett's observation was that the residue $c$ of $a$ modulo $b$ is equal to
-
-\begin{equation}
-c = a - b \cdot \lfloor a/b \rfloor
-\end{equation}
-
-Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP\footnote{It is worth noting that Barrett's paper
-targeted the DSP56K processor.}  intuition would indicate the next step would be to replace $a/b$ by a multiplication by the reciprocal.  However,
-DSP intuition on its own will not work as these numbers are considerably larger than the precision of common DSP floating point data types.
-It would take another common optimization to optimize the algorithm.
-
-\subsection{Fixed Point Arithmetic}
-The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers.  Fixed
-point arithmetic would become very popular as it greatly optimized the ``3d-shooter'' genre of games in the mid 1990s when floating point units were
-fairly slow if not unavailable.   The idea behind fixed point arithmetic is to take a normal $k$-bit integer data type and break it into $p$-bit
-integer and a $q$-bit fraction part (\textit{where $p+q = k$}).
-
-In this system a $k$-bit integer $n$ would actually represent $n/2^q$.  For example, with $q = 4$ the integer $n = 37$ would actually represent the
-value $2.3125$.  To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized by
-moving the implied decimal point back to where it should be.  For example, with $q = 4$ to multiply the integers $9$ and $5$ they must be converted
-to fixed point first by multiplying by $2^q$.  Let $a = 9(2^q)$ represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the
-fixed point representation of $5$.  The product $ab$ is equal to $45(2^{2q})$ which when normalized by dividing by $2^q$ produces $45(2^q)$.
-
-This technique became popular since a normal integer multiplication and logical shift right are the only required operations to perform a multiplication
-of two fixed point numbers.  Using fixed point arithmetic, division can be easily approximated by multiplying by the reciprocal.  If $2^q$ is
-equivalent to one then $2^q/b$ is equivalent to the fixed point approximation of $1/b$ using real arithmetic.  Using this fact dividing an integer
-$a$ by another integer $b$ can be achieved with the following expression.
-
-\begin{equation}
-\lfloor a / b \rfloor \mbox{ }\approx\mbox{ } \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
-\end{equation}
-
-The precision of the division is proportional to the value of $q$.  If the divisor $b$ is used frequently as is the case with
-modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift.  Both operations
-are considerably faster than division on most processors.
-
-Consider dividing $19$ by $5$.  The correct result is $\lfloor 19/5 \rfloor = 3$.  With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
-leads to a product of $19$ which when divided by $2^q$ produces $2$.  However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
-the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct.  The value of $2^q$ must be close to or ideally
-larger than the dividend.  In effect if $a$ is the dividend then $q$ should allow $0 \le \lfloor a/2^q \rfloor \le 1$ in order for this approach
-to work correctly.  Plugging this form of division into the original equation the following modular residue equation arises.
-
-\begin{equation}
-c = a - b \cdot \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
-\end{equation}
-
-Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol.  Using the $\mu$
-variable also helps re-inforce the idea that it is meant to be computed once and re-used.
-
-\begin{equation}
-c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
-\end{equation}
-
-Provided that $2^q \ge a$ this algorithm will produce a quotient that is either exactly correct or off by a value of one.  In the context of Barrett
-reduction the value of $a$ is bound by $0 \le a \le (b - 1)^2$ meaning that $2^q \ge b^2$ is sufficient to ensure the reciprocal will have enough
-precision.
-
-Let $n$ represent the number of digits in $b$.  This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and
-another $n^2$ single precision multiplications to find the residue.  In total $3n^2$ single precision multiplications are required to
-reduce the number.
-
-For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$.  Consider reducing
-$a = 180388626447$ modulo $b$ using the above reduction equation.  The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
-By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
-
-\subsection{Choosing a Radix Point}
-Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications.  If that were the best
-that could be achieved a full division\footnote{A division requires approximately $O(2cn^2)$ single precision multiplications for a small value of $c$.
-See~\ref{sec:division} for further details.} might as well be used in its place.  The key to optimizing the reduction is to reduce the precision of
-the initial multiplication that finds the quotient.
-
-Let $a$ represent the number of which the residue is sought.  Let $b$ represent the modulus used to find the residue.  Let $m$ represent
-the number of digits in $b$.  For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$, which is generally true if
-two $m$-digit numbers have been multiplied.  Dividing $a$ by $b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer.  Digits below the
-$m - 1$'th digit of $a$ will contribute at most a value of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$.  Another way to
-express this is by re-writing $a$ as two parts.  If $a' \equiv a \mbox{ (mod }\beta^{m-1}\mbox{)}$ and $a'' = a - a'$ then
-${a \over b} \equiv {{a' + a''} \over b}$ which is equivalent to ${a' \over b} + {a'' \over b}$.  Since $a'$ is bound to be less than $b$ the quotient
-is bound by $0 \le {a' \over b} < 1$.
-
-Since the digits of $a'$ do not contribute much to the quotient the observation is that they might as well be zero.  However, if the digits
-``might as well be zero'' they might as well not be there in the first place.  Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
-with the irrelevant digits trimmed.  Now the modular reduction is trimmed to the almost equivalent equation
-
-\begin{equation}
-c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
-\end{equation}
-
-Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$ where in this case $q$ is a multiple of $lg(\beta)$. Also note that the
-exponent on the divisor when added to the amount $q_0$ was shifted by equals $2m$.  If the optimization had not been performed the divisor
-would have the exponent $2m$ so in the end the exponents do ``add up''. Using the above equation the quotient
-$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most two.  The original fixed point quotient can be off
-by as much as one (\textit{provided the radix point is chosen suitably}) and now that the lower irrelevant digits have been trimmed the quotient
-can be off by an additional value of one for a total of at most two.  This implies that
-$0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  By first subtracting $b$ times the quotient and then conditionally subtracting
-$b$ once or twice the residue is found.
-
-The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
-precision multiplications, ignoring the subtractions required.  In total $2m^2 + m$ single precision multiplications are required to find the residue.
-This is considerably faster than the original attempt.
-
-For example, let $\beta = 10$ represent the radix of the digits.  Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$
-represent the value of which the residue is desired.  In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.
-With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$.  The quotient is then
-$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$.  By subtracting $9993b$ from $a$ the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$
-is found.
-
-\subsection{Trimming the Quotient}
-So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications.  As
-it stands now the algorithm is already fairly fast compared to a full integer division algorithm.  However, there is still room for
-optimization.
-
-After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
-half of the product.  It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision
-multiplications.  If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.
-In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.
-
-The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number.  Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
-multiplications would be required.  Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
-of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.
-
-\subsection{Trimming the Residue}
-After the quotient has been calculated it is used to reduce the input.  As previously noted the algorithm is not exact and it can be off by a small
-multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  If $b$ is $m$ digits then the
-result of the reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
-implicitly zero.
-
-The next optimization arises from this very fact.  Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
-$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed.  Similarly the value of $a$ can
-be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifies the subtraction as well.  A multiplication that produces
-only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.
-
-With both optimizations in place the algorithm is the algorithm Barrett proposed.  It requires $m^2 + 2m - 1$ single precision multiplications which
-is considerably faster than the straightforward $3m^2$ method.
-
-\subsection{The Barrett Algorithm}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor, m = \lceil lg_{\beta}(b) \rceil, (0 \le a < b^2, b > 1)$ \\
-\textbf{Output}.  $a \mbox{ (mod }b\mbox{)}$ \\
-\hline \\
-Let $m$ represent the number of digits in $b$.  \\
-1.  Make a copy of $a$ and store it in $q$.  (\textit{mp\_init\_copy}) \\
-2.  $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
-\\
-Produce the quotient. \\
-3.  $q \leftarrow q \cdot \mu$  (\textit{note: only produce digits at or above $m-1$}) \\
-4.  $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
-\\
-Subtract the multiple of modulus from the input. \\
-5.  $a \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-6.  $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
-7.  $a \leftarrow a - q$ (\textit{mp\_sub}) \\
-\\
-Add $\beta^{m+1}$ if a carry occurred. \\
-8.  If $a < 0$ then (\textit{mp\_cmp\_d}) \\
-\hspace{3mm}8.1  $q \leftarrow 1$ (\textit{mp\_set}) \\
-\hspace{3mm}8.2  $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
-\hspace{3mm}8.3  $a \leftarrow a + q$ \\
-\\
-Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
-9.  While $a \ge b$ do (\textit{mp\_cmp}) \\
-\hspace{3mm}9.1  $a \leftarrow a - b$ \\
-10.  Clear $q$. \\
-11.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce.}
-This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm.  It is loosely based on algorithm 14.42 of HAC
-\cite[pp.  602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}.  The algorithm has several restrictions and assumptions which must
-be adhered to for the algorithm to work.
-
-First the modulus $b$ is assumed to be positive and greater than one.  If the modulus were less than or equal to one then subtracting
-a multiple of it would either accomplish nothing or actually enlarge the input.  The input $a$ must be in the range $0 \le a < b^2$ in order
-for the quotient to have enough precision.  If $a$ is the product of two numbers that were already reduced modulo $b$, this will not be a problem.
-Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish.  The value of $\mu$ is passed as an argument to this
-algorithm and is assumed to be calculated and stored before the algorithm is used.
-
-Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position.  An algorithm called
-$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task.  The algorithm is based on $s\_mp\_mul\_digs$ except that
-instead of stopping at a given level of precision it starts at a given level of precision.  This optimal algorithm can only be used if the number
-of digits in $b$ is very much smaller than $\beta$.
-
-While it is known that
-$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue, so an implied
-``borrow'' from the higher digits might leave a negative result.  After the multiple of the modulus has been subtracted from $a$ the residue must be
-fixed up in case it is negative.  The invariant $\beta^{m+1}$ must be added to the residue to make it positive again.
-
-The while loop at step 9 will subtract $b$ until the residue is less than $b$.  If the algorithm is performed correctly this step is
-performed at most twice, and on average once. However, if $a \ge b^2$ then it will iterate substantially more times than it should.
-
-EXAM,bn_mp_reduce.c
-
-The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
-the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
-in the modulus.  In the source code this is evaluated on lines @36,if@ to @44,}@ where algorithm s\_mp\_mul\_high\_digs is used when it is
-safe to do so.
-
-\subsection{The Barrett Setup Algorithm}
-In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
-future use so that the Barrett algorithm can be used without delay.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_setup}. \\
-\textbf{Input}.   mp\_int $b$ ($b > 1$)  \\
-\textbf{Output}.  $\mu \leftarrow \lfloor \beta^{2m}/b \rfloor$ \\
-\hline \\
-1.  $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot  m}$ (\textit{mp\_2expt}) \\
-2.  $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\
-3.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_setup.}
-This algorithm computes the reciprocal $\mu$ required for Barrett reduction.  First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot  m}$ which
-is equivalent and much faster.  The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$.
-
-EXAM,bn_mp_reduce_setup.c
-
-This simple routine calculates the reciprocal $\mu$ required by Barrett reduction.  Note the extended usage of algorithm mp\_div where the variable
-which would receive the remainder is passed as NULL.  As will be discussed in~\ref{sec:division} the division routine allows both the quotient and the
-remainder to be passed as NULL meaning to ignore the value.
-
-\section{The Montgomery Reduction}
-Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting
-form of reduction in common use.  It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a
-residue times a constant.  However, as perplexing as this may sound the algorithm is relatively simple and very efficient.
-
-Throughout this entire section the variable $n$ will represent the modulus used to form the residue.  As will be discussed shortly the value of
-$n$ must be odd.  The variable $x$ will represent the quantity of which the residue is sought.  Similar to the Barrett algorithm the input
-is restricted to $0 \le x < n^2$.  To begin the description some simple number theory facts must be established.
-
-\textbf{Fact 1.}  Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$.  Another way
-to explain this is that $n$ is (\textit{or multiples of $n$ are}) congruent to zero modulo $n$.  Adding zero will not change the value of the residue.
-
-\textbf{Fact 2.}  If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$.  Actually
-this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to
-multiplication by $k^{-1}$ modulo $n$.
-
-From these two simple facts the following simple algorithm can be derived.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction}. \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
-\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $1$ to $k$ do \\
-\hspace{3mm}1.1  If $x$ is odd then \\
-\hspace{6mm}1.1.1  $x \leftarrow x + n$ \\
-\hspace{3mm}1.2  $x \leftarrow x/2$ \\
-2.  Return $x$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction}
-\end{figure}
-
-The algorithm reduces the input one bit at a time using the two congruencies stated previously.  Inside the loop $n$, which is odd, is
-added to $x$ if $x$ is odd.  This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two.  Since
-$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$.  Let $r$ represent the
-final result of the Montgomery algorithm.  If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to
-$0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction is required to get the residue desired.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|l|}
-\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
-\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
-\hline $2$ & $x/2 = 1453$ \\
-\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
-\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
-\hline $5$ & $x/2 = 278$ \\
-\hline $6$ & $x/2 = 139$ \\
-\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
-\hline $8$ & $x/2 = 99$ \\
-\hline $9$ & $x + n = 356$, $x/2 = 178$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example of Montgomery Reduction (I)}
-\label{fig:MONT1}
-\end{figure}
-
-Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 9$ (note $2^k = 512$ which is larger than $n$).  The result of
-the algorithm $r = 178$ is congruent to the value of $2^{-9} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^9$ modulo $257$ the correct residue
-$r \equiv 158$ is produced.
-
-Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
-and $k^2$ single precision additions.  At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.
-Fortunately there exists an alternative representation of the algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ ($2^k > n$) \\
-\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $1$ to $k$ do \\
-\hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
-\hspace{6mm}1.1.1  $x \leftarrow x + 2^{t-1}n$ \\
-2.  Return $x/2^k$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction (modified I)}
-\end{figure}
-
-This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2.  The number of single
-precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|l|r|}
-\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} & \textbf{Result ($x$) in Binary} \\
-\hline -- & $5555$ & $1010110110011$ \\
-\hline $1$ & $x + 2^{0}n = 5812$ &  $1011010110100$ \\
-\hline $2$ & $5812$ & $1011010110100$ \\
-\hline $3$ & $x + 2^{2}n = 6840$ & $1101010111000$ \\
-\hline $4$ & $x + 2^{3}n = 8896$ & $10001011000000$ \\
-\hline $5$ & $8896$ & $10001011000000$ \\
-\hline $6$ & $8896$ & $10001011000000$ \\
-\hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
-\hline $8$ & $25344$ & $110001100000000$ \\
-\hline $9$ & $x + 2^{8}n = 91136$ & $10110010000000000$ \\
-\hline -- & $x/2^k = 178$ & \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example of Montgomery Reduction (II)}
-\label{fig:MONT2}
-\end{figure}
-
-Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 9$.
-With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the
-loop.  Note that in iterations $t = 2, 5, 6$ and $8$ the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is
-zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.
-
-\subsection{Digit Based Montgomery Reduction}
-Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis.  Consider the
-previous algorithm re-written to compute the Montgomery reduction in this new fashion.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ ($\beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1  $x \leftarrow x + \mu n \beta^t$ \\
-2.  Return $x/\beta^k$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction (modified II)}
-\end{figure}
-
-The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue.  If the first digit of
-the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit.  This
-problem breaks down to solving the following congruency.
-
-\begin{center}
-\begin{tabular}{rcl}
-$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
-$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
-$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
-\end{tabular}
-\end{center}
-
-In each iteration of the loop on step 1 a new value of $\mu$ must be calculated.  The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used
-extensively in this algorithm and should be precomputed.  Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.
-
-For example, let $\beta = 10$ represent the radix.  Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$.  Let $x = 33$
-represent the value to reduce.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
-\hline --                 & $33$ & --\\
-\hline $0$                 & $33 + \mu n = 50$ & $1$ \\
-\hline $1$                 & $50 + \mu n \beta = 900$ & $5$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Montgomery Reduction}
-\end{figure}
-
-The final result $900$ is then divided by $\beta^k$ to produce the final result $9$.  The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$
-which implies the result is not the modular residue of $x$ modulo $n$.  However, recall that the residue is actually multiplied by $\beta^{-k}$ in
-the algorithm.  To get the true residue the value must be multiplied by $\beta^k$.  In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
-the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.
-
-\subsection{Baseline Montgomery Reduction}
-The baseline Montgomery reduction algorithm will produce the residue for any size input.  It is designed to be a catch-all algorithm for
-Montgomery reductions.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
-\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  $digs \leftarrow 2n.used + 1$ \\
-2.  If $digs < MP\_WARRAY$ and $n.used < \delta$ then \\
-\hspace{3mm}2.1  Use algorithm fast\_mp\_montgomery\_reduce instead. \\
-\\
-Setup $x$ for the reduction. \\
-3.  If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
-4.  $x.used \leftarrow digs$ \\
-\\
-Eliminate the lower $k$ digits. \\
-5.  For $ix$ from $0$ to $k - 1$ do \\
-\hspace{3mm}5.1  $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}5.2  $u \leftarrow 0$ \\
-\hspace{3mm}5.3  For $iy$ from $0$ to $k - 1$ do \\
-\hspace{6mm}5.3.1  $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
-\hspace{6mm}5.3.2  $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}5.3.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}5.4  While $u > 0$ do \\
-\hspace{6mm}5.4.1  $iy \leftarrow iy + 1$ \\
-\hspace{6mm}5.4.2  $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
-\hspace{6mm}5.4.3  $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
-\hspace{6mm}5.4.4  $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
-\\
-Divide by $\beta^k$ and fix up as required. \\
-6.  $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
-7.  If $x \ge n$ then \\
-\hspace{3mm}7.1  $x \leftarrow x - n$ \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_montgomery\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_montgomery\_reduce.}
-This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm.  The algorithm is loosely based
-on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop.  The
-restrictions on this algorithm are fairly easy to adapt to.  First $0 \le x < n^2$ bounds the input to numbers in the same range as
-for the Barrett algorithm.  Additionally if $n > 1$ and $n$ is odd there will exist a modular inverse $\rho$.  $\rho$ must be calculated in
-advance of this algorithm.  Finally the variable $k$ is fixed and a pseudonym for $n.used$.
-
-Step 2 decides whether a faster Montgomery algorithm can be used.  It is based on the Comba technique meaning that there are limits on
-the size of the input.  This algorithm is discussed in ~COMBARED~.
-
-Step 5 is the main reduction loop of the algorithm.  The value of $\mu$ is calculated once per iteration in the outer loop.  The inner loop
-calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits.  Both the addition and
-multiplication are performed in the same loop to save time and memory.  Step 5.4 will handle any additional carries that escape the inner loop.
-
-Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications
-in the inner loop.  In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision
-multiplications.
-
-EXAM,bn_mp_montgomery_reduce.c
-
-This is the baseline implementation of the Montgomery reduction algorithm.  Lines @30,digs@ to @35,}@ determine if the Comba based
-routine can be used instead.  Line @47,mu@ computes the value of $\mu$ for that particular iteration of the outer loop.
-
-The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
-the alias $tmpn$ refers to the modulus $n$.
-
-\subsection{Faster ``Comba'' Montgomery Reduction}
-MARK,COMBARED
-
-The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial
-nature of the inner loop.  The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba
-technique.  The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates
-a $k \times 1$ product $k$ times.
-
-The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$.  This means the
-carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit.  The solution as it turns out is very simple.
-Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry.
-
-With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
-the speed of the algorithm.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
-\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
-1.  if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
-Copy the digits of $x$ into the array $\hat W$ \\
-2.  For $ix$ from $0$ to $x.used - 1$ do \\
-\hspace{3mm}2.1  $\hat W_{ix} \leftarrow x_{ix}$ \\
-3.  For $ix$ from $x.used$ to $2n.used - 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-Eliminate the lower $k$ digits. \\
-4.  for $ix$ from $0$ to $n.used - 1$ do \\
-\hspace{3mm}4.1  $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}4.2  For $iy$ from $0$ to $n.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
-\hspace{3mm}4.3  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
-Propagate carries upwards. \\
-5.  for $ix$ from $n.used$ to $2n.used + 1$ do \\
-\hspace{3mm}5.1  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
-Shift right and reduce modulo $\beta$ simultaneously. \\
-6.  for $ix$ from $0$ to $n.used + 1$ do \\
-\hspace{3mm}6.1  $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
-Zero excess digits and fixup $x$. \\
-7.  if $x.used > n.used + 1$ then do \\
-\hspace{3mm}7.1  for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
-\hspace{6mm}7.1.1  $x_{ix} \leftarrow 0$ \\
-8.  $x.used \leftarrow n.used + 1$ \\
-9.  Clamp excessive digits of $x$. \\
-10.  If $x \ge n$ then \\
-\hspace{3mm}10.1  $x \leftarrow x - n$ \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_mp\_montgomery\_reduce}
-\end{figure}
-
-\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
-This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique.  It is on most computer platforms significantly
-faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}).  The algorithm has the same restrictions
-on the input as the baseline reduction algorithm.  An additional two restrictions are imposed on this algorithm.  The number of digits $k$ in the
-modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $n < \delta$.   When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
-a modulus of at most $3,556$ bits in length.
-
-As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product.  It is initially filled with the
-contents of $x$ with the excess digits zeroed.  The reduction loop is very similar to the baseline loop at heart.  The multiplication on step
-4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$.  Some multipliers such
-as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce.  By performing
-a single precision multiplication instead half the amount of time is spent.
-
-Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work.  That is what step
-4.3 will do.  In effect over the $n.used$ iterations of the outer loop the $n.used$'th lower columns all have their carries propagated forwards.  Note
-how the upper bits of those same words are not reduced modulo $\beta$.  This is because those values will be discarded shortly and there is no
-point.
-
-Step 5 will propagate the remainder of the carries upwards.  On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
-stored in the destination $x$.
-
-EXAM,bn_fast_mp_montgomery_reduce.c
-
-The $\hat W$ array is first filled with digits of $x$ on line @49,for@ then the rest of the digits are zeroed on line @54,for@.  Both loops share
-the same alias variables to make the code easier to read.
-
-The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
-forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line @101,>>@ fixes the carry
-for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
-
-The for loop on line @113,for@ propagates the rest of the carries upwards through the columns.  The for loop on line @126,for@ reduces the columns
-modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
-digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.
-
-\subsection{Montgomery Setup}
-To calculate the variable $\rho$ a relatively simple algorithm will be required.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
-\textbf{Input}.   mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
-\textbf{Output}.  $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
-\hline \\
-1.  $b \leftarrow n_0$ \\
-2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
-4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
-\hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
-5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
-6.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_montgomery\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_montgomery\_setup.}
-This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms.  It uses a very interesting trick
-to calculate $1/n_0$ when $\beta$ is a power of two.
-
-EXAM,bn_mp_montgomery_setup.c
-
-This source code computes the value of $\rho$ required to perform Montgomery reduction.  It has been modified to avoid performing excess
-multiplications when $\beta$ is not the default 28-bits.
-
-\section{The Diminished Radix Algorithm}
-The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
-or Montgomery methods for certain forms of moduli.  The technique is based on the following simple congruence.
-
-\begin{equation}
-(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
-\end{equation}
-
-This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive.  It used the fact that if $n = 2^{31}$ and $k=1$
-then an x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift.  The proof
-of the above equation is very simple.  First write $x$ in the product form.
-
-\begin{equation}
-x = qn + r
-\end{equation}
-
-Now reduce both sides modulo $(n - k)$.
-
-\begin{equation}
-x \equiv qk + r  \mbox{ (mod }(n-k)\mbox{)}
-\end{equation}
-
-The variable $n$ reduces modulo $n - k$ to $k$.  By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$
-into the equation the original congruence is reproduced, thus concluding the proof.  The following algorithm is based on this observation.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Diminished Radix Reduction}. \\
-\textbf{Input}.   Integer $x$, $n$, $k$ \\
-\textbf{Output}.  $x \mbox{ mod } (n - k)$ \\
-\hline \\
-1.  $q \leftarrow \lfloor x / n \rfloor$ \\
-2.  $q \leftarrow k \cdot q$ \\
-3.  $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
-4.  $x \leftarrow x + q$ \\
-5.  If $x \ge (n - k)$ then \\
-\hspace{3mm}5.1  $x \leftarrow x - (n - k)$ \\
-\hspace{3mm}5.2  Goto step 1. \\
-6.  Return $x$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Diminished Radix Reduction}
-\label{fig:DR}
-\end{figure}
-
-This algorithm will reduce $x$ modulo $n - k$ and return the residue.  If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
-once or twice and occasionally three times.  For simplicity's sake the value of $x$ is bounded by the following simple polynomial.
-
-\begin{equation}
-0 \le x < n^2 + k^2 - 2nk
-\end{equation}
-
-The true bound is  $0 \le x < (n - k - 1)^2$ but this has quite a few more terms.  The value of $q$ after step 1 is bounded by the following.
-
-\begin{equation}
-q < n - 2k - k^2/n
-\end{equation}
-
-Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero.  The value of $x$ after step 3 is bounded trivially as
-$0 \le x < n$.  By step four the sum $x + q$ is bounded by
-
-\begin{equation}
-0 \le q + x < (k + 1)n - 2k^2 - 1
-\end{equation}
-
-With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3.  After the second pass it is highly unlikely that the
-sum in step 4 will exceed $n - k$.  In practice fewer than three passes of the algorithm are required to reduce virtually every input in the
-range $0 \le x < (n - k - 1)^2$.
-
-\begin{figure}
-\begin{small}
-\begin{center}
-\begin{tabular}{|l|}
-\hline
-$x = 123456789, n = 256, k = 3$ \\
-\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
-$q \leftarrow q*k = 1446759$ \\
-$x \leftarrow x \mbox{ mod } n = 21$ \\
-$x \leftarrow x + q = 1446780$ \\
-$x \leftarrow x - (n - k) = 1446527$ \\
-\hline
-$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
-$q \leftarrow q*k = 16950$ \\
-$x \leftarrow x \mbox{ mod } n = 127$ \\
-$x \leftarrow x + q = 17077$ \\
-$x \leftarrow x - (n - k) = 16824$ \\
-\hline
-$q \leftarrow \lfloor x/n \rfloor = 65$ \\
-$q \leftarrow q*k = 195$ \\
-$x \leftarrow x \mbox{ mod } n = 184$ \\
-$x \leftarrow x + q = 379$ \\
-$x \leftarrow x - (n - k) = 126$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example Diminished Radix Reduction}
-\label{fig:EXDR}
-\end{figure}
-
-Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$.  Note that even while $x$
-is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast.  In this case only
-three passes were required to find the residue $x \equiv 126$.
-
-
-\subsection{Choice of Moduli}
-On the surface this algorithm looks like a very expensive algorithm.  It requires a couple of subtractions followed by multiplication and other
-modular reductions.  The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
-
-Division in general is a very expensive operation to perform.  The one exception is when the division is by a power of the radix of representation used.
-Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right.  Similarly division
-by two (\textit{or powers of two}) is very simple for binary computers to perform.  It would therefore seem logical to choose $n$ of the form $2^p$
-which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.
-
-However, there is one operation related to division by powers of two that is even faster than this.  If $n = \beta^p$ then the division may be
-performed by moving whole digits to the right $p$ places.  In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.
-Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ merely requires zeroing the digits above the $p-1$'th digit of $x$.
-
-Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted
-modulus'' will refer to a modulus of the form $2^p - k$.  The word ``restricted'' in this case refers to the fact that it is based on the
-$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.
-
-\subsection{Choice of $k$}
-Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
-in step 2 is the most expensive operation.  Fortunately the choice of $k$ is not terribly limited.  For all intents and purposes it might
-as well be a single digit.  The smaller the value of $k$ is the faster the algorithm will be.
-
-\subsection{Restricted Diminished Radix Reduction}
-The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$.  This algorithm can reduce
-an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}.  The implementation
-of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition
-of $x$ and $q$.  The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular
-exponentiations are performed.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
-\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
-\textbf{Output}.  $x \mbox{ mod } n$ \\
-\hline \\
-1.  $m \leftarrow n.used$ \\
-2.  If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
-3.  $\mu \leftarrow 0$ \\
-4.  for $i$ from $0$ to $m - 1$ do \\
-\hspace{3mm}4.1  $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
-\hspace{3mm}4.2  $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}4.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-5.  $x_{m} \leftarrow \mu$ \\
-6.  for $i$ from $m + 1$ to $x.used - 1$ do \\
-\hspace{3mm}6.1  $x_{i} \leftarrow 0$ \\
-7.  Clamp excess digits of $x$. \\
-8.  If $x \ge n$ then \\
-\hspace{3mm}8.1  $x \leftarrow x - n$ \\
-\hspace{3mm}8.2  Goto step 3. \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_dr\_reduce.}
-This algorithm will perform the Diminished Radix reduction of $x$ modulo $n$.  It has similar restrictions to that of the Barrett reduction
-with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$.
-
-This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization.  The division by $\beta^m$, multiplication by $k$
-and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.  The division by $\beta^m$ is emulated by accessing
-the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position.  After the loop the $m$'th
-digit is set to the carry and the upper digits are zeroed.  Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happened to
-$x$ before the addition of the multiple of the upper half.
-
-At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required.  First $n$ is subtracted from $x$ and then the algorithm resumes
-at step 3.
-
-EXAM,bn_mp_dr_reduce.c
-
-The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line @49,top:@ is where
-the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function; however, the size of
-the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.
-
-The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
-a division by $\beta^m$ can be simulated virtually for free.  The loop on line @61,for@ performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
-in this algorithm.
-
-By line @68,mu@ the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line @71,for@ the
-same pointer will point to the $m+1$'th digit where the zeroes will be placed.
-
-Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.
-With the same logic at line @82,sub@ the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
-as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
-does not need to be checked.
-
-\subsubsection{Setup}
-To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required.  This algorithm is not really complicated but provided for
-completeness.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_setup}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $k = \beta - n_0$ \\
-\hline \\
-1.  $k \leftarrow \beta - n_0$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_setup}
-\end{figure}
-
-EXAM,bn_mp_dr_setup.c
-
-\subsubsection{Modulus Detection}
-Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus.  An integer is said to be
-of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $1$ if $n$ is in D.R form, $0$ otherwise \\
-\hline
-1.  If $n.used < 2$ then return($0$). \\
-2.  for $ix$ from $1$ to $n.used - 1$ do \\
-\hspace{3mm}2.1  If $n_{ix} \ne \beta - 1$ return($0$). \\
-3.  Return($1$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_is\_modulus}
-\end{figure}
-
-\textbf{Algorithm mp\_dr\_is\_modulus.}
-This algorithm determines if a value is in Diminished Radix form.  Step 1 rejects obvious cases where fewer than two digits are
-in the mp\_int.  Step 2 tests all but the first digit to see if they are equal to $\beta - 1$.  If the algorithm manages to get to
-step 3 then $n$ must be of Diminished Radix form.
-
-EXAM,bn_mp_dr_is_modulus.c
-
-\subsection{Unrestricted Diminished Radix Reduction}
-The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.  This algorithm
-is a straightforward adaptation of algorithm~\ref{fig:DR}.
-
-In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead.  However, this new
-algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_2k}. \\
-\textbf{Input}.   mp\_int $a$ and $n$.  mp\_digit $k$  \\
-\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
-\textbf{Output}.  $a \mbox{ (mod }n\mbox{)}$ \\
-\hline
-1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-2.  While $a \ge n$ do \\
-\hspace{3mm}2.1  $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
-\hspace{3mm}2.2  $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-\hspace{3mm}2.3  $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
-\hspace{3mm}2.4  $a \leftarrow a + q$ (\textit{s\_mp\_add}) \\
-\hspace{3mm}2.5  If $a \ge n$ then do \\
-\hspace{6mm}2.5.1  $a \leftarrow a - n$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_2k}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_2k.}
-This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$.  Division by $2^p$ is emulated with a right
-shift which makes the algorithm fairly inexpensive to use.
-
-EXAM,bn_mp_reduce_2k.c
-
-The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
-on line @31,mp_div_2d@ calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
-is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
-any multiplications.
-
-The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are
-positive.  By using the unsigned versions the overhead is kept to a minimum.
-
-\subsubsection{Unrestricted Setup}
-To setup this reduction algorithm the value of $k = 2^p - n$ is required.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
-\textbf{Input}.   mp\_int $n$   \\
-\textbf{Output}.  $k = 2^p - n$ \\
-\hline
-1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-2.  $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
-3.  $x \leftarrow x - n$ (\textit{mp\_sub}) \\
-4.  $k \leftarrow x_0$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_2k\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_2k\_setup.}
-This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k.  By making a temporary variable $x$ equal to $2^p$ a subtraction
-is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.
-
-EXAM,bn_mp_reduce_2k_setup.c
-
-\subsubsection{Unrestricted Detection}
-An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true.
-
-\begin{enumerate}
-\item  The number has only one digit.
-\item  The number has more than one digit and every bit from the $lg(\beta)$'th to the most significant is one.
-\end{enumerate}
-
-If either condition is true then there is a power of two $2^p$ such that $0 < 2^p - n < \beta$.   If the input is only
-one digit then it will always be of the correct form.  Otherwise all of the bits above the first digit must be one.  This arises from the fact
-that there will be a value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most
-significant bit.  The resulting sum will be a power of two.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
-\textbf{Input}.   mp\_int $n$   \\
-\textbf{Output}.  $1$ if of proper form, $0$ otherwise \\
-\hline
-1.  If $n.used = 0$ then return($0$). \\
-2.  If $n.used = 1$ then return($1$). \\
-3.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-4.  for $x$ from $lg(\beta)$ to $p$ do \\
-\hspace{3mm}4.1  If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$'th digit of $n$ is zero then return($0$). \\
-5.  Return($1$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_is\_2k}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_is\_2k.}
-This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.
-
-EXAM,bn_mp_reduce_is_2k.c
-
-
-
-\section{Algorithm Comparison}
-So far three very different algorithms for modular reduction have been discussed.  Each of the algorithms has its own strengths and weaknesses
-that make having such a selection very useful.  The following table summarizes the three algorithms along with comparisons of work factors.  Since
-all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.
-
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
-\hline Barrett    & $m^2 + 2m - 1$ & None              & $79$ & $1087$ & $4223$ \\
-\hline Montgomery & $m^2 + m$      & $n$ must be odd   & $72$ & $1056$ & $4160$ \\
-\hline D.R.       & $2m$           & $n = \beta^m - k$ & $16$ & $64$   & $128$  \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-
-In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete.  However, in practice since Montgomery
-reduction can be written as a single function with the Comba technique it is much faster.  Barrett reduction suffers from the overhead of
-calling the half precision multipliers, addition and division by $\beta$ algorithms.
-
-For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice.  The one set of algorithms where Diminished Radix reduction truly
-shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}.  In these algorithms
-primes of the form $\beta^m - k$ can be found and shared amongst users.  These primes will allow the Diminished Radix algorithm to be used in
-modular exponentiation to greatly speed up the operation.
-
-
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
-                     & calculates the correct value of $\rho$. \\
-                     & \\
-$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly.  \\
-                     & \\
-$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
-                     & (\textit{figure~\ref{fig:DR}}) terminates.  Also prove the probability that it will \\
-                     & terminate within $1 \le k \le 10$ iterations. \\
-                     & \\
-\end{tabular}
-
-
-\chapter{Exponentiation}
-Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$.  A variant of exponentiation, computed
-in a finite field or ring, is called modular exponentiation.  This latter style of operation is typically used in public key
-cryptosystems such as RSA and Diffie-Hellman.  The ability to quickly compute modular exponentiations is of great benefit to any
-such cryptosystem and many methods have been sought to speed it up.
-
-\section{Exponentiation Basics}
-A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired.  However, as $b$ grows in size
-the number of multiplications becomes prohibitive.  Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
-with a $1024$-bit key.  Such a calculation could never be completed as it would take simply far too long.
-
-Fortunately there is a very simple algorithm based on the laws of exponents.  Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
-are two trivial relationships between the base and the exponent.  Let $b_i$ represent the $i$'th bit of $b$ starting from the least
-significant bit.  If $b$ is a $k$-bit integer then the following equation is true.
-
-\begin{equation}
-a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
-\end{equation}
-
-By taking the base $a$ logarithm of both sides of the equation the following equation is the result.
-
-\begin{equation}
-b = \sum_{i=0}^{k-1}2^i \cdot b_i
-\end{equation}
-
-The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
-$a^{2^{i+1}}$.  This observation forms the basis of essentially all fast exponentiation algorithms.  It requires $k$ squarings and on average
-$k \over 2$ multiplications to compute the result.  This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
-
-While this current method is a considerable speed up there are further improvements to be made.  For example, the $a^{2^i}$ term does not need to
-be computed in an auxiliary variable.  Consider the following equivalent algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Left to Right Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$ and $k$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $k - 1$ to $0$ do \\
-\hspace{3mm}2.1  $c \leftarrow c^2$ \\
-\hspace{3mm}2.2  $c \leftarrow c \cdot a^{b_i}$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Left to Right Exponentiation}
-\label{fig:LTOR}
-\end{figure}
-
-This algorithm starts from the most significant bit and works towards the least significant bit.  When the $i$'th bit of $b$ is set $a$ is
-multiplied against the current product.  In each iteration the product is squared which doubles the exponent of the individual terms of the
-product.
-
-For example, let $b = 101100_2 \equiv 44_{10}$.  The following chart demonstrates the actions of the algorithm.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|}
-\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
-\hline - & $1$ \\
-\hline $5$ & $a$ \\
-\hline $4$ & $a^2$ \\
-\hline $3$ & $a^4 \cdot a$ \\
-\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
-\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
-\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Left to Right Exponentiation}
-\end{figure}
-
-When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal to $a^{44}$ which is the desired exponentiation.  This particular algorithm is
-called ``Left to Right'' because it reads the exponent in that order.  All of the exponentiation algorithms that will be presented are of this nature.
-
-\subsection{Single Digit Exponentiation}
-The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended
-to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of
-$b$ that are greater than three.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_expt\_d}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
-2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
-3.  for $x$ from 1 to $lg(\beta)$ do \\
-\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
-\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
-\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
-\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
-4.  Clear $g$. \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_expt\_d}
-\end{figure}
-
-\textbf{Algorithm mp\_expt\_d.}
-This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
-quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the
-exponent is a fixed width.
-
-A copy of $a$ is made first to allow the destination variable $c$ to be the same as the source variable $a$.  The result is set to the initial value of
-$1$ in the subsequent step.
-
-Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
-on step 3.1.  In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$.  The value
-of $b$ is shifted left one bit to make the next bit down from the most significant bit the new most significant bit.  In effect each
-iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
-
-EXAM,bn_mp_expt_d_ex.c
-
-This describes only the algorithm that is used when the parameter $fast$ is $0$.  Line @31,mp_set@ sets the initial value of the result to $1$.  Next the loop on line @54,for@ steps through each bit of the exponent starting from
-the most significant down towards the least significant. The invariant squaring operation placed on line @333,mp_sqr@ is performed first.  After
-the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set.  The shift on line
-@69,<<@ moves all of the bits of the exponent upwards towards the most significant location.
-
-\section{$k$-ary Exponentiation}
-When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
-slower than squaring.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
-the $i$'th $k$-bit digit of the exponent of $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
-computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is it is a small window on only a
-portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $t - 1$ to $0$ do \\
-\hspace{3mm}2.1  $c \leftarrow c^{2^k} $ \\
-\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
-\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{$k$-ary Exponentiation}
-\label{fig:KARY}
-\end{figure}
-
-The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
-precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
-$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.
-However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
-
-Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation.  The
-original algorithm would on average have required $200$ multiplications and $400$ squarings to compute the same value.  The total number of squarings
-has increased slightly but the number of multiplications has nearly halved.
-
-\subsection{Optimal Values of $k$}
-An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
-approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
-for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\
-\hline $16$ & $2$ & $27$ & $24$ \\
-\hline $32$ & $3$ & $49$ & $48$ \\
-\hline $64$ & $3$ & $92$ & $96$ \\
-\hline $128$ & $4$ & $175$ & $192$ \\
-\hline $256$ & $4$ & $335$ & $384$ \\
-\hline $512$ & $5$ & $645$ & $768$ \\
-\hline $1024$ & $6$ & $1257$ & $1536$ \\
-\hline $2048$ & $6$ & $2452$ & $3072$ \\
-\hline $4096$ & $7$ & $4808$ & $6144$ \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
-\label{fig:OPTK}
-\end{figure}
-
-\subsection{Sliding-Window Exponentiation}
-A simple modification to the previous algorithm is to only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$.  Essentially
-this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the
-algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.
-
-Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm {\ref{fig:KARY}}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\
-\hline $16$ & $3$ & $24$ & $27$ \\
-\hline $32$ & $3$ & $45$ & $49$ \\
-\hline $64$ & $4$ & $87$ & $92$ \\
-\hline $128$ & $4$ & $167$ & $175$ \\
-\hline $256$ & $5$ & $322$ & $335$ \\
-\hline $512$ & $6$ & $628$ & $645$ \\
-\hline $1024$ & $6$ & $1225$ & $1257$ \\
-\hline $2048$ & $7$ & $2403$ & $2452$ \\
-\hline $4096$ & $8$ & $4735$ & $4808$ \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
-\label{fig:OPTK2}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $t - 1$ to $0$ do \\
-\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
-\hspace{6mm}2.1.1   $c \leftarrow c^2$ \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
-\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\
-\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
-\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Sliding Window $k$-ary Exponentiation}
-\end{figure}
-
-Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
-algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
-the size as the previous table.
-
-Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as
-the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the
-exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The single $0$ digits in the second representation are where
-a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$
-squarings.  The second method requires $8$ multiplications and $18$ squarings.
-
-In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.
-
-\section{Modular Exponentiation}
-
-Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing
-$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it
-modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.
-
-This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
-one of the algorithms presented in ~REDUCTION~.
-
-Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first.  This algorithm
-will allow the exponent $b$ to be negative which is computed as $d \equiv \left (1 / a \right )^{\vert b \vert} \mbox{ (mod }c\mbox{)}$. The
-value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see \ref{sec;modinv}}).  If no inverse exists the algorithm
-terminates with an error.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_exptmod}. \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-1.  If $p.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
-2.  If $x.sign = MP\_NEG$ then \\
-\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }p\mbox{)}$ \\
-\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
-\hspace{3mm}2.3  Compute $y \equiv g'^{x'} \mbox{ (mod }p\mbox{)}$ via recursion. \\
-3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
-\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
-4.  else \\
-\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_exptmod}
-\end{figure}
-
-\textbf{Algorithm mp\_exptmod.}
-The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm
-which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation
-except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are clumped in the same exponentiation
-algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).
-
-EXAM,bn_mp_exptmod.c
-
-In order to keep the algorithms in a known state the first step on line @29,if@ is to reject any negative modulus as input.  If the exponent is
-negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
-the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recurse with these new values with a positive
-exponent.
-
-If the exponent is positive the algorithm resumes the exponentiation.  Line @63,dr_@ determines if the modulus is of the restricted Diminished Radix
-form.  If it is not line @65,reduce@ attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
-of three values.
-
-\begin{enumerate}
-\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
-\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
-\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
-\end{enumerate}
-
-Line @69,if@ determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
-the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.
-
-\subsection{Barrett Modular Exponentiation}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_exptmod}. \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-1.  $k \leftarrow lg(x)$ \\
-2.  $winsize \leftarrow  \left \lbrace \begin{array}{ll}
-                              2 &  \mbox{if }k \le 7 \\
-                              3 &  \mbox{if }7 < k \le 36 \\
-                              4 &  \mbox{if }36 < k \le 140 \\
-                              5 &  \mbox{if }140 < k \le 450 \\
-                              6 &  \mbox{if }450 < k \le 1303 \\
-                              7 &  \mbox{if }1303 < k \le 3529 \\
-                              8 &  \mbox{if }3529 < k \\
-                              \end{array} \right .$ \\
-3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
-4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
-5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
-\\
-Setup the table of small powers of $g$.  First find $g^{2^{winsize - 1}}$ and then all multiples of it. \\
-6.  $k \leftarrow 2^{winsize - 1}$ \\
-7.  $M_{k} \leftarrow M_1$ \\
-8.  for $ix$ from 0 to $winsize - 2$ do \\
-\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr})  \\
-\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
-9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
-\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
-\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
-10.  $res \leftarrow 1$ \\
-\\
-Start Sliding Window. \\
-11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
-12.  Loop \\
-\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
-\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
-\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
-\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
-\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
-\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
-Continued on next page. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_exptmod}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
-\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
-\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
-\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
-\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
-\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}12.6.3  Goto step 12. \\
-\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
-\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
-\hspace{3mm}12.9  $mode \leftarrow 2$ \\
-\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
-\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
-\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize -1$ do \\
-\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
-\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
-\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}Reset the window. \\
-\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
-\\
-No more windows left.  Check for residual bits of exponent. \\
-13.  If $mode = 2$ and $bitcpy > 0$ then do \\
-\hspace{3mm}13.1  for $ix$ from $0$ to $bitcpy - 1$ do \\
-\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
-\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
-\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
-\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
-\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-14.  $y \leftarrow res$ \\
-15.  Clear $res$, $\mu$ and the $M$ array. \\
-16.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_exptmod (continued)}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_exptmod.}
-This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
-algorithm to keep the product small throughout the algorithm.
-
-The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the
-larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
-table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.
-
-After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
-the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 1$
-times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
-
-Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
-\begin{enumerate}
-\item The variable $mode$ dictates how the bits of the exponent are interpreted.
-\begin{enumerate}
-   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply
-         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.
-   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits
-         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.
-   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
-         downwards.
-\end{enumerate}
-\item The variable $bitcnt$ indicates how many bits are left in the current digit of the exponent left to be read.  When it reaches zero a new digit
-      is fetched from the exponent.
-\item The variable $buf$ holds the currently read digit of the exponent.
-\item The variable $digidx$ is an index into the exponents digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
-\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
-      the appropriate operations performed.
-\item The variable $bitbuf$ holds the current bits of the window being formed.
-\end{enumerate}
-
-All of step 12 is the window processing loop.  It will iterate while there are digits available from the exponent to read.  The first step
-inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
-read and if there are no digits left then the loop terminates.
-
-After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
-upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to
-trailing edges the entire exponent is read from most significant bit to least significant bit.
-
-At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the
-algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Step 12.6 and 12.7-10 handle
-the two cases of $mode = 1$ and $mode = 2$ respectively.
-
-FIGU,expt_state,Sliding Window State Diagram
-
-By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then
-a Left-to-Right algorithm is used to process the remaining few bits.
-
-EXAM,bn_s_mp_exptmod.c
-
-Lines @31,if@ through @45,}@ determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
-from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement
-on line @37,if@ the value of $x$ is already known to be greater than $140$.
-
-The conditional piece of code beginning on line @42,ifdef@ allows the window size to be restricted to five bits.  This logic is used to ensure
-the table of precomputed powers of $G$ remains relatively small.
-
-The for loop on line @60,for@ initializes the $M$ array while lines @71,mp_init@ and @75,mp_reduce@ through @85,}@ initialize the reduction
-function that will be used for this modulus.
-
--- More later.
-
-\section{Quick Power of Two}
-Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
-equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_2expt}. \\
-\textbf{Input}.   integer $b$ \\
-\textbf{Output}.  $a \leftarrow 2^b$ \\
-\hline \\
-1.  $a \leftarrow 0$ \\
-2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
-3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
-4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_2expt}
-\end{figure}
-
-\textbf{Algorithm mp\_2expt.}
-
-EXAM,bn_mp_2expt.c
-
-\chapter{Higher Level Algorithms}
-
-This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
-routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.
-
-The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
-for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations.
-These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate
-various representations of integers.  For example, converting from an mp\_int to a string of characters.
-
-\section{Integer Division with Remainder}
-\label{sec:division}
-
-Integer division aside from modular exponentiation is the most intensive algorithm to compute.  Like addition, subtraction and multiplication
-the basis of this algorithm is the long-hand division algorithm taught to school children.  Throughout this discussion several common variables
-will be used.  Let $x$ represent the divisor and $y$ represent the dividend.  Let $q$ represent the integer quotient $\lfloor y / x \rfloor$ and
-let $r$ represent the remainder $r = y - x \lfloor y / x \rfloor$.  The following simple algorithm will be used to start the discussion.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Radix-$\beta$ Integer Division}. \\
-\textbf{Input}.   integer $x$ and $y$ \\
-\textbf{Output}.  $q = \lfloor y/x\rfloor, r = y - xq$ \\
-\hline \\
-1.  $q \leftarrow 0$ \\
-2.  $n \leftarrow \vert \vert y \vert \vert - \vert \vert x \vert \vert$ \\
-3.  for $t$ from $n$ down to $0$ do \\
-\hspace{3mm}3.1  Maximize $k$ such that $kx\beta^t$ is less than or equal to $y$ and $(k + 1)x\beta^t$ is greater. \\
-\hspace{3mm}3.2  $q \leftarrow q + k\beta^t$ \\
-\hspace{3mm}3.3  $y \leftarrow y - kx\beta^t$ \\
-4.  $r \leftarrow y$ \\
-5.  Return($q, r$) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Radix-$\beta$ Integer Division}
-\label{fig:raddiv}
-\end{figure}
-
-As children we are taught this very simple algorithm for the case of $\beta = 10$.  Almost instinctively several optimizations are taught whose
-reasons for existing are never explained.  For this example let $y = 5471$ represent the dividend and $x = 23$ represent the divisor.
-
-To find the first digit of the quotient the value of $k$ must be maximized such that $kx\beta^t$ is less than or equal to $y$ and
-simultaneously $(k + 1)x\beta^t$ is greater than $y$.  Implicitly $k$ is the maximum value the $t$'th digit of the quotient may have.  The habitual method
-used to find the maximum is to ``eyeball'' the two numbers, typically only the leading digits and quickly estimate a quotient.  By only using leading
-digits a much simpler division may be used to form an educated guess at what the value must be.  In this case $k = \lfloor 54/23\rfloor = 2$ quickly
-arises as a possible  solution.  Indeed $2x\beta^2 = 4600$ is less than $y = 5471$ and simultaneously $(k + 1)x\beta^2 = 6900$ is larger than $y$.
-As a  result $k\beta^2$ is added to the quotient which now equals $q = 200$ and $4600$ is subtracted from $y$ to give a remainder of $y = 871$.
-
-Again this process is repeated to produce the quotient digit $k = 3$ which makes the quotient $q = 200 + 3\beta = 230$ and the remainder
-$y = 871 - 3x\beta = 181$.  Finally the last iteration of the loop produces $k = 7$ which leads to the quotient $q = 230 + 7 = 237$ and the
-remainder $y = 181 - 7x = 20$.  The final quotient and remainder found are $q = 237$ and $r = y = 20$ which are indeed correct since
-$237 \cdot 23 + 20 = 5471$ is true.
-
-\subsection{Quotient Estimation}
-\label{sec:divest}
-As alluded to earlier the quotient digit $k$ can be estimated from only the leading digits of both the divisor and dividend.  When $p$ leading
-digits are used from both the divisor and dividend to form an estimation the accuracy of the estimation rises as $p$ grows.  Technically
-speaking the estimation is based on assuming the lower $\vert \vert y \vert \vert - p$ and $\vert \vert x \vert \vert - p$ lower digits of the
-dividend and divisor are zero.
-
-The value of the estimation may be off by a few values in either direction and in general is fairly correct.  A simplification \cite[pp. 271]{TAOCPV2}
-of the estimation technique is to use $t + 1$ digits of the dividend and $t$ digits of the divisor, in particular when $t = 1$.  The estimate
-using this technique is never too small.  For the following proof let $t = \vert \vert y \vert \vert - 1$ and $s = \vert \vert x \vert \vert - 1$
-represent the most significant digits of the dividend and divisor respectively.
-
-\textbf{Proof.}\textit{  The quotient $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ is greater than or equal to
-$k = \lfloor y / (x \cdot \beta^{\vert \vert y \vert \vert - \vert \vert x \vert \vert - 1}) \rfloor$. }
-The first obvious case is when $\hat k = \beta - 1$ in which case the proof is concluded since the real quotient cannot be larger.  For all other
-cases $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ and $\hat k x_s \ge y_t\beta + y_{t-1} - x_s + 1$.  The latter portion of the inequality
-$-x_s + 1$ arises from the fact that a truncated integer division will give the same quotient for at most $x_s - 1$ values.  Next a series of
-inequalities will prove the hypothesis.
-
-\begin{equation}
-y - \hat k x \le y - \hat k x_s\beta^s
-\end{equation}
-
-This is trivially true since $x \ge x_s\beta^s$.  Next we replace $\hat kx_s\beta^s$ by the previous inequality for $\hat kx_s$.
-
-\begin{equation}
-y - \hat k x \le y_t\beta^t + \ldots + y_0 - (y_t\beta^t + y_{t-1}\beta^{t-1} - x_s\beta^t + \beta^s)
-\end{equation}
-
-By simplifying the previous inequality the following inequality is formed.
-
-\begin{equation}
-y - \hat k x \le y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s
-\end{equation}
-
-Subsequently,
-
-\begin{equation}
-y_{t-2}\beta^{t-2} + \ldots +  y_0  + x_s\beta^s - \beta^s < x_s\beta^s \le x
-\end{equation}
-
-Which proves that $y - \hat kx \le x$ and by consequence $\hat k \ge k$ which concludes the proof.  \textbf{QED}
-
-
-\subsection{Normalized Integers}
-For the purposes of division a normalized input is when the divisor's leading digit $x_n$ is greater than or equal to $\beta / 2$.  By multiplying both
-$x$ and $y$ by $j = \lfloor (\beta / 2) / x_n \rfloor$ the quotient remains unchanged and the remainder is simply $j$ times the original
-remainder.  The purpose of normalization is to ensure the leading digit of the divisor is sufficiently large such that the estimated quotient will
-lie in the domain of a single digit.  Consider the maximum dividend $(\beta - 1) \cdot \beta + (\beta - 1)$ and the minimum divisor $\beta / 2$.
-
-\begin{equation}
-{{\beta^2 - 1} \over { \beta / 2}} \le 2\beta - {2 \over \beta}
-\end{equation}
-
-At most the quotient approaches $2\beta$, however, in practice this will not occur since that would imply the previous quotient digit was too small.
-
-\subsection{Radix-$\beta$ Division with Remainder}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div}. \\
-\textbf{Input}.   mp\_int $a, b$ \\
-\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
-\hline \\
-1.  If $b = 0$ return(\textit{MP\_VAL}). \\
-2.  If $\vert a \vert < \vert b \vert$ then do \\
-\hspace{3mm}2.1  $d \leftarrow a$ \\
-\hspace{3mm}2.2  $c \leftarrow 0$ \\
-\hspace{3mm}2.3  Return(\textit{MP\_OKAY}). \\
-\\
-Setup the quotient to receive the digits. \\
-3.  Grow $q$ to $a.used + 2$ digits. \\
-4.  $q \leftarrow 0$ \\
-5.  $x \leftarrow \vert a \vert , y \leftarrow \vert b \vert$ \\
-6.  $sign \leftarrow  \left \lbrace \begin{array}{ll}
-                              MP\_ZPOS &  \mbox{if }a.sign = b.sign \\
-                              MP\_NEG  &  \mbox{otherwise} \\
-                              \end{array} \right .$ \\
-\\
-Normalize the inputs such that the leading digit of $y$ is greater than or equal to $\beta / 2$. \\
-7.  $norm \leftarrow (lg(\beta) - 1) - (\lceil lg(y) \rceil \mbox{ (mod }lg(\beta)\mbox{)})$ \\
-8.  $x \leftarrow x \cdot 2^{norm}, y \leftarrow y \cdot 2^{norm}$ \\
-\\
-Find the leading digit of the quotient. \\
-9.  $n \leftarrow x.used - 1, t \leftarrow y.used - 1$ \\
-10.  $y \leftarrow y \cdot \beta^{n - t}$ \\
-11.  While ($x \ge y$) do \\
-\hspace{3mm}11.1  $q_{n - t} \leftarrow q_{n - t} + 1$ \\
-\hspace{3mm}11.2  $x \leftarrow x - y$ \\
-12.  $y \leftarrow \lfloor y / \beta^{n-t} \rfloor$ \\
-\\
-Continued on the next page. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div} (continued). \\
-\textbf{Input}.   mp\_int $a, b$ \\
-\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
-\hline \\
-Now find the remainder of the digits. \\
-13.  for $i$ from $n$ down to $(t + 1)$ do \\
-\hspace{3mm}13.1  If $i > x.used$ then jump to the next iteration of this loop. \\
-\hspace{3mm}13.2  If $x_{i} = y_{t}$ then \\
-\hspace{6mm}13.2.1  $q_{i - t - 1} \leftarrow \beta - 1$ \\
-\hspace{3mm}13.3  else \\
-\hspace{6mm}13.3.1  $\hat r \leftarrow x_{i} \cdot \beta + x_{i - 1}$ \\
-\hspace{6mm}13.3.2  $\hat r \leftarrow \lfloor \hat r / y_{t} \rfloor$ \\
-\hspace{6mm}13.3.3  $q_{i - t - 1} \leftarrow \hat r$ \\
-\hspace{3mm}13.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} + 1$ \\
-\\
-Fixup quotient estimation. \\
-\hspace{3mm}13.5  Loop \\
-\hspace{6mm}13.5.1  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
-\hspace{6mm}13.5.2  t$1 \leftarrow 0$ \\
-\hspace{6mm}13.5.3  t$1_0 \leftarrow y_{t - 1}, $ t$1_1 \leftarrow y_t,$ t$1.used \leftarrow 2$ \\
-\hspace{6mm}13.5.4  $t1 \leftarrow t1 \cdot q_{i - t - 1}$ \\
-\hspace{6mm}13.5.5  t$2_0 \leftarrow x_{i - 2}, $ t$2_1 \leftarrow x_{i - 1}, $ t$2_2 \leftarrow x_i, $ t$2.used \leftarrow 3$ \\
-\hspace{6mm}13.5.6  If $\vert t1 \vert > \vert t2 \vert$ then goto step 13.5. \\
-\hspace{3mm}13.6  t$1 \leftarrow y \cdot q_{i - t - 1}$ \\
-\hspace{3mm}13.7  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
-\hspace{3mm}13.8  $x \leftarrow x - $ t$1$ \\
-\hspace{3mm}13.9  If $x.sign = MP\_NEG$ then \\
-\hspace{6mm}13.10  t$1 \leftarrow y$ \\
-\hspace{6mm}13.11  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
-\hspace{6mm}13.12  $x \leftarrow x + $ t$1$ \\
-\hspace{6mm}13.13  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
-\\
-Finalize the result. \\
-14.  Clamp excess digits of $q$ \\
-15.  $c \leftarrow q, c.sign \leftarrow sign$ \\
-16.  $x.sign \leftarrow a.sign$ \\
-17.  $d \leftarrow \lfloor x / 2^{norm} \rfloor$ \\
-18.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div (continued)}
-\end{figure}
-\textbf{Algorithm mp\_div.}
-This algorithm will calculate quotient and remainder from an integer division given a dividend and divisor.  The algorithm is a signed
-division and will produce a fully qualified quotient and remainder.
-
-First the divisor $b$ must be non-zero which is enforced in step one.  If the divisor is larger than the dividend then the quotient is implicitly
-zero and the remainder is the dividend.
-
-After the first two trivial cases of inputs are handled the variable $q$ is setup to receive the digits of the quotient.  Two unsigned copies of the
-divisor $y$ and dividend $x$ are made as well.  The core of the division algorithm is an unsigned division and will only work if the values are
-positive.  Now the two values $x$ and $y$ must be normalized such that the leading digit of $y$ is greater than or equal to $\beta / 2$.
-This is performed by shifting both to the left by enough bits to get the desired normalization.
-
-At this point the division algorithm can begin producing digits of the quotient.  Recall that the maximum value of the estimation used is
-$2\beta - {2 \over \beta}$ which means that a digit of the quotient must be first produced by another means.  In this case $y$ is shifted
-to the left (\textit{step ten}) so that it has the same number of digits as $x$.  The loop on step eleven will subtract multiples of the
-shifted copy of $y$ until $x$ is smaller.  Since the leading digit of $y$ is greater than or equal to $\beta/2$ this loop will iterate at most two
-times to produce the desired leading digit of the quotient.
-
-Now the remainder of the digits can be produced.  The equation $\hat q = \lfloor {{x_i \beta + x_{i-1}}\over y_t} \rfloor$ is used to fairly
-accurately approximate the true quotient digit.  The estimation can in theory produce an estimation as high as $2\beta - {2 \over \beta}$ but by
-induction the upper quotient digit is correct (\textit{as established on step eleven}) and the estimate must be less than $\beta$.
-
-Recall from section~\ref{sec:divest} that the estimation is never too low but may be too high.  The next step of the estimation process is
-to refine the estimation.  The loop on step 13.5 uses $x_i\beta^2 + x_{i-1}\beta + x_{i-2}$ and $q_{i - t - 1}(y_t\beta + y_{t-1})$ as a higher
-order approximation to adjust the quotient digit.
-
-After both phases of estimation the quotient digit may still be off by a value of one\footnote{This is similar to the error introduced
-by optimizing Barrett reduction.}.  Steps 13.6 and 13.7 subtract the multiple of the divisor from the dividend (\textit{Similar to step 3.3 of
-algorithm~\ref{fig:raddiv}}) and then subsequently add a multiple of the divisor if the quotient was too large.
-
-Now that the quotient has been determined, finalizing the result is a matter of clamping the quotient, fixing the sizes and de-normalizing the
-remainder.  An important aspect of this algorithm seemingly overlooked in other descriptions such as that of Algorithm 14.20 HAC \cite[pp. 598]{HAC}
-is that when the estimations are being made (\textit{inside the loop on step 13.5}) that the digits $y_{t-1}$, $x_{i-2}$ and $x_{i-1}$ may lie
-outside their respective boundaries.  For example, if $t = 0$ or $i \le 1$ then the digits would be undefined.  In those cases the digits should
-respectively be replaced with a zero.
-
-EXAM,bn_mp_div.c
-
-The implementation of this algorithm differs slightly from the pseudo code presented previously.  In this algorithm either of the quotient $c$ or
-remainder $d$ may be passed as a \textbf{NULL} pointer which indicates their value is not desired.  For example, the C code to call the division
-algorithm with only the quotient is
-
-\begin{verbatim}
-mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
-\end{verbatim}
-
-Lines @108,if@ and @113,if@ handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor
-respectively.  After the two trivial cases all of the temporary variables are initialized.  Line @147,neg@ determines the sign of
-the quotient and line @148,sign@ ensures that both $x$ and $y$ are positive.
-
-The number of bits in the leading digit is calculated on line @151,norm@.  Implicitly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
-of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
-exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
-them to the left by $lg(\beta) - 1 - k$ bits.
-
-Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the
-leading digit of the quotient.  The loop beginning on line @184,for@ will produce the remainder of the quotient digits.
-
-The conditional ``continue'' on line @186,continue@ is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
-algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
-above the $i$'th position $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.
-
-Lines @214,t1@, @216,t1@ and @222,t2@ through @225,t2@ manually construct the high accuracy estimations by setting the digits of the two mp\_int
-variables directly.
-
-\section{Single Digit Helpers}
-
-This section briefly describes a series of single digit helper algorithms which come in handy when working with small constants.  All of
-the helper functions assume the single digit input is positive and will treat them as such.
-
-\subsection{Single Digit Addition and Subtraction}
-
-Both addition and subtraction are performed by ``cheating'' and using mp\_set followed by the higher level addition or subtraction
-algorithms.   As a result these algorithms are substantially simpler with a slight cost in performance.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_add\_d}. \\
-\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
-\textbf{Output}.  $c = a + b$ \\
-\hline \\
-1.  $t \leftarrow b$ (\textit{mp\_set}) \\
-2.  $c \leftarrow a + t$ \\
-3.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_add\_d}
-\end{figure}
-
-\textbf{Algorithm mp\_add\_d.}
-This algorithm initiates a temporary mp\_int with the value of the single digit and uses algorithm mp\_add to add the two values together.
-
-EXAM,bn_mp_add_d.c
-
-Clever use of the letter 't'.
-
-\subsubsection{Subtraction}
-The single digit subtraction algorithm mp\_sub\_d is essentially the same except it uses mp\_sub to subtract the digit from the mp\_int.
-
-\subsection{Single Digit Multiplication}
-Single digit multiplication arises enough in division and radix conversion that it ought to be implemented as a special case of the baseline
-multiplication algorithm.  Essentially this algorithm is a modified version of algorithm s\_mp\_mul\_digs where one of the multiplicands
-only has one digit.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_d}. \\
-\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
-\textbf{Output}.  $c = ab$ \\
-\hline \\
-1.  $pa \leftarrow a.used$ \\
-2.  Grow $c$ to at least $pa + 1$ digits. \\
-3.  $oldused \leftarrow c.used$ \\
-4.  $c.used \leftarrow pa + 1$ \\
-5.  $c.sign \leftarrow a.sign$ \\
-6.  $\mu \leftarrow 0$ \\
-7.  for $ix$ from $0$ to $pa - 1$ do \\
-\hspace{3mm}7.1  $\hat r \leftarrow \mu + a_{ix}b$ \\
-\hspace{3mm}7.2  $c_{ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}7.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-8.  $c_{pa} \leftarrow \mu$ \\
-9.  for $ix$ from $pa + 1$ to $oldused$ do \\
-\hspace{3mm}9.1  $c_{ix} \leftarrow 0$ \\
-10.  Clamp excess digits of $c$. \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_d}
-\end{figure}
-\textbf{Algorithm mp\_mul\_d.}
-This algorithm quickly multiplies an mp\_int by a small single digit value.  It is specially tailored to the job and has minimal overhead.
-Unlike the full multiplication algorithms this algorithm does not require any significant temporary storage or memory allocations.
-
-EXAM,bn_mp_mul_d.c
-
-In this implementation the destination $c$ may point to the same mp\_int as the source $a$ since the result is written after the digit is
-read from the source.  This function uses pointer aliases $tmpa$ and $tmpc$ for the digits of $a$ and $c$ respectively.
-
-\subsection{Single Digit Division}
-Like the single digit multiplication algorithm, single digit division is also a fairly common algorithm used in radix conversion.  Since the
-divisor is only a single digit a specialized variant of the division algorithm can be used to compute the quotient.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_d}. \\
-\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
-\textbf{Output}.  $c = \lfloor a / b \rfloor, d = a - cb$ \\
-\hline \\
-1.  If $b = 0$ then return(\textit{MP\_VAL}).\\
-2.  If $b = 3$ then use algorithm mp\_div\_3 instead. \\
-3.  Init $q$ to $a.used$ digits.  \\
-4.  $q.used \leftarrow a.used$ \\
-5.  $q.sign \leftarrow a.sign$ \\
-6.  $\hat w \leftarrow 0$ \\
-7.  for $ix$ from $a.used - 1$ down to $0$ do \\
-\hspace{3mm}7.1  $\hat w \leftarrow \hat w \beta + a_{ix}$ \\
-\hspace{3mm}7.2  If $\hat w \ge b$ then \\
-\hspace{6mm}7.2.1  $t \leftarrow \lfloor \hat w / b \rfloor$ \\
-\hspace{6mm}7.2.2  $\hat w \leftarrow \hat w \mbox{ (mod }b\mbox{)}$ \\
-\hspace{3mm}7.3  else\\
-\hspace{6mm}7.3.1  $t \leftarrow 0$ \\
-\hspace{3mm}7.4  $q_{ix} \leftarrow t$ \\
-8.  $d \leftarrow \hat w$ \\
-9.  Clamp excess digits of $q$. \\
-10.  $c \leftarrow q$ \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_d}
-\end{figure}
-\textbf{Algorithm mp\_div\_d.}
-This algorithm divides the mp\_int $a$ by the single mp\_digit $b$ using an optimized approach.  Essentially in every iteration of the
-algorithm another digit of the dividend is reduced and another digit of quotient produced.  Provided $b < \beta$ the value of $\hat w$
-after step 7.1 will be limited such that $0 \le \lfloor \hat w / b \rfloor < \beta$.
-
-If the divisor $b$ is equal to three a variant of this algorithm is used which is called mp\_div\_3.  It replaces the division by three with
-a multiplication by $\lfloor \beta / 3 \rfloor$ and the appropriate shift and residual fixup.  In essence it is much like the Barrett reduction
-from chapter seven.
-
-EXAM,bn_mp_div_d.c
-
-Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to
-indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d to be created.
-
-The division and remainder on lines @90,/@ and @91,-@ can be replaced often by a single division on most processors.  For example, the 32-bit x86 based
-processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC
-compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.
-
-\subsection{Single Digit Root Extraction}
-
-Finding the $n$'th root of an integer is fairly easy as far as numerical analysis is concerned.  Algorithms such as the Newton-Raphson approximation
-(\ref{eqn:newton}) series will converge very quickly to a root for any continuous function $f(x)$.
-
-\begin{equation}
-x_{i+1} = x_i - {f(x_i) \over f'(x_i)}
-\label{eqn:newton}
-\end{equation}
-
-In this case the $n$'th root is desired and $f(x) = x^n - a$ where $a$ is the integer of which the root is desired.  The derivative of $f(x)$ is
-simply $f'(x) = nx^{n - 1}$.  Of particular importance is that this algorithm will be used over the integers, not over a more continuous domain
-such as the real numbers.  As a result the root found can be above the true root by a few and must be manually adjusted.  Ideally at the end of the
-algorithm the $n$'th root $b$ of an integer $a$ is desired such that $b^n \le a$.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_n\_root}. \\
-\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
-\textbf{Output}.  $c^b \le a$ \\
-\hline \\
-1.  If $b$ is even and $a.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
-2.  $sign \leftarrow a.sign$ \\
-3.  $a.sign \leftarrow MP\_ZPOS$ \\
-4.  t$2 \leftarrow 2$ \\
-5.  Loop \\
-\hspace{3mm}5.1  t$1 \leftarrow $ t$2$ \\
-\hspace{3mm}5.2  t$3 \leftarrow $ t$1^{b - 1}$ \\
-\hspace{3mm}5.3  t$2 \leftarrow $ t$3 $ $\cdot$ t$1$ \\
-\hspace{3mm}5.4  t$2 \leftarrow $ t$2 - a$ \\
-\hspace{3mm}5.5  t$3 \leftarrow $ t$3 \cdot b$ \\
-\hspace{3mm}5.6  t$3 \leftarrow \lfloor $t$2 / $t$3 \rfloor$ \\
-\hspace{3mm}5.7  t$2 \leftarrow $ t$1 - $ t$3$ \\
-\hspace{3mm}5.8  If t$1 \ne $ t$2$ then goto step 5.  \\
-6.  Loop \\
-\hspace{3mm}6.1  t$2 \leftarrow $ t$1^b$ \\
-\hspace{3mm}6.2  If t$2 > a$ then \\
-\hspace{6mm}6.2.1  t$1 \leftarrow $ t$1 - 1$ \\
-\hspace{6mm}6.2.2  Goto step 6. \\
-7.  $a.sign \leftarrow sign$ \\
-8.  $c \leftarrow $ t$1$ \\
-9.  $c.sign \leftarrow sign$  \\
-10.  Return(\textit{MP\_OKAY}).  \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_n\_root}
-\end{figure}
-\textbf{Algorithm mp\_n\_root.}
-This algorithm finds the integer $n$'th root of an input using the Newton-Raphson approach.  It is partially optimized based on the observation
-that the numerator of ${f(x) \over f'(x)}$ can be derived from a partial denominator.  That is at first the denominator is calculated by finding
-$x^{b - 1}$.  This value can then be multiplied by $x$ and have $a$ subtracted from it to find the numerator.  This saves a total of $b - 1$
-multiplications by t$1$ inside the loop.
-
-The initial value of the approximation is t$2 = 2$ which allows the algorithm to start with very small values and quickly converge on the
-root.  Ideally this algorithm is meant to find the $n$'th root of an input where $n$ is bounded by $2 \le n \le 5$.
-
-EXAM,bn_mp_n_root.c
-
-\section{Random Number Generation}
-
-Random numbers come up in a variety of activities from public key cryptography to simple simulations and various randomized algorithms.  Pollard-Rho
-factoring for example, can make use of random values as starting points to find factors of a composite integer.  In this case the algorithm presented
-is solely for simulations and not intended for cryptographic use.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_rand}. \\
-\textbf{Input}.   An integer $b$ \\
-\textbf{Output}.  A pseudo-random number of $b$ digits \\
-\hline \\
-1.  $a \leftarrow 0$ \\
-2.  If $b \le 0$ return(\textit{MP\_OKAY}) \\
-3.  Pick a non-zero random digit $d$. \\
-4.  $a \leftarrow a + d$ \\
-5.  for $ix$ from 1 to $b - 1$ do \\
-\hspace{3mm}5.1  $a \leftarrow a \cdot \beta$ \\
-\hspace{3mm}5.2  Pick a random digit $d$. \\
-\hspace{3mm}5.3  $a \leftarrow a + d$ \\
-6.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_rand}
-\end{figure}
-\textbf{Algorithm mp\_rand.}
-This algorithm produces a pseudo-random integer of $b$ digits.  By ensuring that the first digit is non-zero the algorithm also guarantees that the
-final result has at least $b$ digits.  It relies heavily on a third-party random number generator which should ideally generate uniformly all of
-the integers from $0$ to $\beta - 1$.
-
-EXAM,bn_mp_rand.c
-
-\section{Formatted Representations}
-The ability to emit a radix-$n$ textual representation of an integer is useful for interacting with human parties.  For example, the ability to
-be given a string of characters such as ``114585'' and turn it into the radix-$\beta$ equivalent would make it easier to enter numbers
-into a program.
-
-\subsection{Reading Radix-n Input}
-For the purposes of this text we will assume that a simple lower ASCII map (\ref{fig:ASC}) is used to map the values from $0$ to $63$ to
-printable characters.  For example, when the character ``N'' is read it represents the integer $23$.  The first $16$ characters of the
-map are for the common representations up to hexadecimal.  After that they match the ``base64'' encoding scheme which are suitably chosen
-such that they are printable.  While outputting as base64 may not be too helpful for human operators it does allow communication via non binary
-mediums.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{cc|cc|cc|cc}
-\hline \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} &  \textbf{Value} & \textbf{Char} \\
-\hline
-0 & 0 & 1 & 1 & 2 & 2 & 3 & 3 \\
-4 & 4 & 5 & 5 & 6 & 6 & 7 & 7 \\
-8 & 8 & 9 & 9 & 10 & A & 11 & B \\
-12 & C & 13 & D & 14 & E & 15 & F \\
-16 & G & 17 & H & 18 & I & 19 & J \\
-20 & K & 21 & L & 22 & M & 23 & N \\
-24 & O & 25 & P & 26 & Q & 27 & R \\
-28 & S & 29 & T & 30 & U & 31 & V \\
-32 & W & 33 & X & 34 & Y & 35 & Z \\
-36 & a & 37 & b & 38 & c & 39 & d \\
-40 & e & 41 & f & 42 & g & 43 & h \\
-44 & i & 45 & j & 46 & k & 47 & l \\
-48 & m & 49 & n & 50 & o & 51 & p \\
-52 & q & 53 & r & 54 & s & 55 & t \\
-56 & u & 57 & v & 58 & w & 59 & x \\
-60 & y & 61 & z & 62 & $+$ & 63 & $/$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Lower ASCII Map}
-\label{fig:ASC}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_read\_radix}. \\
-\textbf{Input}.   A string $str$ of length $sn$ and radix $r$. \\
-\textbf{Output}.  The radix-$\beta$ equivalent mp\_int. \\
-\hline \\
-1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
-2.  $ix \leftarrow 0$ \\
-3.  If $str_0 =$ ``-'' then do \\
-\hspace{3mm}3.1  $ix \leftarrow ix + 1$ \\
-\hspace{3mm}3.2  $sign \leftarrow MP\_NEG$ \\
-4.  else \\
-\hspace{3mm}4.1  $sign \leftarrow MP\_ZPOS$ \\
-5.  $a \leftarrow 0$ \\
-6.  for $iy$ from $ix$ to $sn - 1$ do \\
-\hspace{3mm}6.1  Let $y$ denote the position in the map of $str_{iy}$. \\
-\hspace{3mm}6.2  If $str_{iy}$ is not in the map or $y \ge r$ then goto step 7. \\
-\hspace{3mm}6.3  $a \leftarrow a \cdot r$ \\
-\hspace{3mm}6.4  $a \leftarrow a + y$ \\
-7.  If $a \ne 0$ then $a.sign \leftarrow sign$ \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_read\_radix}
-\end{figure}
-\textbf{Algorithm mp\_read\_radix.}
-This algorithm will read an ASCII string and produce the radix-$\beta$ mp\_int representation of the same integer.  A minus symbol ``-'' may precede the
-string  to indicate the value is negative, otherwise it is assumed to be positive.  The algorithm will read up to $sn$ characters from the input
-and will stop reading characters from the string as soon as it encounters a character it cannot map.  This allows numbers to be embedded
-as part of larger input without any significant problem.
-
-EXAM,bn_mp_read_radix.c
-
-\subsection{Generating Radix-$n$ Output}
-Generating radix-$n$ output is fairly trivial with a division and remainder algorithm.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toradix}. \\
-\textbf{Input}.   A mp\_int $a$ and an integer $r$\\
-\textbf{Output}.  The radix-$r$ representation of $a$ \\
-\hline \\
-1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
-2.  If $a = 0$ then $str = $ ``$0$'' and return(\textit{MP\_OKAY}).  \\
-3.  $t \leftarrow a$ \\
-4.  $str \leftarrow$ ``'' \\
-5.  if $t.sign = MP\_NEG$ then \\
-\hspace{3mm}5.1  $str \leftarrow str + $ ``-'' \\
-\hspace{3mm}5.2  $t.sign = MP\_ZPOS$ \\
-6.  While ($t \ne 0$) do \\
-\hspace{3mm}6.1  $d \leftarrow t \mbox{ (mod }r\mbox{)}$ \\
-\hspace{3mm}6.2  $t \leftarrow \lfloor t / r \rfloor$ \\
-\hspace{3mm}6.3  Look up $d$ in the map and store the equivalent character in $y$. \\
-\hspace{3mm}6.4  $str \leftarrow str + y$ \\
-7.  If $str_0 = $``$-$'' then \\
-\hspace{3mm}7.1  Reverse the digits $str_1, str_2, \ldots str_n$. \\
-8.  Otherwise \\
-\hspace{3mm}8.1  Reverse the digits $str_0, str_1, \ldots str_n$. \\
-9.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toradix}
-\end{figure}
-\textbf{Algorithm mp\_toradix.}
-This algorithm computes the radix-$r$ representation of an mp\_int $a$.  The ``digits'' of the representation are extracted by reducing
-the successive quotients $\lfloor a / r^k \rfloor$ of the input modulo $r$ until $r^k > a$.  Note that instead of actually dividing by $r^k$ in
-each iteration the quotient $\lfloor a / r \rfloor$ is saved for the next iteration.  As a result a series of trivial $n \times 1$ divisions
-are required instead of a series of $n \times k$ divisions.  One design flaw of this approach is that the digits are produced in the reverse order
-(see~\ref{fig:mpradix}).  To remedy this flaw the digits must be swapped or simply ``reversed''.
-
-\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Value of $a$} & \textbf{Value of $d$} & \textbf{Value of $str$} \\
-\hline $1234$ & -- & -- \\
-\hline $123$  & $4$ & ``4'' \\
-\hline $12$   & $3$ & ``43'' \\
-\hline $1$    & $2$ & ``432'' \\
-\hline $0$    & $1$ & ``4321'' \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Algorithm mp\_toradix.}
-\label{fig:mpradix}
-\end{figure}
-
-EXAM,bn_mp_toradix.c
-
-\chapter{Number Theoretic Algorithms}
-This chapter discusses several fundamental number theoretic algorithms such as the greatest common divisor, least common multiple and Jacobi
-symbol computation.  These algorithms arise as essential components in several key cryptographic algorithms such as the RSA public key algorithm and
-various Sieve based factoring algorithms.
-
-\section{Greatest Common Divisor}
-The greatest common divisor of two integers $a$ and $b$, often denoted as $(a, b)$ is the largest integer $k$ that is a proper divisor of
-both $a$ and $b$.  That is, $k$ is the largest integer such that $0 \equiv a \mbox{ (mod }k\mbox{)}$ and $0 \equiv b \mbox{ (mod }k\mbox{)}$ occur
-simultaneously.
-
-The most common approach (cite) is to reduce one input modulo another.  That is if $a$ and $b$ are divisible by some integer $k$ and if $qa + r = b$ then
-$r$ is also divisible by $k$.  The reduction pattern follows $\left < a , b \right > \rightarrow \left < b, a \mbox{ mod } b \right >$.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Greatest Common Divisor (I)}. \\
-\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
-\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
-\hline \\
-1.  While ($b > 0$) do \\
-\hspace{3mm}1.1  $r \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
-\hspace{3mm}1.2  $a \leftarrow b$ \\
-\hspace{3mm}1.3  $b \leftarrow r$ \\
-2.  Return($a$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Greatest Common Divisor (I)}
-\label{fig:gcd1}
-\end{figure}
-
-This algorithm will quickly converge on the greatest common divisor since the residue $r$ tends to diminish rapidly.  However, divisions are
-relatively expensive operations to perform and should ideally be avoided.  There is another approach based on a similar relationship of
-greatest common divisors.  The faster approach is based on the observation that if $k$ divides both $a$ and $b$ it will also divide $a - b$.
-In particular, we would like $a - b$ to decrease in magnitude which implies that $b \ge a$.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Greatest Common Divisor (II)}. \\
-\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
-\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
-\hline \\
-1.  While ($b > 0$) do \\
-\hspace{3mm}1.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
-\hspace{3mm}1.2  $b \leftarrow b - a$ \\
-2.  Return($a$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Greatest Common Divisor (II)}
-\label{fig:gcd2}
-\end{figure}
-
-\textbf{Proof} \textit{Algorithm~\ref{fig:gcd2} will return the greatest common divisor of $a$ and $b$.}
-The algorithm in figure~\ref{fig:gcd2} will eventually terminate since $b \ge a$ the subtraction in step 1.2 will be a value less than $b$.  In other
-words in every iteration the tuple $\left < a, b \right >$ decreases in magnitude until eventually $a = b$.  Since both $a$ and $b$ are always
-divisible by the greatest common divisor (\textit{until the last iteration}) and in the last iteration of the algorithm $b = 0$, therefore, in the
-second to last iteration of the algorithm $b = a$ and clearly $(a, a) = a$ which concludes the proof.  \textbf{QED}.
-
-As a matter of practicality algorithm \ref{fig:gcd2} decreases far too slowly to be useful, especially if $b$ is much larger than $a$ such that
-$b - a$ is still very much larger than $a$.  A simple addition to the algorithm is to divide $b - a$ by a power of some integer $p$ which does
-not divide the greatest common divisor but will divide $b - a$.  In this case ${b - a} \over p$ is also an integer and still divisible by
-the greatest common divisor.
-
-However, instead of factoring $b - a$ to find a suitable value of $p$ the powers of $p$ can be removed from $a$ and $b$ that are in common first.
-Then inside the loop whenever $b - a$ is divisible by some power of $p$ it can be safely removed.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Greatest Common Divisor (III)}. \\
-\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
-\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
-\hline \\
-1.  $k \leftarrow 0$ \\
-2.  While $a$ and $b$ are both divisible by $p$ do \\
-\hspace{3mm}2.1  $a \leftarrow \lfloor a / p \rfloor$ \\
-\hspace{3mm}2.2  $b \leftarrow \lfloor b / p \rfloor$ \\
-\hspace{3mm}2.3  $k \leftarrow k + 1$ \\
-3.  While $a$ is divisible by $p$ do \\
-\hspace{3mm}3.1  $a \leftarrow \lfloor a / p \rfloor$ \\
-4.  While $b$ is divisible by $p$ do \\
-\hspace{3mm}4.1  $b \leftarrow \lfloor b / p \rfloor$ \\
-5.  While ($b > 0$) do \\
-\hspace{3mm}5.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
-\hspace{3mm}5.2  $b \leftarrow b - a$ \\
-\hspace{3mm}5.3  While $b$ is divisible by $p$ do \\
-\hspace{6mm}5.3.1  $b \leftarrow \lfloor b / p \rfloor$ \\
-6.  Return($a \cdot p^k$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Greatest Common Divisor (III)}
-\label{fig:gcd3}
-\end{figure}
-
-This algorithm is based on the first except it removes powers of $p$ first and inside the main loop to ensure the tuple $\left < a, b \right >$
-decreases more rapidly.  The first loop on step two removes powers of $p$ that are in common.  A count, $k$, is kept which will present a common
-divisor of $p^k$.  After step two the remaining common divisor of $a$ and $b$ cannot be divisible by $p$.  This means that $p$ can be safely
-divided out of the difference $b - a$ so long as the division leaves no remainder.
-
-In particular the value of $p$ should be chosen such that the division on step 5.3.1 occur often.  It also helps that division by $p$ be easy
-to compute.  The ideal choice of $p$ is two since division by two amounts to a right logical shift.  Another important observation is that by
-step five both $a$ and $b$ are odd.  Therefore, the difference $b - a$ must be even which means that each iteration removes one bit from the
-largest of the pair.
-
-\subsection{Complete Greatest Common Divisor}
-The algorithms presented so far cannot handle inputs which are zero or negative.  The following algorithm can handle all input cases properly
-and will produce the greatest common divisor.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_gcd}. \\
-\textbf{Input}.   mp\_int $a$ and $b$ \\
-\textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
-\hline \\
-1.  If $a = 0$ then \\
-\hspace{3mm}1.1  $c \leftarrow \vert b \vert $ \\
-\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $b = 0$ then \\
-\hspace{3mm}2.1  $c \leftarrow \vert a \vert $ \\
-\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
-3.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
-4.  $k \leftarrow 0$ \\
-5.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
-\hspace{3mm}5.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-\hspace{3mm}5.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-8.  While $v.used > 0$ \\
-\hspace{3mm}8.1  If $\vert u \vert > \vert v \vert$ then \\
-\hspace{6mm}8.1.1  Swap $u$ and $v$. \\
-\hspace{3mm}8.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
-\hspace{3mm}8.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{6mm}8.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-9.  $c \leftarrow u \cdot 2^k$ \\
-10.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_gcd}
-\end{figure}
-\textbf{Algorithm mp\_gcd.}
-This algorithm will produce the greatest common divisor of two mp\_ints $a$ and $b$.  The algorithm was originally based on Algorithm B of
-Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
-Algorithm B and in practice this appears to be true.
-
-The first two steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the
-largest input or zero if they are both zero.  If the inputs are not trivial then $u$ and $v$ are assigned the absolute values of
-$a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
-
-Step five will divide out any common factors of two and keep track of the count in the variable $k$.  After this step, two is no longer a
-factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Step
-six and seven ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while--loops will iterate since
-they cannot both be even.
-
-By step eight both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
-or greater than $u$.  This ensures that the subtraction on step 8.2 will always produce a positive and even result.  Step 8.3 removes any
-factors of two from the difference $v$ to ensure that in the next iteration of the loop both are once again odd.
-
-After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
-must be adjusted by multiplying by the common factors of two ($2^k$) removed earlier.
-
-EXAM,bn_mp_gcd.c
-
-This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the
-integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
-it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three
-trivial cases of inputs are handled on lines @23,zero@ through @29,}@.  After those lines the inputs are assumed to be non-zero.
-
-Lines @32,if@ and @36,if@ make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two
-must be divided out of the two inputs.  The block starting at line @43,common@ removes common factors of two by first counting the number of trailing
-zero bits in both.  The local integer $k$ is used to keep track of how many factors of $2$ are pulled out of both values.  It is assumed that
-the number of factors will not exceed the maximum value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more
-entries than are accessible by an ``int'' so this is not a limitation.}.
-
-At this point there are no more common factors of two in the two values.  The divisions by a power of two on lines @60,div_2d@ and @67,div_2d@ remove
-any independent factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
-on line @72, while@ performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
-place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
-
-\section{Least Common Multiple}
-The least common multiple of a pair of integers is their product divided by their greatest common divisor.  For two integers $a$ and $b$ the
-least common multiple is normally denoted as $[ a, b ]$ and numerically equivalent to ${ab} \over {(a, b)}$.  For example, if $a = 2 \cdot 2 \cdot 3 = 12$
-and $b = 2 \cdot 3 \cdot 3 \cdot 7 = 126$ the least common multiple is ${{12 \cdot 126} \over {(12, 126)}} = {1512 \over 6} = 252$.
-
-The least common multiple arises often in coding theory as well as number theory.  If two functions have periods of $a$ and $b$ respectively they will
-collide, that is be in synchronous states, after only $[ a, b ]$ iterations.  This is why, for example, random number generators based on
-Linear Feedback Shift Registers (LFSR) tend to use registers with periods which are co-prime (\textit{e.g. the greatest common divisor is one.}).
-Similarly in number theory if a composite $n$ has two prime factors $p$ and $q$ then the maximal order of any unit of $\Z/n\Z$ will be $[ p - 1, q - 1] $.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_lcm}. \\
-\textbf{Input}.   mp\_int $a$ and $b$ \\
-\textbf{Output}.  The least common multiple $c = [a, b]$.  \\
-\hline \\
-1.  $c \leftarrow (a, b)$ \\
-2.  $t \leftarrow a \cdot b$ \\
-3.  $c \leftarrow \lfloor t / c \rfloor$ \\
-4.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_lcm}
-\end{figure}
-\textbf{Algorithm mp\_lcm.}
-This algorithm computes the least common multiple of two mp\_int inputs $a$ and $b$.  It computes the least common multiple directly by
-dividing the product of the two inputs by their greatest common divisor.
-
-EXAM,bn_mp_lcm.c
-
-\section{Jacobi Symbol Computation}
-To explain the Jacobi Symbol we shall first discuss the Legendre function\footnote{Arrg.  What is the name of this?} off which the Jacobi symbol is
-defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
-equivalent to equation \ref{eqn:legendre}.
-
-\textit{-- Tom, don't be an ass, cite your source here...!}
-
-\begin{equation}
-a^{(p-1)/2} \equiv \begin{array}{rl}
-                              -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
-                              0  &  \mbox{if }p\mbox{ divides }a\mbox{.} \\
-                              1  &  \mbox{if }a\mbox{ is a quadratic residue}.
-                              \end{array} \mbox{ (mod }p\mbox{)}
-\label{eqn:legendre}
-\end{equation}
-
-\textbf{Proof.} \textit{Equation \ref{eqn:legendre} correctly identifies the residue status of an integer $a$ modulo a prime $p$.}
-An integer $a$ is a quadratic residue if the following equation has a solution.
-
-\begin{equation}
-x^2 \equiv a \mbox{ (mod }p\mbox{)}
-\label{eqn:root}
-\end{equation}
-
-Consider the following equation.
-
-\begin{equation}
-0 \equiv x^{p-1} - 1 \equiv \left \lbrace \left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \right \rbrace + \left ( a^{(p-1)/2} - 1 \right ) \mbox{ (mod }p\mbox{)}
-\label{eqn:rooti}
-\end{equation}
-
-Whether equation \ref{eqn:root} has a solution or not equation \ref{eqn:rooti} is always true.  If $a^{(p-1)/2} - 1 \equiv 0 \mbox{ (mod }p\mbox{)}$
-then the quantity in the braces must be zero.  By reduction,
-
-\begin{eqnarray}
-\left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \equiv 0  \nonumber \\
-\left (x^2 \right )^{(p-1)/2} \equiv a^{(p-1)/2} \nonumber \\
-x^2 \equiv a \mbox{ (mod }p\mbox{)}
-\end{eqnarray}
-
-As a result there must be a solution to the quadratic equation and in turn $a$ must be a quadratic residue.  If $p$ does not divide $a$ and $a$
-is not a quadratic residue then the only other value $a^{(p-1)/2}$ may be congruent to is $-1$ since
-\begin{equation}
-0 \equiv a^{p - 1} - 1 \equiv (a^{(p-1)/2} + 1)(a^{(p-1)/2} - 1) \mbox{ (mod }p\mbox{)}
-\end{equation}
-One of the terms on the right hand side must be zero.  \textbf{QED}
-
-\subsection{Jacobi Symbol}
-The Jacobi symbol is a generalization of the Legendre function for any odd non prime moduli $p$ greater than 2.  If $p = \prod_{i=0}^n p_i$ then
-the Jacobi symbol $\left ( { a \over p } \right )$ is equal to the following equation.
-
-\begin{equation}
-\left ( { a \over p } \right ) = \left ( { a \over p_0} \right ) \left ( { a \over p_1} \right ) \ldots \left ( { a \over p_n} \right )
-\end{equation}
-
-By inspection if $p$ is prime the Jacobi symbol is equivalent to the Legendre function.  The following facts\footnote{See HAC \cite[pp. 72-74]{HAC} for
-further details.} will be used to derive an efficient Jacobi symbol algorithm.  Where $p$ is an odd integer greater than two and $a, b \in \Z$ the
-following are true.
-
-\begin{enumerate}
-\item $\left ( { a \over p} \right )$ equals $-1$, $0$ or $1$.
-\item $\left ( { ab \over p} \right ) = \left ( { a \over p} \right )\left ( { b \over p} \right )$.
-\item If $a \equiv b$ then $\left ( { a \over p} \right ) = \left ( { b \over p} \right )$.
-\item $\left ( { 2 \over p} \right )$ equals $1$ if $p \equiv 1$ or $7 \mbox{ (mod }8\mbox{)}$.  Otherwise, it equals $-1$.
-\item $\left ( { a \over p} \right ) \equiv \left ( { p \over a} \right ) \cdot (-1)^{(p-1)(a-1)/4}$.  More specifically
-$\left ( { a \over p} \right ) = \left ( { p \over a} \right )$ if $p \equiv a \equiv 1 \mbox{ (mod }4\mbox{)}$.
-\end{enumerate}
-
-Using these facts if $a = 2^k \cdot a'$ then
-
-\begin{eqnarray}
-\left ( { a \over p } \right ) = \left ( {{2^k} \over p } \right ) \left ( {a' \over p} \right ) \nonumber \\
-                               = \left ( {2 \over p } \right )^k \left ( {a' \over p} \right )
-\label{eqn:jacobi}
-\end{eqnarray}
-
-By fact five,
-
-\begin{equation}
-\left ( { a \over p } \right ) = \left ( { p \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4}
-\end{equation}
-
-Subsequently by fact three since $p \equiv (p \mbox{ mod }a) \mbox{ (mod }a\mbox{)}$ then
-
-\begin{equation}
-\left ( { a \over p } \right ) = \left ( { {p \mbox{ mod } a} \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4}
-\end{equation}
-
-By putting both observations into equation \ref{eqn:jacobi} the following simplified equation is formed.
-
-\begin{equation}
-\left ( { a \over p } \right ) = \left ( {2 \over p } \right )^k \left ( {{p\mbox{ mod }a'} \over a'} \right )  \cdot (-1)^{(p-1)(a'-1)/4}
-\end{equation}
-
-The value of $\left ( {{p \mbox{ mod }a'} \over a'} \right )$ can be found by using the same equation recursively.  The value of
-$\left ( {2 \over p } \right )^k$ equals $1$ if $k$ is even otherwise it equals $\left ( {2 \over p } \right )$.  Using this approach the
-factors of $p$ do not have to be known.  Furthermore, if $(a, p) = 1$ then the algorithm will terminate when the recursion requests the
-Jacobi symbol computation of $\left ( {1 \over a'} \right )$ which is simply $1$.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_jacobi}. \\
-\textbf{Input}.   mp\_int $a$ and $p$, $a \ge 0$, $p \ge 3$, $p \equiv 1 \mbox{ (mod }2\mbox{)}$ \\
-\textbf{Output}.  The Jacobi symbol $c = \left ( {a \over p } \right )$. \\
-\hline \\
-1.  If $a = 0$ then \\
-\hspace{3mm}1.1  $c \leftarrow 0$ \\
-\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $a = 1$ then \\
-\hspace{3mm}2.1  $c \leftarrow 1$ \\
-\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
-3.  $a' \leftarrow a$ \\
-4.  $k \leftarrow 0$ \\
-5.  While $a'.used > 0$ and $a'_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
-\hspace{3mm}5.2  $a' \leftarrow \lfloor a' / 2 \rfloor$ \\
-6.  If $k \equiv 0 \mbox{ (mod }2\mbox{)}$ then \\
-\hspace{3mm}6.1  $s \leftarrow 1$ \\
-7.  else \\
-\hspace{3mm}7.1  $r \leftarrow p_0 \mbox{ (mod }8\mbox{)}$ \\
-\hspace{3mm}7.2  If $r = 1$ or $r = 7$ then \\
-\hspace{6mm}7.2.1  $s \leftarrow 1$ \\
-\hspace{3mm}7.3  else \\
-\hspace{6mm}7.3.1  $s \leftarrow -1$ \\
-8.  If $p_0 \equiv a'_0 \equiv 3 \mbox{ (mod }4\mbox{)}$ then \\
-\hspace{3mm}8.1  $s \leftarrow -s$ \\
-9.  If $a' \ne 1$ then \\
-\hspace{3mm}9.1  $p' \leftarrow p \mbox{ (mod }a'\mbox{)}$ \\
-\hspace{3mm}9.2  $s \leftarrow s \cdot \mbox{mp\_jacobi}(p', a')$ \\
-10.  $c \leftarrow s$ \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_jacobi}
-\end{figure}
-\textbf{Algorithm mp\_jacobi.}
-This algorithm computes the Jacobi symbol for an arbitrary non-negative integer $a$ with respect to an odd integer $p$ greater than or equal to three.  The algorithm
-is based on algorithm 2.149 of HAC \cite[pp. 73]{HAC}.
-
-Step numbers one and two handle the trivial cases of $a = 0$ and $a = 1$ respectively.  Step five determines the number of two factors in the
-input $a$.  If $k$ is even then the term $\left ( { 2 \over p } \right )^k$ must always evaluate to one.  If $k$ is odd then the term evaluates to one
-if $p_0$ is congruent to one or seven modulo eight, otherwise it evaluates to $-1$. After the $\left ( { 2 \over p } \right )^k$ term is handled
-the term $(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against the current product $s$.  The latter term evaluates to negative one if both $p$ and $a'$
-are congruent to three modulo four, otherwise it evaluates to one.
-
-By step nine if $a'$ does not equal one a recursion is required.  Step 9.1 computes $p' \equiv p \mbox{ (mod }a'\mbox{)}$ and will recurse to compute
-$\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi product.
-
-EXAM,bn_mp_jacobi.c
-
-As a matter of practicality the variable $a'$ as per the pseudo-code is represented by the variable $a1$ since the $'$ symbol is not valid for a C
-variable name character.
-
-The two simple cases of $a = 0$ and $a = 1$ are handled at the very beginning to simplify the algorithm.  If the input is non-trivial the algorithm
-has to proceed to compute the Jacobi symbol.  The variable $s$ is used to hold the current Jacobi product.  Note that $s$ is merely a C ``int'' data type since
-the values it may obtain are merely $-1$, $0$ and $1$.
-
-After a local copy of $a$ is made all of the factors of two are divided out and the total stored in $k$.  Technically only the least significant
-bit of $k$ is required, however, it makes the algorithm simpler to follow to perform an addition. In practice an exclusive-or and addition have the same
-processor requirements and neither is faster than the other.
-
-Line @59, if@ through @70, }@ determines the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero then
-$k$ is even and the value is one.  Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight.  The value of
-$(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against $s$ on lines @73, if@ through @75, }@.
-
-Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$.
-
-\textit{-- Comment about default $s$ and such...}
-
-\section{Modular Inverse}
-\label{sec:modinv}
-The modular inverse of a number actually refers to the modular multiplicative inverse.  Essentially for any integer $a$ such that $(a, p) = 1$ there
-exists another integer $b$ such that $ab \equiv 1 \mbox{ (mod }p\mbox{)}$.  The integer $b$ is called the multiplicative inverse of $a$ which is
-denoted as $b = a^{-1}$.  Technically speaking modular inversion is a well defined operation for any finite ring or field not just for rings and
-fields of integers.  However, the former will be the matter of discussion.
-
-The simplest approach is to compute the algebraic inverse of the input.  That is to compute $b \equiv a^{\Phi(p) - 1}$.  If $\Phi(p)$ is the
-order of the multiplicative subgroup modulo $p$ then $b$ must be the multiplicative inverse of $a$.  The proof of which is trivial.
-
-\begin{equation}
-ab \equiv a \left (a^{\Phi(p) - 1} \right ) \equiv a^{\Phi(p)} \equiv a^0 \equiv 1 \mbox{ (mod }p\mbox{)}
-\end{equation}
-
-However, as simple as this approach may be it has two serious flaws.  It requires that the value of $\Phi(p)$ be known which if $p$ is composite
-requires all of the prime factors.  This approach also is very slow as the size of $p$ grows.
-
-A simpler approach is based on the observation that solving for the multiplicative inverse is equivalent to solving the linear
-Diophantine\footnote{See LeVeque \cite[pp. 40-43]{LeVeque} for more information.} equation.
-
-\begin{equation}
-ab + pq = 1
-\end{equation}
-
-Where $a$, $b$, $p$ and $q$ are all integers.  If such a pair of integers $ \left < b, q \right >$ exists then $b$ is the multiplicative inverse of
-$a$ modulo $p$.  The extended Euclidean algorithm (Knuth \cite[pp. 342]{TAOCPV2}) can be used to solve such equations provided $(a, p) = 1$.
-However, instead of using that algorithm directly a variant known as the binary Extended Euclidean algorithm will be used in its place.  The
-binary approach is very similar to the binary greatest common divisor algorithm except it will produce a full solution to the Diophantine
-equation.
-
-\subsection{General Case}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_invmod}. \\
-\textbf{Input}.   mp\_int $a$ and $b$, $(a, b) = 1$, $b \ge 2$, $0 < a < b$.  \\
-\textbf{Output}.  The modular inverse $c \equiv a^{-1} \mbox{ (mod }b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then return(\textit{MP\_VAL}). \\
-2.  If $b_0 \equiv 1 \mbox{ (mod }2\mbox{)}$ then use algorithm fast\_mp\_invmod. \\
-3.  $x \leftarrow \vert a \vert, y \leftarrow b$ \\
-4.  If $x_0 \equiv y_0  \equiv 0 \mbox{ (mod }2\mbox{)}$ then return(\textit{MP\_VAL}). \\
-5.  $u \leftarrow x, v \leftarrow y, B \leftarrow 0, C \leftarrow 0, A \leftarrow 1, D \leftarrow 1$ \\
-6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-\hspace{3mm}6.2  If ($A.used > 0$ and $A_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($B.used > 0$ and $B_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
-\hspace{6mm}6.2.1  $A \leftarrow A + y$ \\
-\hspace{6mm}6.2.2  $B \leftarrow B - x$ \\
-\hspace{3mm}6.3  $A \leftarrow \lfloor A / 2 \rfloor$ \\
-\hspace{3mm}6.4  $B \leftarrow \lfloor B / 2 \rfloor$ \\
-7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-\hspace{3mm}7.2  If ($C.used > 0$ and $C_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($D.used > 0$ and $D_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
-\hspace{6mm}7.2.1  $C \leftarrow C + y$ \\
-\hspace{6mm}7.2.2  $D \leftarrow D - x$ \\
-\hspace{3mm}7.3  $C \leftarrow \lfloor C / 2 \rfloor$ \\
-\hspace{3mm}7.4  $D \leftarrow \lfloor D / 2 \rfloor$ \\
-8.  If $u \ge v$ then \\
-\hspace{3mm}8.1  $u \leftarrow u - v$ \\
-\hspace{3mm}8.2  $A \leftarrow A - C$ \\
-\hspace{3mm}8.3  $B \leftarrow B - D$ \\
-9.  else \\
-\hspace{3mm}9.1  $v \leftarrow v - u$ \\
-\hspace{3mm}9.2  $C \leftarrow C - A$ \\
-\hspace{3mm}9.3  $D \leftarrow D - B$ \\
-10.  If $u \ne 0$ goto step 6. \\
-11.  If $v \ne 1$ return(\textit{MP\_VAL}). \\
-12.  While $C \le 0$ do \\
-\hspace{3mm}12.1  $C \leftarrow C + b$ \\
-13.  While $C \ge b$ do \\
-\hspace{3mm}13.1  $C \leftarrow C - b$ \\
-14.  $c \leftarrow C$ \\
-15.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\end{figure}
-\textbf{Algorithm mp\_invmod.}
-This algorithm computes the modular multiplicative inverse of an integer $a$ modulo an integer $b$.  This algorithm is a variation of the
-extended binary Euclidean algorithm from HAC \cite[pp. 608]{HAC}.  It has been modified to only compute the modular inverse and not a complete
-Diophantine solution.
-
-If $b \le 0$ then the modulus is invalid and MP\_VAL is returned.  Similarly if both $a$ and $b$ are even then there cannot be a multiplicative
-inverse for $a$ and the error is reported.
-
-The astute reader will observe that steps six through nine are very similar to the binary greatest common divisor algorithm mp\_gcd.  In this case
-the other variables to the Diophantine equation are solved.  The algorithm terminates when $u = 0$ in which case the solution is
-
-\begin{equation}
-Ca + Db = v
-\end{equation}
-
-If $v$, the greatest common divisor of $a$ and $b$ is not equal to one then the algorithm will report an error as no inverse exists.  Otherwise, $C$
-is the modular inverse of $a$.  The actual value of $C$ is congruent to, but not necessarily equal to, the ideal modular inverse which should lie
-within $1 \le a^{-1} < b$.  Step numbers twelve and thirteen adjust the inverse until it is in range.  If the original input $a$ is within $0 < a < b$
-then only a couple of additions or subtractions will be required to adjust the inverse.
-
-EXAM,bn_mp_invmod.c
-
-\subsubsection{Odd Moduli}
-
-When the modulus $b$ is odd the variables $A$ and $C$ are fixed and are not required to compute the inverse.  In particular by attempting to solve
-the Diophantine $Cb + Da = 1$ only $B$ and $D$ are required to find the inverse of $a$.
-
-The algorithm fast\_mp\_invmod is a direct adaptation of algorithm mp\_invmod with all steps involving either $A$ or $C$ removed.  This
-optimization will halve the time required to compute the modular inverse.
-
-\section{Primality Tests}
-
-A non-zero integer $a$ is said to be prime if it is not divisible by any other integer excluding one and itself.  For example, $a = 7$ is prime
-since the integers $2 \ldots 6$ do not evenly divide $a$.  By contrast, $a = 6$ is not prime since $a = 6 = 2 \cdot 3$.
-
-Prime numbers arise in cryptography considerably as they allow finite fields to be formed.  The ability to determine whether an integer is prime or
-not quickly has been a viable subject in cryptography and number theory for considerable time.  The algorithms that will be presented are all
-probabilistic algorithms in that when they report an integer is composite it must be composite.  However, when the algorithms report an integer is
-prime the algorithm may be incorrect.
-
-As will be discussed it is possible to limit the probability of error so well that for practical purposes the probability of error might as
-well be zero.  For the purposes of these discussions let $n$ represent the candidate integer of which the primality is in question.
-
-\subsection{Trial Division}
-
-Trial division means to attempt to evenly divide a candidate integer by small prime integers.  If the candidate can be evenly divided it obviously
-cannot be prime.  By dividing by all primes $1 < p \le \sqrt{n}$ this test can actually prove whether an integer is prime.  However, such a test
-would require a prohibitive amount of time as $n$ grows.
-
-Instead of dividing by every prime, a smaller, more manageable set of primes may be used instead.  By performing trial division with only a subset
-of the primes less than $\sqrt{n} + 1$ the algorithm cannot prove if a candidate is prime.  However, often it can prove a candidate is not prime.
-
-The benefit of this test is that trial division by small values is fairly efficient, especially when compared to the other algorithms that will be
-discussed shortly.  The probability that this approach correctly identifies a composite candidate when tested with all primes up to $q$ is given by
-$1 - {1.12 \over ln(q)}$.  The graph (\ref{pic:primality}, will be added later) demonstrates the probability of success for the range
-$3 \le q \le 100$.
-
-At approximately $q = 30$ the gain of performing further tests diminishes fairly quickly.  At $q = 90$ further testing is generally not going to
-be of any practical use.  In the case of LibTomMath the default limit $q = 256$ was chosen since it is not too high and will eliminate
-approximately $80\%$ of all candidate integers.  The constant \textbf{PRIME\_SIZE} is equal to the number of primes in the test base.  The
-array \_\_prime\_tab is an array of the first \textbf{PRIME\_SIZE} prime numbers.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_prime\_is\_divisible}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $c = 1$ if $n$ is divisible by a small prime, otherwise $c = 0$.  \\
-\hline \\
-1.  for $ix$ from $0$ to $PRIME\_SIZE - 1$ do \\
-\hspace{3mm}1.1  $d \leftarrow n \mbox{ (mod }\_\_prime\_tab_{ix}\mbox{)}$ \\
-\hspace{3mm}1.2  If $d = 0$ then \\
-\hspace{6mm}1.2.1  $c \leftarrow 1$ \\
-\hspace{6mm}1.2.2  Return(\textit{MP\_OKAY}). \\
-2.  $c \leftarrow 0$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_prime\_is\_divisible}
-\end{figure}
-\textbf{Algorithm mp\_prime\_is\_divisible.}
-This algorithm attempts to determine if a candidate integer $n$ is composite by performing trial divisions.
-
-EXAM,bn_mp_prime_is_divisible.c
-
-The algorithm defaults to a return of $0$ in case an error occurs.  The values in the prime table are all specified to be in the range of a
-mp\_digit.  The table \_\_prime\_tab is defined in the following file.
-
-EXAM,bn_prime_tab.c
-
-Note that there are two possible tables.  When an mp\_digit is 7-bits long only the primes up to $127$ may be included, otherwise the primes
-up to $1619$ are used.  Note that the value of \textbf{PRIME\_SIZE} is a constant dependent on the size of a mp\_digit.
-
-\subsection{The Fermat Test}
-The Fermat test is probably one of the oldest tests to have a non-trivial probability of success.  It is based on the fact that if $n$ is in
-fact prime then $a^{n} \equiv a \mbox{ (mod }n\mbox{)}$ for all $0 < a < n$.  The reason being that if $n$ is prime then the order of
-the multiplicative sub group is $n - 1$.  Any base $a$ must have an order which divides $n - 1$ and as such $a^n$ is equivalent to
-$a^1 = a$.
-
-If $n$ is composite then any given base $a$ does not have to have a period which divides $n - 1$.  In which case
-it is possible that $a^n \nequiv a \mbox{ (mod }n\mbox{)}$.  However, this test is not absolute as it is possible that the order
-of a base will divide $n - 1$ which would then be reported as prime.  Such a base yields what is known as a Fermat pseudo-prime.  Several
-integers known as Carmichael numbers will be a pseudo-prime to all valid bases.  Fortunately such numbers are extremely rare as $n$ grows
-in size.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_prime\_fermat}. \\
-\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
-\textbf{Output}.  $c = 1$ if $b^a \equiv b \mbox{ (mod }a\mbox{)}$, otherwise $c = 0$.  \\
-\hline \\
-1.  $t \leftarrow b^a \mbox{ (mod }a\mbox{)}$ \\
-2.  If $t = b$ then \\
-\hspace{3mm}2.1  $c = 1$ \\
-3.  else \\
-\hspace{3mm}3.1  $c = 0$ \\
-4.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_prime\_fermat}
-\end{figure}
-\textbf{Algorithm mp\_prime\_fermat.}
-This algorithm determines whether an mp\_int $a$ is a Fermat prime to the base $b$ or not.  It uses a single modular exponentiation to
-determine the result.
-
-EXAM,bn_mp_prime_fermat.c
-
-\subsection{The Miller-Rabin Test}
-The Miller-Rabin (citation) test is another primality test which has tighter error bounds than the Fermat test specifically with sequentially chosen
-candidate  integers.  The algorithm is based on the observation that if $n - 1 = 2^kr$ and if $b^r \nequiv \pm 1$ then after upto $k - 1$ squarings the
-value must be equal to $-1$.  The squarings are stopped as soon as $-1$ is observed.  If the value of $1$ is observed first it means that
-some value not congruent to $\pm 1$ when squared equals one which cannot occur if $n$ is prime.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_prime\_miller\_rabin}. \\
-\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
-\textbf{Output}.  $c = 1$ if $a$ is a Miller-Rabin prime to the base $b$, otherwise $c = 0$.  \\
-\hline
-1.  $a' \leftarrow a - 1$ \\
-2.  $r  \leftarrow a'$    \\
-3.  $c \leftarrow 0, s  \leftarrow 0$ \\
-4.  While $r.used > 0$ and $r_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}4.1  $s \leftarrow s + 1$ \\
-\hspace{3mm}4.2  $r \leftarrow \lfloor r / 2 \rfloor$ \\
-5.  $y \leftarrow b^r \mbox{ (mod }a\mbox{)}$ \\
-6.  If $y \nequiv \pm 1$ then \\
-\hspace{3mm}6.1  $j \leftarrow 1$ \\
-\hspace{3mm}6.2  While $j \le (s - 1)$ and $y \nequiv a'$ \\
-\hspace{6mm}6.2.1  $y \leftarrow y^2 \mbox{ (mod }a\mbox{)}$ \\
-\hspace{6mm}6.2.2  If $y = 1$ then goto step 8. \\
-\hspace{6mm}6.2.3  $j \leftarrow j + 1$ \\
-\hspace{3mm}6.3  If $y \nequiv a'$ goto step 8. \\
-7.  $c \leftarrow 1$\\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_prime\_miller\_rabin}
-\end{figure}
-\textbf{Algorithm mp\_prime\_miller\_rabin.}
-This algorithm performs one trial round of the Miller-Rabin algorithm to the base $b$.  It will set $c = 1$ if the algorithm cannot determine
-if $a$ is composite or $c = 0$ if $a$ is provably composite.  The values of $s$ and $r$ are computed such that $a' = a - 1 = 2^sr$.
-
-If the value $y \equiv b^r$ is congruent to $\pm 1$ then the algorithm cannot prove if $a$ is composite or not.  Otherwise, the algorithm will
-square $y$ upto $s - 1$ times stopping only when $y \equiv -1$.  If $y^2 \equiv 1$ and $y \nequiv \pm 1$ then the algorithm can report that $a$
-is provably composite.  If the algorithm performs $s - 1$ squarings and $y \nequiv -1$ then $a$ is provably composite.  If $a$ is not provably
-composite then it is \textit{probably} prime.
-
-EXAM,bn_mp_prime_miller_rabin.c
-
-
-
-
-\backmatter
-\appendix
-\begin{thebibliography}{ABCDEF}
-\bibitem[1]{TAOCPV2}
-Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
-
-\bibitem[2]{HAC}
-A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
-
-\bibitem[3]{ROSE}
-Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
-
-\bibitem[4]{COMBA}
-Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
-
-\bibitem[5]{KARA}
-A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294
-
-\bibitem[6]{KARAP}
-Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
-
-\bibitem[7]{BARRETT}
-Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
-
-\bibitem[8]{MONT}
-P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985.
-
-\bibitem[9]{DRMET}
-Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
-
-\bibitem[10]{MMB}
-J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
-
-\bibitem[11]{RSAREF}
-R.L. Rivest, A. Shamir, L. Adleman, \textit{A Method for Obtaining Digital Signatures and Public-Key Cryptosystems}
-
-\bibitem[12]{DHREF}
-Whitfield Diffie, Martin E. Hellman, \textit{New Directions in Cryptography}, IEEE Transactions on Information Theory, 1976
-
-\bibitem[13]{IEEE}
-IEEE Standard for Binary Floating-Point Arithmetic (ANSI/IEEE Std 754-1985)
-
-\bibitem[14]{GMP}
-GNU Multiple Precision (GMP), \url{http://www.swox.com/gmp/}
-
-\bibitem[15]{MPI}
-Multiple Precision Integer Library (MPI), Michael Fromberger, \url{http://thayer.dartmouth.edu/~sting/mpi/}
-
-\bibitem[16]{OPENSSL}
-OpenSSL Cryptographic Toolkit, \url{http://openssl.org}
-
-\bibitem[17]{LIP}
-Large Integer Package, \url{http://home.hetnet.nl/~ecstr/LIP.zip}
-
-\bibitem[18]{ISOC}
-JTC1/SC22/WG14, ISO/IEC 9899:1999, ``A draft rationale for the C99 standard.''
-
-\bibitem[19]{JAVA}
-The Sun Java Website, \url{http://java.sun.com/}
-
-\end{thebibliography}
-
-\input{tommath.ind}
-
-\end{document}
diff --git a/libtommath/tommath.tex b/libtommath/tommath.tex
deleted file mode 100644
index d70b64b..0000000
--- a/libtommath/tommath.tex
+++ /dev/null
@@ -1,10816 +0,0 @@
-\documentclass[b5paper]{book}
-\usepackage{hyperref}
-\usepackage{makeidx}
-\usepackage{amssymb}
-\usepackage{color}
-\usepackage{alltt}
-\usepackage{graphicx}
-\usepackage{layout}
-\def\union{\cup}
-\def\intersect{\cap}
-\def\getsrandom{\stackrel{\rm R}{\gets}}
-\def\cross{\times}
-\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
-\def\catn{$\|$}
-\def\divides{\hspace{0.3em} | \hspace{0.3em}}
-\def\nequiv{\not\equiv}
-\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
-\def\lcm{{\rm lcm}}
-\def\gcd{{\rm gcd}}
-\def\log{{\rm log}}
-\def\ord{{\rm ord}}
-\def\abs{{\mathit abs}}
-\def\rep{{\mathit rep}}
-\def\mod{{\mathit\ mod\ }}
-\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
-\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
-\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
-\def\Or{{\rm\ or\ }}
-\def\And{{\rm\ and\ }}
-\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
-\def\implies{\Rightarrow}
-\def\undefined{{\rm ``undefined"}}
-\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
-\let\oldphi\phi
-\def\phi{\varphi}
-\def\Pr{{\rm Pr}}
-\newcommand{\str}[1]{{\mathbf{#1}}}
-\def\F{{\mathbb F}}
-\def\N{{\mathbb N}}
-\def\Z{{\mathbb Z}}
-\def\R{{\mathbb R}}
-\def\C{{\mathbb C}}
-\def\Q{{\mathbb Q}}
-\definecolor{DGray}{gray}{0.5}
-\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
-\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
-\def\gap{\vspace{0.5ex}}
-\makeindex
-\begin{document}
-\frontmatter
-\pagestyle{empty}
-\title{Multi--Precision Math}
-\author{\mbox{
-%\begin{small}
-\begin{tabular}{c}
-Tom St Denis \\
-Algonquin College \\
-\\
-Mads Rasmussen \\
-Open Communications Security \\
-\\
-Greg Rose \\
-QUALCOMM Australia \\
-\end{tabular}
-%\end{small}
-}
-}
-\maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.39 release of the
-LibTomMath project.
-
-This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{}
-{\em book} macro package and the Perl {\em booker} package.
-
-\tableofcontents
-\listoffigures
-\chapter*{Prefaces}
-When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.
-They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''
-Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which
-perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
-others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give
-back to society in the form of tools and knowledge that can help others in their endeavours.
-
-I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
-code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
-explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
-itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
-of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
-from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
-
-This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
-of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
-length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
-comprises of literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg
-were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to
-continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.
-
-To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
-honour your kind gestures with this project.
-
-Open Source.  Open Academia.  Open Minds.
-
-\begin{flushright} Tom St Denis \end{flushright}
-
-\newpage
-I found the opportunity to work with Tom appealing for several reasons, not only could I broaden my own horizons, but also
-contribute to educate others facing the problem of having to handle big number mathematical calculations.
-
-This book is Tom's child and he has been caring and fostering the project ever since the beginning with a clear mind of
-how he wanted the project to turn out. I have helped by proofreading the text and we have had several discussions about
-the layout and language used.
-
-I hold a masters degree in cryptography from the University of Southern Denmark and have always been interested in the
-practical aspects of cryptography.
-
-Having worked in the security consultancy business for several years in S\~{a}o Paulo, Brazil, I have been in touch with a
-great deal of work in which multiple precision mathematics was needed. Understanding the possibilities for speeding up
-multiple precision calculations is often very important since we deal with outdated machine architecture where modular
-reductions, for example, become painfully slow.
-
-This text is for people who stop and wonder when examining algorithms such as RSA for the first time and ask
-themselves, ``You tell me this is only secure for large numbers, fine; but how do you implement these numbers?''
-
-\begin{flushright}
-Mads Rasmussen
-
-S\~{a}o Paulo - SP
-
-Brazil
-\end{flushright}
-
-\newpage
-It's all because I broke my leg. That just happened to be at about the same time that Tom asked for someone to review the section of the book about
-Karatsuba multiplication. I was laid up, alone and immobile, and thought ``Why not?'' I vaguely knew what Karatsuba multiplication was, but not
-really, so I thought I could help, learn, and stop myself from watching daytime cable TV, all at once.
-
-At the time of writing this, I've still not met Tom or Mads in meatspace. I've been following Tom's progress since his first splash on the
-sci.crypt Usenet news group. I watched him go from a clueless newbie, to the cryptographic equivalent of a reformed smoker, to a real
-contributor to the field, over a period of about two years. I've been impressed with his obvious intelligence, and astounded by his productivity.
-Of course, he's young enough to be my own child, so he doesn't have my problems with staying awake.
-
-When I reviewed that single section of the book, in its very earliest form, I was very pleasantly surprised. So I decided to collaborate more fully,
-and at least review all of it, and perhaps write some bits too. There's still a long way to go with it, and I have watched a number of close
-friends go through the mill of publication, so I think that the way to go is longer than Tom thinks it is. Nevertheless, it's a good effort,
-and I'm pleased to be involved with it.
-
-\begin{flushright}
-Greg Rose, Sydney, Australia, June 2003.
-\end{flushright}
-
-\mainmatter
-\pagestyle{headings}
-\chapter{Introduction}
-\section{Multiple Precision Arithmetic}
-
-\subsection{What is Multiple Precision Arithmetic?}
-When we think of long-hand arithmetic such as addition or multiplication we rarely consider the fact that we instinctively
-raise or lower the precision of the numbers we are dealing with.  For example, in decimal we can almost immediately
-reason that $7$ times $6$ is $42$.  However, $42$ has two digits of precision as opposed to one digit we started with.
-Further multiplications of say $3$ result in a larger precision result $126$.  In these few examples we have multiple
-precisions for the numbers we are working with.  Despite the various levels of precision a single subset\footnote{With the occasional optimization.}
- of algorithms can be designed to accommodate them.
-
-By way of comparison a fixed or single precision operation would lose precision on various operations.  For example, in
-the decimal system with fixed precision $6 \cdot 7 = 2$.
-
-Essentially at the heart of computer based multiple precision arithmetic are the same long-hand algorithms taught in
-schools to manually add, subtract, multiply and divide.
-
-\subsection{The Need for Multiple Precision Arithmetic}
-The most prevalent need for multiple precision arithmetic, often referred to as ``bignum'' math, is within the implementation
-of public-key cryptography algorithms.   Algorithms such as RSA \cite{RSAREF} and Diffie-Hellman \cite{DHREF} require
-integers of significant magnitude to resist known cryptanalytic attacks.  For example, at the time of this writing a
-typical RSA modulus would be at least greater than $10^{309}$.  However, modern programming languages such as ISO C \cite{ISOC} and
-Java \cite{JAVA} only provide intrinsic support for integers which are relatively small and single precision.
-
-\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{|r|c|}
-\hline \textbf{Data Type} & \textbf{Range} \\
-\hline char  & $-128 \ldots 127$ \\
-\hline short & $-32768 \ldots 32767$ \\
-\hline long  & $-2147483648 \ldots 2147483647$ \\
-\hline long long & $-9223372036854775808 \ldots 9223372036854775807$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Typical Data Types for the C Programming Language}
-\label{fig:ISOC}
-\end{figure}
-
-The largest data type guaranteed to be provided by the ISO C programming
-language\footnote{As per the ISO C standard.  However, each compiler vendor is allowed to augment the precision as they
-see fit.}  can only represent values up to $10^{19}$ as shown in figure \ref{fig:ISOC}. On its own the C language is
-insufficient to accommodate the magnitude required for the problem at hand.  An RSA modulus of magnitude $10^{19}$ could be
-trivially factored\footnote{A Pollard-Rho factoring would take only $2^{16}$ time.} on the average desktop computer,
-rendering any protocol based on the algorithm insecure.  Multiple precision algorithms solve this very problem by
-extending the range of representable integers while using single precision data types.
-
-Most advancements in fast multiple precision arithmetic stem from the need for faster and more efficient cryptographic
-primitives.  Faster modular reduction and exponentiation algorithms such as Barrett's algorithm, which have appeared in
-various cryptographic journals, can render algorithms such as RSA and Diffie-Hellman more efficient.  In fact, several
-major companies such as RSA Security, Certicom and Entrust have built entire product lines on the implementation and
-deployment of efficient algorithms.
-
-However, cryptography is not the only field of study that can benefit from fast multiple precision integer routines.
-Another auxiliary use of multiple precision integers is high precision floating point data types.
-The basic IEEE \cite{IEEE} standard floating point type is made up of an integer mantissa $q$, an exponent $e$ and a sign bit $s$.
-Numbers are given in the form $n = q \cdot b^e \cdot (-1)^s$ where $b = 2$ is the most common base for IEEE.  Since IEEE
-floating point is meant to be implemented in hardware the precision of the mantissa is often fairly small
-(\textit{23, 52 and 64 bits}).  The mantissa is merely an integer and a multiple precision integer could be used to create
-a mantissa of much larger precision than hardware alone can efficiently support.  This approach could be useful where
-scientific applications must minimize the total output error over long calculations.
-
-Yet another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
-In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.
-
-\subsection{Benefits of Multiple Precision Arithmetic}
-\index{precision}
-The benefit of multiple precision representations over single or fixed precision representations is that
-no precision is lost while representing the result of an operation which requires excess precision.  For example,
-the product of two $n$-bit integers requires at least $2n$ bits of precision to be represented faithfully.  A multiple
-precision algorithm would augment the precision of the destination to accommodate the result while a single precision system
-would truncate excess bits to maintain a fixed level of precision.
-
-It is possible to implement algorithms which require large integers with fixed precision algorithms.  For example, elliptic
-curve cryptography (\textit{ECC}) is often implemented on smartcards by fixing the precision of the integers to the maximum
-size the system will ever need.  Such an approach can lead to vastly simpler algorithms which can accommodate the
-integers required even if the host platform cannot natively accommodate them\footnote{For example, the average smartcard
-processor has an 8 bit accumulator.}.  However, as efficient as such an approach may be, the resulting source code is not
-normally very flexible.  It cannot, at runtime, accommodate inputs of higher magnitude than the designer anticipated.
-
-Multiple precision algorithms have the most overhead of any style of arithmetic.  For the most part the
-overhead can be kept to a minimum with careful planning, but overall, it is not well suited for most memory starved
-platforms.  However, multiple precision algorithms do offer the most flexibility in terms of the magnitude of the
-inputs.  That is, the same algorithms based on multiple precision integers can accommodate any reasonable size input
-without the designer's explicit forethought.  This leads to lower cost of ownership for the code as it only has to
-be written and tested once.
-
-\section{Purpose of This Text}
-The purpose of this text is to instruct the reader regarding how to implement efficient multiple precision algorithms.
-That is to not only explain a limited subset of the core theory behind the algorithms but also the various ``house keeping''
-elements that are neglected by authors of other texts on the subject.  Several well renowned texts \cite{TAOCPV2,HAC}
-give considerably detailed explanations of the theoretical aspects of algorithms and often very little information
-regarding the practical implementation aspects.
-
-In most cases how an algorithm is explained and how it is actually implemented are two very different concepts.  For
-example, the Handbook of Applied Cryptography (\textit{HAC}), algorithm 14.7 on page 594, gives a relatively simple
-algorithm for performing multiple precision integer addition.  However, the description lacks any discussion concerning
-the fact that the two integer inputs may be of differing magnitudes.  As a result the implementation is not as simple
-as the text would lead people to believe.  Similarly the division routine (\textit{algorithm 14.20, pp. 598}) does not
-discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{step \#3}).
-
-Both texts also do not discuss several key optimal algorithms required such as ``Comba'' and Karatsuba multipliers
-and fast modular inversion, which we consider practical oversights.  These optimal algorithms are vital to achieve
-any form of useful performance in non-trivial applications.
-
-To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
-package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.com}} package is used
-to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field
-tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text
-discusses a very large portion of the inner workings of the library.
-
-The algorithms that are presented will always include at least one ``pseudo-code'' description followed
-by the actual C source code that implements the algorithm.  The pseudo-code can be used to implement the same
-algorithm in other programming languages as the reader sees fit.
-
-This text shall also serve as a walkthrough of the creation of multiple precision algorithms from scratch, showing
-the reader how the algorithms fit together as well as where to start on various taskings.
-
-\section{Discussion and Notation}
-\subsection{Notation}
-A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1}, \ldots, x_1, x_0)_{ \beta }$ and represent
-the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are said to be the radix $\beta$ digits
-of the integer.  For example, $x = (1,2,3)_{10}$ would represent the integer
-$1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.
-
-\index{mp\_int}
-The term ``mp\_int'' shall refer to a composite structure which contains the digits of the integer it represents, as well
-as auxiliary data required to manipulate the data.  These additional members are discussed further in section
-\ref{sec:MPINT}.  For the purposes of this text a ``multiple precision integer'' and an ``mp\_int'' are assumed to be
-synonymous.  When an algorithm is specified to accept an mp\_int variable it is assumed the various auxiliary data members
-are present as well.  An expression of the type \textit{variablename.item} implies that it should evaluate to the
-member named ``item'' of the variable.  For example, a string of characters may have a member ``length'' which would
-evaluate to the number of characters in the string.  If the string $a$ equals ``hello'' then it follows that
-$a.length = 5$.
-
-For certain discussions more generic algorithms are presented to help the reader understand the final algorithm used
-to solve a given problem.  When an algorithm is described as accepting an integer input it is assumed the input is
-a plain integer with no additional multiple-precision members.  That is, algorithms that use integers as opposed to
-mp\_ints as inputs do not concern themselves with the housekeeping operations required such as memory management.  These
-algorithms will be used to establish the relevant theory which will subsequently be used to describe a multiple
-precision algorithm to solve the same problem.
-
-\subsection{Precision Notation}
-The variable $\beta$ represents the radix of a single digit of a multiple precision integer and
-must be of the form $q^p$ for $q, p \in \Z^+$.  A single precision variable must be able to represent integers in
-the range $0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range
-$0 \le x < q \beta^2$.  The extra radix-$q$ factor allows additions and subtractions to proceed without truncation of the
-carry.  Since all modern computers are binary, it is assumed that $q$ is two.
-
-\index{mp\_digit} \index{mp\_word}
-Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent
-a single precision integer type, while, the data type \textbf{mp\_word} will represent a double precision integer type.  In
-several algorithms (notably the Comba routines) temporary results will be stored in arrays of double precision mp\_words.
-For the purposes of this text $x_j$ will refer to the $j$'th digit of a single precision array and $\hat x_j$ will refer to
-the $j$'th digit of a double precision array.  Whenever an expression is to be assigned to a double precision
-variable it is assumed that all single precision variables are promoted to double precision during the evaluation.
-Expressions that are assigned to a single precision variable are truncated to fit within the precision of a single
-precision data type.
-
-For example, if $\beta = 10^2$ a single precision data type may represent a value in the
-range $0 \le x < 10^3$, while a double precision data type may represent a value in the range $0 \le x < 10^5$.  Let
-$a = 23$ and $b = 49$ represent two single precision variables.  The single precision product shall be written
-as $c \leftarrow a \cdot b$ while the double precision product shall be written as $\hat c \leftarrow a \cdot b$.
-In this particular case, $\hat c = 1127$ and $c = 127$.  The most significant digit of the product would not fit
-in a single precision data type and as a result $c \ne \hat c$.
-
-\subsection{Algorithm Inputs and Outputs}
-Within the algorithm descriptions all variables are assumed to be scalars of either single or double precision
-as indicated.  The only exception to this rule is when variables have been indicated to be of type mp\_int.  This
-distinction is important as scalars are often used as array indices and various other counters.
-
-\subsection{Mathematical Expressions}
-The $\lfloor \mbox{ } \rfloor$ brackets imply an expression truncated to an integer not greater than the expression
-itself.  For example, $\lfloor 5.7 \rfloor = 5$.  Similarly the $\lceil \mbox{ } \rceil$ brackets imply an expression
-rounded to an integer not less than the expression itself.  For example, $\lceil 5.1 \rceil = 6$.  Typically when
-the $/$ division symbol is used the intention is to perform an integer division with truncation.  For example,
-$5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When an expression is written as a
-fraction a real value division is implied, for example ${5 \over 2} = 2.5$.
-
-The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
-of the integer.  For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.
-
-\subsection{Work Effort}
-\index{big-Oh}
-To measure the efficiency of the specified algorithms, a modified big-Oh notation is used.  In this system all
-single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.
-That is a single precision addition, multiplication and division are assumed to take the same time to
-complete.  While this is generally not true in practice, it will simplify the discussions considerably.
-
-Some algorithms have slight advantages over others which is why some constants will not be removed in
-the notation.  For example, a normal baseline multiplication (section \ref{sec:basemult}) requires $O(n^2)$ work while a
-baseline squaring (section \ref{sec:basesquare}) requires $O({{n^2 + n}\over 2})$ work.  In standard big-Oh notation these
-would both be said to be equivalent to $O(n^2)$.  However,
-in the context of this text this is not the case as the magnitude of the inputs will typically be rather small.  As a
-result small constant factors in the work effort will make an observable difference in algorithm efficiency.
-
-All of the algorithms presented in this text have a polynomial time work level.  That is, of the form
-$O(n^k)$ for $n, k \in \Z^{+}$.  This will help make useful comparisons in terms of the speed of the algorithms and how
-various optimizations will help pay off in the long run.
-
-\section{Exercises}
-Within the more advanced chapters a section will be set aside to give the reader some challenging exercises related to
-the discussion at hand.  These exercises are not designed to be prize winning problems, but instead to be thought
-provoking.  Wherever possible the problems are forward minded, stating problems that will be answered in subsequent
-chapters.  The reader is encouraged to finish the exercises as they appear to get a better understanding of the
-subject material.
-
-That being said, the problems are designed to affirm knowledge of a particular subject matter.  Students in particular
-are encouraged to verify they can answer the problems correctly before moving on.
-
-Similar to the exercises of \cite[pp. ix]{TAOCPV2} these exercises are given a scoring system based on the difficulty of
-the problem.  However, unlike \cite{TAOCPV2} the problems do not get nearly as hard.  The scoring of these
-exercises ranges from one (the easiest) to five (the hardest).  The following table summarizes the
-scoring system used.
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|l|}
-\hline $\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
-                            & minutes to solve.  Usually does not involve much computer time \\
-                            & to solve. \\
-\hline $\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
-                     & time usage.  Usually requires a program to be written to \\
-                     & solve the problem. \\
-\hline $\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
-                     & of work.  Usually involves trivial research and development of \\
-                     & new theory from the perspective of a student. \\
-\hline $\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
-                     & of work and research, the solution to which will demonstrate \\
-                     & a higher mastery of the subject matter. \\
-\hline $\left [ 5 \right ]$ & A hard problem that involves concepts that are difficult for a \\
-                     & novice to solve.  Solutions to these problems will demonstrate a \\
-                     & complete mastery of the given subject. \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Exercise Scoring System}
-\end{figure}
-
-Problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
-devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level
-are also designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  These
-two levels are essentially entry level questions.
-
-Problems at the third level are meant to be a bit more difficult than the first two levels.  The answer is often
-fairly obvious but arriving at an exacting solution requires some thought and skill.  These problems will almost always
-involve devising a new algorithm or implementing a variation of another algorithm previously presented.  Readers who can
-answer these questions will feel comfortable with the concepts behind the topic at hand.
-
-Problems at the fourth level are meant to be similar to those of the level three questions except they will require
-additional research to be completed.  The reader will most likely not know the answer right away, nor will the text provide
-the exact details of the answer until a subsequent chapter.
-
-Problems at the fifth level are meant to be the hardest
-problems relative to all the other problems in the chapter.  People who can correctly answer fifth level problems have a
-mastery of the subject matter at hand.
-
-Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
-is encouraged to answer the follow-up problems and try to draw the relevance of problems.
-
-\section{Introduction to LibTomMath}
-
-\subsection{What is LibTomMath?}
-LibTomMath is a free and open source multiple precision integer library written entirely in portable ISO C.  By portable it
-is meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on
-any given platform.
-
-The library has been successfully tested under numerous operating systems including Unix\footnote{All of these
-trademarks belong to their respective rightful owners.}, MacOS, Windows, Linux, PalmOS and on standalone hardware such
-as the Gameboy Advance.  The library is designed to contain enough functionality to be able to develop applications such
-as public key cryptosystems and still maintain a relatively small footprint.
-
-\subsection{Goals of LibTomMath}
-
-Libraries which obtain the most efficiency are rarely written in a high level programming language such as C.  However,
-even though this library is written entirely in ISO C, considerable care has been taken to optimize the algorithm implementations within the
-library.  Specifically the code has been written to work well with the GNU C Compiler (\textit{GCC}) on both x86 and ARM
-processors.  Wherever possible, highly efficient algorithms, such as Karatsuba multiplication, sliding window
-exponentiation and Montgomery reduction have been provided to make the library more efficient.
-
-Even with the nearly optimal and specialized algorithms that have been included the Application Programming Interface
-(\textit{API}) has been kept as simple as possible.  Often generic place holder routines will make use of specialized
-algorithms automatically without the developer's specific attention.  One such example is the generic multiplication
-algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication
-based on the magnitude of the inputs and the configuration of the library.
-
-Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should
-be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
-MPI library was used as an API template for all the basic functions.  MPI was chosen because it is another library that fits
-in the same niche as LibTomMath.  Even though LibTomMath uses MPI as the template for the function names and argument
-passing conventions, it has been written from scratch by Tom St Denis.
-
-The project is also meant to act as a learning tool for students, the logic being that no easy-to-follow ``bignum''
-library exists which can be used to teach computer science students how to perform fast and reliable multiple precision
-integer arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.
-
-\section{Choice of LibTomMath}
-LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
-for more worthy reasons.  Other libraries such as GMP \cite{GMP}, MPI \cite{MPI}, LIP \cite{LIP} and OpenSSL
-\cite{OPENSSL} have multiple precision integer arithmetic routines but would not be ideal for this text for
-reasons that will be explained in the following sub-sections.
-
-\subsection{Code Base}
-The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
-segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
-developer can more readily discern the true intent of a given section of source code without trying to keep track of
-what conditional code will be used.
-
-The code base of LibTomMath is well organized.  Each function is in its own separate source code file
-which allows the reader to find a given function very quickly.  On average there are $76$ lines of code per source
-file which makes the source very easy to follow.  By comparison MPI and LIP are single file projects making code tracing
-very hard.  GMP has many conditional code segments which also hinder tracing.
-
-When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
- which is fairly small compared to GMP (over $250$KiB).  LibTomMath is slightly larger than MPI (which compiles to about
-$50$KiB) but LibTomMath is also much faster and more complete than MPI.
-
-\subsection{API Simplicity}
-LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build
-with LibTomMath without change. The function names correlate directly to the action they perform.  Almost all of the
-functions share the same parameter passing convention.  The learning curve is fairly shallow with the API provided
-which is an extremely valuable benefit for the student and developer alike.
-
-The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to
-illegible short hand.  LibTomMath does not share this characteristic.
-
-The GMP library also does not return error codes.  Instead it uses a POSIX.1 \cite{POSIX1} signal system where errors
-are signaled to the host application.  This happens to be the fastest approach but definitely not the most versatile.  In
-effect a math error (e.g. invalid input, heap error, etc) can cause a program to stop functioning which is definitely
-undesirable in many situations.
-
-\subsection{Optimizations}
-While LibTomMath is certainly not the fastest library (GMP often beats LibTomMath by a factor of two) it does
-feature a set of optimal algorithms for tasks such as modular reduction, exponentiation, multiplication and squaring.  GMP
-and LIP also feature such optimizations while MPI only uses baseline algorithms with no optimizations.  GMP lacks a few
-of the additional modular reduction optimizations that LibTomMath features\footnote{At the time of this writing GMP
-only had Barrett and Montgomery modular reduction algorithms.}.
-
-LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
-exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually
-slower than the best libraries such as GMP and OpenSSL by only a small factor.
-
-\subsection{Portability and Stability}
-LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler
-(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any
-variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of
-MPI has recently stopped working on his library and LIP has long since been discontinued.
-
-GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
-development and are very stable across a variety of platforms.
-
-\subsection{Choice}
-LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
-the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However,
-the reader is encouraged to download their own copy of the library to actually be able to work with the library.
-
-\chapter{Getting Started}
-\section{Library Basics}
-The trick to writing any useful library of source code is to build a solid foundation and work outwards from it.  First,
-a problem along with allowable solution parameters should be identified and analyzed.  In this particular case the
-inability to accommodate multiple precision integers is the problem.  Furthermore, the solution must be written
-as portable source code that is reasonably efficient across several different computer platforms.
-
-After a foundation is formed the remainder of the library can be designed and implemented in a hierarchical fashion.
-That is, to implement the lowest level dependencies first and work towards the most abstract functions last.  For example,
-before implementing a modular exponentiation algorithm one would implement a modular reduction algorithm.
-By building outwards from a base foundation instead of using a parallel design methodology the resulting project is
-highly modular.  Being highly modular is a desirable property of any project as it often means the resulting product
-has a small footprint and updates are easy to perform.
-
-Usually when I start a project I will begin with the header files.  I define the data types I think I will need and
-prototype the initial functions that are not dependent on other functions (within the library).  After I
-implement these base functions I prototype more dependent functions and implement them.   The process repeats until
-I implement all of the functions I require.  For example, in the case of LibTomMath I implemented functions such as
-mp\_init() well before I implemented mp\_mul() and even further before I implemented mp\_exptmod().  As an example as to
-why this design works note that the Karatsuba and Toom-Cook multipliers were written \textit{after} the
-dependent function mp\_exptmod() was written.  Adding the new multiplication algorithms did not require changes to the
-mp\_exptmod() function itself and lowered the total cost of ownership (\textit{so to speak}) and of development
-for new algorithms.  This methodology allows new algorithms to be tested in a complete framework with relative ease.
-
-\begin{center}
-\begin{figure}[here]
-\includegraphics{pics/design_process.ps}
-\caption{Design Flow of the First Few Original LibTomMath Functions.}
-\label{pic:design_process}
-\end{figure}
-\end{center}
-
-Only after the majority of the functions were in place did I pursue a less hierarchical approach to auditing and optimizing
-the source code.  For example, one day I may audit the multipliers and the next day the polynomial basis functions.
-
-It only makes sense to begin the text with the preliminary data types and support algorithms required as well.
-This chapter discusses the core algorithms of the library which are the dependents for every other algorithm.
-
-\section{What is a Multiple Precision Integer?}
-Recall that most programming languages, in particular ISO C \cite{ISOC}, only have fixed precision data types that on their own cannot
-be used to represent values larger than their precision will allow. The purpose of multiple precision algorithms is
-to use fixed precision data types to create and manipulate multiple precision integers which may represent values
-that are very large.
-
-As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits.  In the decimal system
-the largest single digit value is $9$.  However, by concatenating digits together larger numbers may be represented.  Newly prepended digits
-(\textit{to the left}) are said to be in a different power of ten column.  That is, the number $123$ can be described as having a $1$ in the hundreds
-column, $2$ in the tens column and $3$ in the ones column.  Or more formally $123 = 1 \cdot 10^2 + 2 \cdot 10^1 + 3 \cdot 10^0$.  Computer based
-multiple precision arithmetic is essentially the same concept.  Larger integers are represented by adjoining fixed
-precision computer words with the exception that a different radix is used.
-
-What most people probably do not think about explicitly are the various other attributes that describe a multiple precision
-integer.  For example, the integer $154_{10}$ has two immediately obvious properties.  First, the integer is positive,
-that is the sign of this particular integer is positive as opposed to negative.  Second, the integer has three digits in
-its representation.  There is an additional property that the integer possesses that does not concern pencil-and-paper
-arithmetic.  The third property is how many digit placeholders are available to hold the integer.
-
-The human analogy of this third property is ensuring there is enough space on the paper to write the integer.  For example,
-if one starts writing a large number too far to the right on a piece of paper they will have to erase it and move left.
-Similarly, computer algorithms must maintain strict control over memory usage to ensure that the digits of an integer
-will not exceed the allowed boundaries.  These three properties make up what is known as a multiple precision
-integer or mp\_int for short.
-
-\subsection{The mp\_int Structure}
-\label{sec:MPINT}
-The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer.  The ISO C standard does not provide for
-any such data type but it does provide for making composite data types known as structures.  The following is the structure definition
-used within LibTomMath.
-
-\index{mp\_int}
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-%\begin{verbatim}
-\begin{tabular}{|l|}
-\hline
-typedef struct \{ \\
-\hspace{3mm}int used, alloc, sign;\\
-\hspace{3mm}mp\_digit *dp;\\
-\} \textbf{mp\_int}; \\
-\hline
-\end{tabular}
-%\end{verbatim}
-\end{small}
-\caption{The mp\_int Structure}
-\label{fig:mpint}
-\end{center}
-\end{figure}
-
-The mp\_int structure (fig. \ref{fig:mpint}) can be broken down as follows.
-
-\begin{enumerate}
-\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
-a given integer.  The \textbf{used} count must be positive (or zero) and may not exceed the \textbf{alloc} count.
-
-\item The \textbf{alloc} parameter denotes how
-many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count
-of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the
-array to accommodate the precision of the result.
-
-\item The pointer \textbf{dp} points to a dynamically allocated array of digits that represent the given multiple
-precision integer.  It is padded with $(\textbf{alloc} - \textbf{used})$ zero digits.  The array is maintained in a least
-significant digit order.  As a pencil and paper analogy the array is organized such that the right most digits are stored
-first starting at the location indexed by zero\footnote{In C all arrays begin at zero.} in the array.  For example,
-if \textbf{dp} contains $\lbrace a, b, c, \ldots \rbrace$ where \textbf{dp}$_0 = a$, \textbf{dp}$_1 = b$, \textbf{dp}$_2 = c$, $\ldots$ then
-it would represent the integer $a + b\beta + c\beta^2 + \ldots$
-
-\index{MP\_ZPOS} \index{MP\_NEG}
-\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).
-\end{enumerate}
-
-\subsubsection{Valid mp\_int Structures}
-Several rules are placed on the state of an mp\_int structure and are assumed to be followed for reasons of efficiency.
-The only exceptions are when the structure is passed to initialization functions such as mp\_init() and mp\_init\_copy().
-
-\begin{enumerate}
-\item The value of \textbf{alloc} may not be less than one.  That is \textbf{dp} always points to a previously allocated
-array of digits.
-\item The value of \textbf{used} may not exceed \textbf{alloc} and must be greater than or equal to zero.
-\item The value of \textbf{used} implies the digit at index $(used - 1)$ of the \textbf{dp} array is non-zero.  That is,
-leading zero digits in the most significant positions must be trimmed.
-   \begin{enumerate}
-   \item Digits in the \textbf{dp} array at and above the \textbf{used} location must be zero.
-   \end{enumerate}
-\item The value of \textbf{sign} must be \textbf{MP\_ZPOS} if \textbf{used} is zero;
-this represents the mp\_int value of zero.
-\end{enumerate}
-
-\section{Argument Passing}
-A convention of argument passing must be adopted early on in the development of any library.  Making the function
-prototypes consistent will help eliminate many headaches in the future as the library grows to significant complexity.
-In LibTomMath the multiple precision integer functions accept parameters from left to right as pointers to mp\_int
-structures.  That means that the source (input) operands are placed on the left and the destination (output) on the right.
-Consider the following examples.
-
-\begin{verbatim}
-   mp_mul(&a, &b, &c);   /* c = a * b */
-   mp_add(&a, &b, &a);   /* a = a + b */
-   mp_sqr(&a, &b);       /* b = a * a */
-\end{verbatim}
-
-The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
-functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
-
-Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around, to mimic the order
-of assignment expressions.  That is, the destination (output) is on the left and arguments (inputs) are on the right.  In
-truth, it is entirely a matter of preference.  In the case of LibTomMath the convention from the MPI library has been
-adopted.
-
-Another very useful design consideration, provided for in LibTomMath, is whether to allow argument sources to also be a
-destination.  For example, the second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important
-feature to implement since it allows the calling functions to cut down on the number of variables it must maintain.
-However, to implement this feature specific care has to be given to ensure the destination is not modified before the
-source is fully read.
-
-\section{Return Values}
-A well implemented application, no matter what its purpose, should trap as many runtime errors as possible and return them
-to the caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour.  However, the end
-developer can still manage to cause a library to crash.  For example, by passing an invalid pointer an application may
-fault by dereferencing memory not owned by the application.
-
-In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for
-instance) and memory allocation errors.  It will not check that the mp\_int passed to any function is valid nor
-will it check pointers for validity.  Any function that can cause a runtime error will return an error code as an
-\textbf{int} data type with one of the following values (fig \ref{fig:errcodes}).
-
-\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|l|l|}
-\hline \textbf{Value} & \textbf{Meaning} \\
-\hline \textbf{MP\_OKAY} & The function was successful \\
-\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
-\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
-\hline
-\end{tabular}
-\end{center}
-\caption{LibTomMath Error Codes}
-\label{fig:errcodes}
-\end{figure}
-
-When an error is detected within a function it should free any memory it allocated, often during the initialization of
-temporary mp\_ints, and return as soon as possible.  The goal is to leave the system in the same state it was when the
-function was called.  Error checking with this style of API is fairly simple.
-
-\begin{verbatim}
-   int err;
-   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
-      printf("Error: %s\n", mp_error_to_string(err));
-      exit(EXIT_FAILURE);
-   }
-\end{verbatim}
-
-The GMP \cite{GMP} library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal
-and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
-
-\section{Initialization and Clearing}
-The logical starting point when actually writing multiple precision integer functions is the initialization and
-clearing of the mp\_int structures.  These two algorithms will be used by the majority of the higher level algorithms.
-
-Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
-the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even though
-the initial integer will represent zero.  If only a single digit were allocated quite a few subsequent re-allocations
-would occur when operations are performed on the integers.  There is a tradeoff between how many default digits to allocate
-and how many re-allocations are tolerable.  Obviously allocating an excessive amount of digits initially will waste
-memory and become unmanageable.
-
-If the memory for the digits has been successfully allocated then the rest of the members of the structure must
-be initialized.  Since the initial state of an mp\_int is to represent the zero integer, the allocated digits must be set
-to zero.  The \textbf{used} count is set to zero and \textbf{sign} is set to \textbf{MP\_ZPOS}.
-
-\subsection{Initializing an mp\_int}
-An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
-structure are set to valid values.  The mp\_init algorithm will perform such an action.
-
-\index{mp\_init}
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Allocate memory and initialize $a$ to a known valid mp\_int state.  \\
-\hline \\
-1.  Allocate memory for \textbf{MP\_PREC} digits. \\
-2.  If the allocation failed return(\textit{MP\_MEM}) \\
-3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
-\hspace{3mm}3.1  $a_n \leftarrow 0$\\
-4.  $a.sign \leftarrow MP\_ZPOS$\\
-5.  $a.used \leftarrow 0$\\
-6.  $a.alloc \leftarrow MP\_PREC$\\
-7.  Return(\textit{MP\_OKAY})\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init}
-\end{figure}
-
-\textbf{Algorithm mp\_init.}
-The purpose of this function is to initialize an mp\_int structure so that the rest of the library can properly
-manipulate it.  It is assumed that the input may not have had any of its members previously initialized which is certainly
-a valid assumption if the input resides on the stack.
-
-Before any of the members such as \textbf{sign}, \textbf{used} or \textbf{alloc} are initialized the memory for
-the digits is allocated.  If this fails the function returns before setting any of the other members.  The \textbf{MP\_PREC}
-name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.}
-used to dictate the minimum precision of newly initialized mp\_int integers.  Ideally, it is at least equal to the smallest
-precision number you'll be working with.
-
-Allocating a block of digits at first instead of a single digit has the benefit of lowering the number of usually slow
-heap operations later functions will have to perform in the future.  If \textbf{MP\_PREC} is set correctly the slack
-memory and the number of heap operations will be trivial.
-
-Once the allocation has been made the digits have to be set to zero as well as the \textbf{used}, \textbf{sign} and
-\textbf{alloc} members initialized.  This ensures that the mp\_int will always represent the default state of zero regardless
-of the original condition of the input.
-
-\textbf{Remark.}
-This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
-when the ``to'' keyword is placed between two expressions.  For example, ``for $a$ from $b$ to $c$ do'' means that
-a subsequent expression (or body of expressions) is to be evaluated up to $c - b + 1$ times so long as $b \le c$.  In each
-iteration the variable $a$ is substituted for a new integer that lies inclusively between $b$ and $c$.  If $b > c$ occurred
-the loop would not iterate.  By contrast if the ``downto'' keyword were used in place of ``to'' the loop would iterate
-decrementally.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_init.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* init a new mp_int */
-018   int mp_init (mp_int * a)
-019   \{
-020     int i;
-021
-022     /* allocate memory required and clear it */
-023     a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
-024     if (a->dp == NULL) \{
-025       return MP_MEM;
-026     \}
-027
-028     /* set the digits to zero */
-029     for (i = 0; i < MP_PREC; i++) \{
-030         a->dp[i] = 0;
-031     \}
-032
-033     /* set the used to zero, allocated digits to the default precision
-034      * and sign to positive */
-035     a->used  = 0;
-036     a->alloc = MP_PREC;
-037     a->sign  = MP_ZPOS;
-038
-039     return MP_OKAY;
-040   \}
-041   #endif
-042
-\end{alltt}
-\end{small}
-
-One immediate observation of this initialization function is that it does not return a pointer to an mp\_int structure.  It
-is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The
-call to mp\_init() is used only to initialize the members of the structure to a known default state.
-
-Here we see (line 23) the memory allocation is performed first.  This allows us to exit cleanly and quickly
-if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
-was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
-but a macro defined in ``tommath.h''.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
-memory allocation routine.
-
-In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
-accomplished by using calloc() instead of malloc().  However,  to correctly initialize an integer type to a given value in a
-portable fashion you have to actually assign the value.  The for loop (line 29) performs this required
-operation.
-
-After the memory has been successfully initialized the remainder of the members are initialized
-(lines 35 through 37) to their respective default states.  At this point the algorithm has succeeded and
-a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the
-mp\_int structure has been properly initialized and is safe to use with other functions within the library.
-
-\subsection{Clearing an mp\_int}
-When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be
-returned to the application's memory pool with the mp\_clear algorithm.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_clear}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  The memory for $a$ shall be deallocated.  \\
-\hline \\
-1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
-2.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}2.1  $a_n \leftarrow 0$ \\
-3.  Free the memory allocated for the digits of $a$. \\
-4.  $a.used \leftarrow 0$ \\
-5.  $a.alloc \leftarrow 0$ \\
-6.  $a.sign \leftarrow MP\_ZPOS$ \\
-7.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_clear}
-\end{figure}
-
-\textbf{Algorithm mp\_clear.}
-This algorithm accomplishes two goals.  First, it clears the digits and the other mp\_int members.  This ensures that
-if a developer accidentally re-uses a cleared structure it is less likely to cause problems.  The second goal
-is to free the allocated memory.
-
-The logic behind the algorithm is extended by marking cleared mp\_int structures so that subsequent calls to this
-algorithm will not try to free the memory multiple times.  Cleared mp\_ints are detectable by having a pre-defined invalid
-digit pointer \textbf{dp} setting.
-
-Once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
-with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* clear one (frees)  */
-018   void
-019   mp_clear (mp_int * a)
-020   \{
-021     int i;
-022
-023     /* only do anything if a hasn't been freed previously */
-024     if (a->dp != NULL) \{
-025       /* first zero the digits */
-026       for (i = 0; i < a->used; i++) \{
-027           a->dp[i] = 0;
-028       \}
-029
-030       /* free ram */
-031       XFREE(a->dp);
-032
-033       /* reset members to make debugging easier */
-034       a->dp    = NULL;
-035       a->alloc = a->used = 0;
-036       a->sign  = MP_ZPOS;
-037     \}
-038   \}
-039   #endif
-040
-\end{alltt}
-\end{small}
-
-The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line 24)
-checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
-\textbf{NULL} in which case the if statement will evaluate to true.
-
-The digits of the mp\_int are cleared by the for loop (line 26) which assigns a zero to every digit.  Similar to mp\_init()
-the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.
-
-The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
-a standard C library function.  In this case the free() function.  Since free() only deallocates the memory the pointer
-still has to be reset to \textbf{NULL} manually (line 34).
-
-Now that the digits have been cleared and deallocated the other members are set to their final values (lines 35 and 36).
-
-\section{Maintenance Algorithms}
-
-The previous sections describe how to initialize and clear an mp\_int structure.  To further support operations
-that are to be performed on mp\_int structures (such as addition and multiplication) the dependent algorithms must be
-able to augment the precision of an mp\_int and
-initialize mp\_ints with differing initial conditions.
-
-These algorithms complete the set of low level algorithms required to work with mp\_int structures in the higher level
-algorithms such as addition, multiplication and modular exponentiation.
-
-\subsection{Augmenting an mp\_int's Precision}
-When storing a value in an mp\_int structure, a sufficient number of digits must be available to accommodate the entire
-result of an operation without loss of precision.  Quite often the size of the array given by the \textbf{alloc} member
-is large enough to simply increase the \textbf{used} digit count.  However, when the size of the array is too small it
-must be re-sized appropriately to accommodate the result.  The mp\_grow algorithm will provide this functionality.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_grow}. \\
-\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
-\textbf{Output}.  $a$ is expanded to accommodate $b$ digits. \\
-\hline \\
-1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
-2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
-3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-4.  Re-allocate the array of digits $a$ to size $v$ \\
-5.  If the allocation failed then return(\textit{MP\_MEM}). \\
-6.  for n from a.alloc to $v - 1$ do  \\
-\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
-7.  $a.alloc \leftarrow v$ \\
-8.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_grow}
-\end{figure}
-
-\textbf{Algorithm mp\_grow.}
-It is ideal to prevent re-allocations from being performed if they are not required (step one).  This is useful to
-prevent mp\_ints from growing excessively in code that erroneously calls mp\_grow.
-
-The requested digit count is padded up to the next multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} (steps two and three).
-This helps prevent many trivial reallocations that would grow an mp\_int by trivially small values.
-
-It is assumed that the reallocation (step four) leaves the lower $a.alloc$ digits of the mp\_int intact.  This is much
-akin to how the \textit{realloc} function from the standard C library works.  Since the newly allocated digits are
-assumed to contain undefined values they are initially set to zero.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* grow as required */
-018   int mp_grow (mp_int * a, int size)
-019   \{
-020     int     i;
-021     mp_digit *tmp;
-022
-023     /* if the alloc size is smaller alloc more ram */
-024     if (a->alloc < size) \{
-025       /* ensure there are always at least MP_PREC digits extra on top */
-026       size += (MP_PREC * 2) - (size % MP_PREC);
-027
-028       /* reallocate the array a->dp
-029        *
-030        * We store the return in a temporary variable
-031        * in case the operation failed we don't want
-032        * to overwrite the dp member of a.
-033        */
-034       tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
-035       if (tmp == NULL) \{
-036         /* reallocation failed but "a" is still valid [can be freed] */
-037         return MP_MEM;
-038       \}
-039
-040       /* reallocation succeeded so set a->dp */
-041       a->dp = tmp;
-042
-043       /* zero excess digits */
-044       i        = a->alloc;
-045       a->alloc = size;
-046       for (; i < a->alloc; i++) \{
-047         a->dp[i] = 0;
-048       \}
-049     \}
-050     return MP_OKAY;
-051   \}
-052   #endif
-053
-\end{alltt}
-\end{small}
-
-A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 24) checks
-if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
-the function skips the re-allocation part thus saving time.
-
-When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
-padded upwards to the 2nd multiple of \textbf{MP\_PREC} larger than the requested count (line 26).  The XREALLOC function is used
-to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
-function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
-the re-allocation.  All that is left is to clear the newly allocated digits and return.
-
-Note that the re-allocation result is actually stored in a temporary pointer $tmp$.  This is to allow this function to return
-an error with a valid pointer.  Earlier releases of the library stored the result of XREALLOC into the mp\_int $a$.  That would
-result in a memory leak if XREALLOC ever failed.
-
-\subsection{Initializing Variable Precision mp\_ints}
-Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size
-of input mp\_ints to a given algorithm.  The purpose of algorithm mp\_init\_size is similar to mp\_init except that it
-will allocate \textit{at least} a specified number of digits.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_size}. \\
-\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$. \\
-\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
-\hline \\
-1.  $u \leftarrow b \mbox{ (mod }MP\_PREC\mbox{)}$ \\
-2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-3.  Allocate $v$ digits. \\
-4.  for $n$ from $0$ to $v - 1$ do \\
-\hspace{3mm}4.1  $a_n \leftarrow 0$ \\
-5.  $a.sign \leftarrow MP\_ZPOS$\\
-6.  $a.used \leftarrow 0$\\
-7.  $a.alloc \leftarrow v$\\
-8.  Return(\textit{MP\_OKAY})\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_init\_size}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_size.}
-This algorithm will initialize an mp\_int structure $a$ like algorithm mp\_init with the exception that the number of
-digits allocated can be controlled by the second input argument $b$.  The input size is padded upwards so it is a
-multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} digits.  This padding is used to prevent trivial
-allocations from becoming a bottleneck in the rest of the algorithms.
-
-Like algorithm mp\_init, the mp\_int structure is initialized to a default state representing the integer zero.  This
-particular algorithm is useful if it is known ahead of time the approximate size of the input.  If the approximation is
-correct no further memory re-allocations are required to work with the mp\_int.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* init an mp_init for a given size */
-018   int mp_init_size (mp_int * a, int size)
-019   \{
-020     int x;
-021
-022     /* pad size so there are always extra digits */
-023     size += (MP_PREC * 2) - (size % MP_PREC);
-024
-025     /* alloc mem */
-026     a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
-027     if (a->dp == NULL) \{
-028       return MP_MEM;
-029     \}
-030
-031     /* set the members */
-032     a->used  = 0;
-033     a->alloc = size;
-034     a->sign  = MP_ZPOS;
-035
-036     /* zero the digits */
-037     for (x = 0; x < size; x++) \{
-038         a->dp[x] = 0;
-039     \}
-040
-041     return MP_OKAY;
-042   \}
-043   #endif
-044
-\end{alltt}
-\end{small}
-
-The number of digits $b$ requested is padded (line 23) by first augmenting it to the next multiple of
-\textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result.  If the memory can be successfully allocated the
-mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be
-returned (line 28).
-
-The digits are allocated with the malloc() function (line 26) and set to zero afterwards (line 37).  The
-\textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set
-to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines 32, 33 and 34).  If the function
-returns successfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the
-functions to work with.
-
-\subsection{Multiple Integer Initializations and Clearings}
-Occasionally a function will require a series of mp\_int data types to be made available simultaneously.
-The purpose of algorithm mp\_init\_multi is to initialize a variable length array of mp\_int structures in a single
-statement.  It is essentially a shortcut to multiple initializations.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_multi}. \\
-\textbf{Input}.   Variable length array $V_k$ of mp\_int variables of length $k$. \\
-\textbf{Output}.  The array is initialized such that each mp\_int of $V_k$ is ready to use. \\
-\hline \\
-1.  for $n$ from 0 to $k - 1$ do \\
-\hspace{+3mm}1.1.  Initialize the mp\_int $V_n$ (\textit{mp\_init}) \\
-\hspace{+3mm}1.2.  If initialization failed then do \\
-\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n - 1$ do \\
-\hspace{+9mm}1.2.1.1.  Free the mp\_int $V_j$ (\textit{mp\_clear}) \\
-\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
-2.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_multi}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_multi.}
-The algorithm will initialize the array of mp\_int variables one at a time.  If a runtime error has been detected
-(\textit{step 1.2}) all of the previously initialized variables are cleared.  The goal is an ``all or nothing''
-initialization which allows for quick recovery from runtime errors.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_multi.c
-\vspace{-3mm}
-\begin{alltt}
-016   #include <stdarg.h>
-017
-018   int mp_init_multi(mp_int *mp, ...)
-019   \{
-020       mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
-021       int n = 0;                 /* Number of ok inits */
-022       mp_int* cur_arg = mp;
-023       va_list args;
-024
-025       va_start(args, mp);        /* init args to next argument from caller */
-026       while (cur_arg != NULL) \{
-027           if (mp_init(cur_arg) != MP_OKAY) \{
-028               /* Oops - error! Back-track and mp_clear what we already
-029                  succeeded in init-ing, then return error.
-030               */
-031               va_list clean_args;
-032
-033               /* end the current list */
-034               va_end(args);
-035
-036               /* now start cleaning up */
-037               cur_arg = mp;
-038               va_start(clean_args, mp);
-039               while (n-- != 0) \{
-040                   mp_clear(cur_arg);
-041                   cur_arg = va_arg(clean_args, mp_int*);
-042               \}
-043               va_end(clean_args);
-044               res = MP_MEM;
-045               break;
-046           \}
-047           n++;
-048           cur_arg = va_arg(args, mp_int*);
-049       \}
-050       va_end(args);
-051       return res;                /* Assumed ok, if error flagged above. */
-052   \}
-053
-054   #endif
-055
-\end{alltt}
-\end{small}
-
-This function initializes a variable length list of mp\_int structure pointers.  However, instead of having the mp\_int
-structures in an actual C array they are simply passed as arguments to the function.  This function makes use of the
-``...'' argument syntax of the C programming language.  The list is terminated with a final \textbf{NULL} argument
-appended on the right.
-
-The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function.  A count
-$n$ of successfully initialized mp\_int structures is maintained (line 47) such that if a failure does occur,
-the algorithm can backtrack and free the previously initialized structures (lines 27 to 46).
-
-
-\subsection{Clamping Excess Digits}
-When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of
-the function instead of checking during the computation.  For example, a multiplication of an $i$ digit number by a
-$j$ digit number produces a result of at most $i + j$ digits.  It is entirely possible that the result is $i + j - 1$
-though, with no final carry into the last position.  However, suppose the destination had to be first expanded
-(\textit{via mp\_grow}) to accommodate $i + j - 1$ digits then further expanded to accommodate the final carry.
-That would be a considerable waste of time since heap operations are relatively slow.
-
-The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function
-terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
-there would be an excess high order zero digit.
-
-For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit
-will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
-accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very
-low the representation is excessively large.
-
-The mp\_clamp algorithm is designed to solve this very problem.  It will trim high-order zeros by decrementing the
-\textbf{used} count until a non-zero most significant digit is found.  Also in this system, zero is considered to be a
-positive number which means that if the \textbf{used} count is decremented to zero, the sign must be set to
-\textbf{MP\_ZPOS}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_clamp}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
-\hline \\
-1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
-\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
-2.  if $a.used = 0$ then do \\
-\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
-\hline \\
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_clamp}
-\end{figure}
-
-\textbf{Algorithm mp\_clamp.}
-As can be expected this algorithm is very simple.  The loop on step one is expected to iterate only once or twice at
-the most.  For example, this will happen in cases where there is not a carry to fill the last position.  Step two fixes the sign for
-when all of the digits are zero to ensure that the mp\_int is valid at all times.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* trim unused digits
-018    *
-019    * This is used to ensure that leading zero digits are
-020    * trimed and the leading "used" digit will be non-zero
-021    * Typically very fast.  Also fixes the sign if there
-022    * are no more leading digits
-023    */
-024   void
-025   mp_clamp (mp_int * a)
-026   \{
-027     /* decrease used while the most significant digit is
-028      * zero.
-029      */
-030     while ((a->used > 0) && (a->dp[a->used - 1] == 0)) \{
-031       --(a->used);
-032     \}
-033
-034     /* reset the sign flag if used == 0 */
-035     if (a->used == 0) \{
-036       a->sign = MP_ZPOS;
-037     \}
-038   \}
-039   #endif
-040
-\end{alltt}
-\end{small}
-
-Note on line 30 how the test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
-language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is
-important since if the \textbf{used} is zero the test on the right would fetch below the array.  That is obviously
-undesirable.  The parentheses on line 31 are used to make sure the \textbf{used} count is decremented and not
-the pointer ``a''.
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
-                     & \\
-$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
-                     & \\
-$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
-                     & encryption when $\beta = 2^{28}$.  \\
-                     & \\
-$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
-                     & \\
-$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
-                     & \\
-\end{tabular}
-
-
-%%%
-% CHAPTER FOUR
-%%%
-
-\chapter{Basic Operations}
-
-\section{Introduction}
-In the previous chapter a series of low level algorithms were established that dealt with initializing and maintaining
-mp\_int structures.  This chapter will discuss another set of seemingly non-algebraic algorithms which will form the low
-level basis of the entire library.  While these algorithms are relatively trivial it is important to understand how they
-work before proceeding since these algorithms will be used almost intrinsically in the following chapters.
-
-The algorithms in this chapter deal primarily with more ``programmer'' related tasks such as creating copies of
-mp\_int structures, assigning small values to mp\_int structures and comparisons of the values mp\_int structures
-represent.
-
-\section{Assigning Values to mp\_int Structures}
-\subsection{Copying an mp\_int}
-Assigning the value that a given mp\_int structure represents to another mp\_int structure shall be known as making
-a copy for the purposes of this text.  The copy of the mp\_int will be a separate entity that represents the same
-value as the mp\_int it was copied from.  The mp\_copy algorithm provides this functionality.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_copy}. \\
-\textbf{Input}.  An mp\_int $a$ and $b$. \\
-\textbf{Output}.  Store a copy of $a$ in $b$. \\
-\hline \\
-1.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{mp\_grow}) \\
-2.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}2.1  $b_{n} \leftarrow a_{n}$ \\
-3.  for $n$ from $a.used$ to $b.used - 1$ do \\
-\hspace{3mm}3.1  $b_{n} \leftarrow 0$ \\
-4.  $b.used \leftarrow a.used$ \\
-5.  $b.sign \leftarrow a.sign$ \\
-6.  return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_copy}
-\end{figure}
-
-\textbf{Algorithm mp\_copy.}
-This algorithm copies the mp\_int $a$ such that upon successful termination of the algorithm the mp\_int $b$ will
-represent the same integer as the mp\_int $a$.  The mp\_int $b$ shall be a complete and distinct copy of the
-mp\_int $a$ meaning that the mp\_int $a$ can be modified and it shall not affect the value of the mp\_int $b$.
-
-If $b$ does not have enough room for the digits of $a$ it must first have its precision augmented via the mp\_grow
-algorithm.  The digits of $a$ are copied over the digits of $b$ and any excess digits of $b$ are set to zero (step two
-and three).  The \textbf{used} and \textbf{sign} members of $a$ are finally copied over the respective members of
-$b$.
-
-\textbf{Remark.}  This algorithm also introduces a new idiosyncrasy that will be used throughout the rest of the
-text.  The error return codes of other algorithms are not explicitly checked in the pseudo-code presented.  For example, in
-step one of the mp\_copy algorithm the return of mp\_grow is not explicitly checked to ensure it succeeded.  Text space is
-limited so it is assumed that if an algorithm fails it will clear all temporarily allocated mp\_ints and return
-the error code itself.  However, the C code presented will demonstrate all of the error handling logic required to
-implement the pseudo-code.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* copy, b = a */
-018   int
-019   mp_copy (mp_int * a, mp_int * b)
-020   \{
-021     int     res, n;
-022
-023     /* if dst == src do nothing */
-024     if (a == b) \{
-025       return MP_OKAY;
-026     \}
-027
-028     /* grow dest */
-029     if (b->alloc < a->used) \{
-030        if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
-031           return res;
-032        \}
-033     \}
-034
-035     /* zero b and copy the parameters over */
-036     \{
-037       mp_digit *tmpa, *tmpb;
-038
-039       /* pointer aliases */
-040
-041       /* source */
-042       tmpa = a->dp;
-043
-044       /* destination */
-045       tmpb = b->dp;
-046
-047       /* copy all the digits */
-048       for (n = 0; n < a->used; n++) \{
-049         *tmpb++ = *tmpa++;
-050       \}
-051
-052       /* clear high digits */
-053       for (; n < b->used; n++) \{
-054         *tmpb++ = 0;
-055       \}
-056     \}
-057
-058     /* copy used count and sign */
-059     b->used = a->used;
-060     b->sign = a->sign;
-061     return MP_OKAY;
-062   \}
-063   #endif
-064
-\end{alltt}
-\end{small}
-
-Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output
-mp\_int structures passed to a function are one and the same.  For this case it is optimal to return immediately without
-copying digits (line 24).
-
-The mp\_int $b$ must have enough digits to accommodate the used digits of the mp\_int $a$.  If $b.alloc$ is less than
-$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines 29 to 33).  In order to
-simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits
-of the mp\_ints $a$ and $b$ respectively.  These aliases (lines 42 and 45) allow the compiler to access the digits without first dereferencing the
-mp\_int pointers and then subsequently the pointer to the digits.
-
-After the aliases are established the digits from $a$ are copied into $b$ (lines 48 to 50) and then the excess
-digits of $b$ are set to zero (lines 53 to 55).  Both ``for'' loops make use of the pointer aliases and in
-fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits.  This optimization
-allows the alias to stay in a machine register fairly easily between the two loops.
-
-\textbf{Remarks.}  The use of pointer aliases is an implementation methodology first introduced in this function that will
-be used considerably in other functions.  Technically, a pointer alias is simply a short hand alias used to lower the
-number of pointer dereferencing operations required to access data.  For example, a for loop may resemble
-
-\begin{alltt}
-for (x = 0; x < 100; x++) \{
-    a->num[4]->dp[x] = 0;
-\}
-\end{alltt}
-
-This could be re-written using aliases as
-
-\begin{alltt}
-mp_digit *tmpa;
-tmpa = a->num[4]->dp;
-for (x = 0; x < 100; x++) \{
-    *tmpa++ = 0;
-\}
-\end{alltt}
-
-In this case an alias is used to access the
-array of digits within an mp\_int structure directly.  It may seem that a pointer alias is strictly not required
-as a compiler may optimize out the redundant pointer operations.  However, there are two dominant reasons to use aliases.
-
-The first reason is that most compilers will not effectively optimize pointer arithmetic.  For example, some optimizations
-may work for the Microsoft Visual C++ compiler (MSVC) and not for the GNU C Compiler (GCC).  Also some optimizations may
-work for GCC and not MSVC.  As such it is ideal to find a common ground for as many compilers as possible.  Pointer
-aliases optimize the code considerably before the compiler even reads the source code which means the end compiled code
-stands a better chance of being faster.
-
-The second reason is that pointer aliases often can make an algorithm simpler to read.  Consider the first ``for''
-loop of the function mp\_copy() re-written to not use pointer aliases.
-
-\begin{alltt}
-    /* copy all the digits */
-    for (n = 0; n < a->used; n++) \{
-      b->dp[n] = a->dp[n];
-    \}
-\end{alltt}
-
-Whether this code is harder to read depends strongly on the individual.  However, it is quantifiably slightly more
-complicated as there are four variables within the statement instead of just two.
-
-\subsubsection{Nested Statements}
-Another commonly used technique in the source routines is that certain sections of code are nested.  This is used in
-particular with the pointer aliases to highlight code phases.  For example, a Comba multiplier (discussed in chapter six)
-will typically have three different phases.  First the temporaries are initialized, then the columns calculated and
-finally the carries are propagated.  In this example the middle column production phase will typically be nested as it
-uses temporary variables and aliases the most.
-
-The nesting also simplifies the source code as variables that are nested are only valid for their scope.  As a result
-the various temporary variables required do not propagate into other sections of code.
-
-
-\subsection{Creating a Clone}
-Another common operation is to make a local temporary copy of an mp\_int argument.  To initialize an mp\_int
-and then copy another existing mp\_int into the newly initialized mp\_int will be known as creating a clone.  This is
-useful within functions that need to modify an argument but do not wish to actually modify the original copy.  The
-mp\_init\_copy algorithm has been designed to help perform this task.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_copy}. \\
-\textbf{Input}.   An mp\_int $a$ and $b$\\
-\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
-\hline \\
-1.  Init $a$.  (\textit{mp\_init}) \\
-2.  Copy $b$ to $a$.  (\textit{mp\_copy}) \\
-3.  Return the status of the copy operation. \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_copy}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_copy.}
-This algorithm will initialize an mp\_int variable and copy another previously initialized mp\_int variable into it.  As
-such this algorithm will perform two operations in one step.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* creates "a" then copies b into it */
-018   int mp_init_copy (mp_int * a, mp_int * b)
-019   \{
-020     int     res;
-021
-022     if ((res = mp_init_size (a, b->used)) != MP_OKAY) \{
-023       return res;
-024     \}
-025     return mp_copy (b, a);
-026   \}
-027   #endif
-028
-\end{alltt}
-\end{small}
-
-This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that
-\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
-and \textbf{a} will be left intact.
-
-\section{Zeroing an Integer}
-Resetting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
-perform this task.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_zero}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Zero the contents of $a$ \\
-\hline \\
-1.  $a.used \leftarrow 0$ \\
-2.  $a.sign \leftarrow$ MP\_ZPOS \\
-3.  for $n$ from 0 to $a.alloc - 1$ do \\
-\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_zero}
-\end{figure}
-
-\textbf{Algorithm mp\_zero.}
-This algorithm simply resets a mp\_int to the default state.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* set to zero */
-018   void mp_zero (mp_int * a)
-019   \{
-020     int       n;
-021     mp_digit *tmp;
-022
-023     a->sign = MP_ZPOS;
-024     a->used = 0;
-025
-026     tmp = a->dp;
-027     for (n = 0; n < a->alloc; n++) \{
-028        *tmp++ = 0;
-029     \}
-030   \}
-031   #endif
-032
-\end{alltt}
-\end{small}
-
-After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the
-\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
-
-\section{Sign Manipulation}
-\subsection{Absolute Value}
-With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
-the absolute value of an mp\_int.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_abs}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Computes $b = \vert a \vert$ \\
-\hline \\
-1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
-2.  If the copy failed return(\textit{MP\_MEM}). \\
-3.  $b.sign \leftarrow MP\_ZPOS$ \\
-4.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_abs}
-\end{figure}
-
-\textbf{Algorithm mp\_abs.}
-This algorithm computes the absolute value of an mp\_int input.  First it copies $a$ over $b$.  This is an example of an
-algorithm where the check in mp\_copy that determines if the source and destination are equal proves useful.  This allows,
-for instance, the developer to pass the same mp\_int as the source and destination to this function without additional
-logic to handle it.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* b = |a|
-018    *
-019    * Simple function copies the input and fixes the sign to positive
-020    */
-021   int
-022   mp_abs (mp_int * a, mp_int * b)
-023   \{
-024     int     res;
-025
-026     /* copy a to b */
-027     if (a != b) \{
-028        if ((res = mp_copy (a, b)) != MP_OKAY) \{
-029          return res;
-030        \}
-031     \}
-032
-033     /* force the sign of b to positive */
-034     b->sign = MP_ZPOS;
-035
-036     return MP_OKAY;
-037   \}
-038   #endif
-039
-\end{alltt}
-\end{small}
-
-This fairly trivial algorithm first eliminates non--required duplications (line 27) and then sets the
-\textbf{sign} flag to \textbf{MP\_ZPOS}.
-
-\subsection{Integer Negation}
-With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
-the negative of an mp\_int input.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_neg}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Computes $b = -a$ \\
-\hline \\
-1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
-2.  If the copy failed return(\textit{MP\_MEM}). \\
-3.  If $a.used = 0$ then return(\textit{MP\_OKAY}). \\
-4.  If $a.sign = MP\_ZPOS$ then do \\
-\hspace{3mm}4.1  $b.sign = MP\_NEG$. \\
-5.  else do \\
-\hspace{3mm}5.1  $b.sign = MP\_ZPOS$. \\
-6.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_neg}
-\end{figure}
-
-\textbf{Algorithm mp\_neg.}
-This algorithm computes the negation of an input.  First it copies $a$ over $b$.  If $a$ has no used digits then
-the algorithm returns immediately.  Otherwise it flips the sign flag and stores the result in $b$.  Note that if
-$a$ had no digits then it must be positive by definition.  Had step three been omitted then the algorithm would return
-zero as negative.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* b = -a */
-018   int mp_neg (mp_int * a, mp_int * b)
-019   \{
-020     int     res;
-021     if (a != b) \{
-022        if ((res = mp_copy (a, b)) != MP_OKAY) \{
-023           return res;
-024        \}
-025     \}
-026
-027     if (mp_iszero(b) != MP_YES) \{
-028        b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-029     \} else \{
-030        b->sign = MP_ZPOS;
-031     \}
-032
-033     return MP_OKAY;
-034   \}
-035   #endif
-036
-\end{alltt}
-\end{small}
-
-Like mp\_abs() this function avoids non--required duplications (line 21) and then sets the sign.  We
-have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
-then the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
-
-\section{Small Constants}
-\subsection{Setting Small Constants}
-Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_set}. \\
-\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
-\textbf{Output}.  Make $a$ equivalent to $b$ \\
-\hline \\
-1.  Zero $a$ (\textit{mp\_zero}). \\
-2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
-3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
-                              1 &  \mbox{if }a_0 > 0 \\
-                              0 &  \mbox{if }a_0 = 0
-                              \end{array} \right .$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_set}
-\end{figure}
-
-\textbf{Algorithm mp\_set.}
-This algorithm sets a mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
-single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_set.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* set to a digit */
-018   void mp_set (mp_int * a, mp_digit b)
-019   \{
-020     mp_zero (a);
-021     a->dp[0] = b & MP_MASK;
-022     a->used  = (a->dp[0] != 0) ? 1 : 0;
-023   \}
-024   #endif
-025
-\end{alltt}
-\end{small}
-
-First we zero (line 20) the mp\_int to make sure that the other members are initialized for a
-small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
-is zero.  Next we set the digit and reduce it modulo $\beta$ (line 21).  After this step we have to
-check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
-to zero.
-
-We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with
-$2^k - 1$ will perform the same operation.
-
-One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses
-this function should take that into account.  Only trivially small constants can be set using this function.
-
-\subsection{Setting Large Constants}
-To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is ideal.  It accepts a ``long''
-data type as input and will always treat it as a 32-bit integer.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_set\_int}. \\
-\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
-\textbf{Output}.  Make $a$ equivalent to $b$ \\
-\hline \\
-1.  Zero $a$ (\textit{mp\_zero}) \\
-2.  for $n$ from 0 to 7 do \\
-\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\
-\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
-\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
-\hspace{3mm}2.4  $a.used \leftarrow a.used + 1$ \\
-3.  Clamp excess used digits (\textit{mp\_clamp}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_set\_int}
-\end{figure}
-
-\textbf{Algorithm mp\_set\_int.}
-The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the
-mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions.  In step 2.2 the
-next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is
-incremented to reflect the addition.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
-zero digits used and the newly added four bits would be ignored.
-
-Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* set a 32-bit const */
-018   int mp_set_int (mp_int * a, unsigned long b)
-019   \{
-020     int     x, res;
-021
-022     mp_zero (a);
-023
-024     /* set four bits at a time */
-025     for (x = 0; x < 8; x++) \{
-026       /* shift the number up four bits */
-027       if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{
-028         return res;
-029       \}
-030
-031       /* OR in the top four bits of the source */
-032       a->dp[0] |= (b >> 28) & 15;
-033
-034       /* shift the source up to the next four bits */
-035       b <<= 4;
-036
-037       /* ensure that digits are not clamped off */
-038       a->used += 1;
-039     \}
-040     mp_clamp (a);
-041     return MP_OKAY;
-042   \}
-043   #endif
-044
-\end{alltt}
-\end{small}
-
-This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
-addition on line 38 ensures that the newly added in bits are added to the number of digits.  While it may not
-seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27
-as well as the  call to mp\_clamp() on line 40.  Both functions will clamp excess leading digits which keeps
-the number of used digits low.
-
-\section{Comparisons}
-\subsection{Unsigned Comparisons}
-Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
-to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
-to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude
-positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.
-
-The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
-mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the
-signs are known to agree in advance.
-
-To facilitate working with the results of the comparison functions three constants are required.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|r|l|}
-\hline \textbf{Constant} & \textbf{Meaning} \\
-\hline \textbf{MP\_GT} & Greater Than \\
-\hline \textbf{MP\_EQ} & Equal To \\
-\hline \textbf{MP\_LT} & Less Than \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Comparison Return Codes}
-\end{figure}
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_cmp\_mag}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
-\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
-\hline \\
-1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
-2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
-3.  for n from $a.used - 1$ to 0 do \\
-\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
-\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
-4.  Return(\textit{MP\_EQ}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_cmp\_mag}
-\end{figure}
-
-\textbf{Algorithm mp\_cmp\_mag.}
-By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
-\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.
-Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.
-If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit.
-
-By step three both inputs must have the same number of digits so it's safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
-the zero'th digit.  If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* compare maginitude of two ints (unsigned) */
-018   int mp_cmp_mag (mp_int * a, mp_int * b)
-019   \{
-020     int     n;
-021     mp_digit *tmpa, *tmpb;
-022
-023     /* compare based on # of non-zero digits */
-024     if (a->used > b->used) \{
-025       return MP_GT;
-026     \}
-027
-028     if (a->used < b->used) \{
-029       return MP_LT;
-030     \}
-031
-032     /* alias for a */
-033     tmpa = a->dp + (a->used - 1);
-034
-035     /* alias for b */
-036     tmpb = b->dp + (a->used - 1);
-037
-038     /* compare based on digits  */
-039     for (n = 0; n < a->used; ++n, --tmpa, --tmpb) \{
-040       if (*tmpa > *tmpb) \{
-041         return MP_GT;
-042       \}
-043
-044       if (*tmpa < *tmpb) \{
-045         return MP_LT;
-046       \}
-047     \}
-048     return MP_EQ;
-049   \}
-050   #endif
-051
-\end{alltt}
-\end{small}
-
-The two if statements (lines 24 and 28) compare the number of digits in the two inputs.  These two are
-performed before all of the digits are compared since it is a very cheap test to perform and can potentially save
-considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be
-smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
-
-
-
-\subsection{Signed Comparisons}
-Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude
-comparison a trivial signed comparison algorithm can be written.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_cmp}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
-\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
-\hline \\
-1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
-2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
-3.  if $a.sign = MP\_NEG$ then \\
-\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
-4.  Otherwise \\
-\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_cmp}
-\end{figure}
-
-\textbf{Algorithm mp\_cmp.}
-The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate
-comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step
-three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then
-$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* compare two ints (signed)*/
-018   int
-019   mp_cmp (mp_int * a, mp_int * b)
-020   \{
-021     /* compare based on sign */
-022     if (a->sign != b->sign) \{
-023        if (a->sign == MP_NEG) \{
-024           return MP_LT;
-025        \} else \{
-026           return MP_GT;
-027        \}
-028     \}
-029
-030     /* compare digits */
-031     if (a->sign == MP_NEG) \{
-032        /* if negative compare opposite direction */
-033        return mp_cmp_mag(b, a);
-034     \} else \{
-035        return mp_cmp_mag(a, b);
-036     \}
-037   \}
-038   #endif
-039
-\end{alltt}
-\end{small}
-
-The two if statements (lines 22 and 23) perform the initial sign comparison.  If the signs are not equal then whichever
-has the positive sign is larger.   The inputs are compared (line 31) based on magnitudes.  If the signs were both
-negative then the unsigned comparison is performed in the opposite direction (line 33).  Otherwise, the signs are assumed to
-be both positive and a forward direction unsigned comparison is performed.
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
-                     & \\
-$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
-                     & of two random digits (of equal magnitude) before a difference is found. \\
-                     & \\
-$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
-                     & on the observations made in the previous problem. \\
-                     &
-\end{tabular}
-
-\chapter{Basic Arithmetic}
-\section{Introduction}
-At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been
-established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These
-algorithms make use of the lower level algorithms and are the crucial building blocks for the multiplication algorithms.  It is very important
-that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms
-which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.
-
-All of the algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right
-logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real
-number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $\beta^2 = 10^2$}).
-Algebraically a binary logical shift is equivalent to a division or multiplication by a power of two.
-For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
-
-One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
-from the number.  For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$.  However, with a logical shift the
-result is $110_2$.
-
-\section{Addition and Subtraction}
-In common twos complement fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
-$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.
-As a result subtraction can be performed with a trivial series of logical operations and an addition.
-
-However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
-sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or
-subtraction algorithms with the sign fixed up appropriately.
-
-The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
-the integers respectively.
-
-\subsection{Low Level Addition}
-An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the
-trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.
-Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
-
-\newpage
-\begin{figure}[!here]
-\begin{center}
-\begin{small}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_add}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
-\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
-\hline \\
-1.  if $a.used > b.used$ then \\
-\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
-\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
-\hspace{+3mm}1.3  $x   \leftarrow a$ \\
-2.  else  \\
-\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
-\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
-\hspace{+3mm}2.3  $x   \leftarrow b$ \\
-3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
-4.  $oldused \leftarrow c.used$ \\
-5.  $c.used \leftarrow max + 1$ \\
-6.  $u \leftarrow 0$ \\
-7.  for $n$ from $0$ to $min - 1$ do \\
-\hspace{+3mm}7.1  $c_n \leftarrow a_n + b_n + u$ \\
-\hspace{+3mm}7.2  $u \leftarrow c_n >> lg(\beta)$ \\
-\hspace{+3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-8.  if $min \ne max$ then do \\
-\hspace{+3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
-\hspace{+6mm}8.1.1  $c_n \leftarrow x_n + u$ \\
-\hspace{+6mm}8.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
-\hspace{+6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $c_{max} \leftarrow u$ \\
-10.  if $oldused > max$ then \\
-\hspace{+3mm}10.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
-\hspace{+6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Algorithm s\_mp\_add}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_add.}
-This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.
-Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the
-MIX pseudo  machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
-
-The first thing that has to be accomplished is to sort out which of the two inputs is the largest.  The addition logic
-will simply add all of the smallest input to the largest input and store that first part of the result in the
-destination.  Then it will apply a simpler addition loop to excess digits of the larger input.
-
-The first two steps will handle sorting the inputs such that $min$ and $max$ hold the digit counts of the two
-inputs.  The variable $x$ will be an mp\_int alias for the largest input or the second input $b$ if they have the
-same number of digits.  After the inputs are sorted the destination $c$ is grown as required to accommodate the sum
-of the two inputs.  The original \textbf{used} count of $c$ is copied and set to the new used count.
-
-At this point the first addition loop will go through as many digit positions as both inputs have.  The carry
-variable $u$ is set to zero outside the loop.  Inside the loop an ``addition'' step requires three statements to produce
-one digit of the sum.  First
-two digits from $a$ and $b$ are added together along with the carry $u$.  The carry of this step is extracted and stored
-in $u$ and finally the digit of the result $c_n$ is truncated within the range $0 \le c_n < \beta$.
-
-Now all of the digit positions that both inputs have in common have been exhausted.  If $min \ne max$ then $x$ is an alias
-for one of the inputs that has more digits.  A simplified addition loop is then used to essentially copy the remaining digits
-and the carry to the destination.
-
-The final carry is stored in $c_{max}$ and digits above $max$ up to $oldused$ are zeroed which completes the addition.
-
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* low level addition, based on HAC pp.594, Algorithm 14.7 */
-018   int
-019   s_mp_add (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     mp_int *x;
-022     int     olduse, res, min, max;
-023
-024     /* find sizes, we let |a| <= |b| which means we have to sort
-025      * them.  "x" will point to the input with the most digits
-026      */
-027     if (a->used > b->used) \{
-028       min = b->used;
-029       max = a->used;
-030       x = a;
-031     \} else \{
-032       min = a->used;
-033       max = b->used;
-034       x = b;
-035     \}
-036
-037     /* init result */
-038     if (c->alloc < (max + 1)) \{
-039       if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{
-040         return res;
-041       \}
-042     \}
-043
-044     /* get old used digit count and set new one */
-045     olduse = c->used;
-046     c->used = max + 1;
-047
-048     \{
-049       mp_digit u, *tmpa, *tmpb, *tmpc;
-050       int i;
-051
-052       /* alias for digit pointers */
-053
-054       /* first input */
-055       tmpa = a->dp;
-056
-057       /* second input */
-058       tmpb = b->dp;
-059
-060       /* destination */
-061       tmpc = c->dp;
-062
-063       /* zero the carry */
-064       u = 0;
-065       for (i = 0; i < min; i++) \{
-066         /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-067         *tmpc = *tmpa++ + *tmpb++ + u;
-068
-069         /* U = carry bit of T[i] */
-070         u = *tmpc >> ((mp_digit)DIGIT_BIT);
-071
-072         /* take away carry bit from T[i] */
-073         *tmpc++ &= MP_MASK;
-074       \}
-075
-076       /* now copy higher words if any, that is in A+B
-077        * if A or B has more digits add those in
-078        */
-079       if (min != max) \{
-080         for (; i < max; i++) \{
-081           /* T[i] = X[i] + U */
-082           *tmpc = x->dp[i] + u;
-083
-084           /* U = carry bit of T[i] */
-085           u = *tmpc >> ((mp_digit)DIGIT_BIT);
-086
-087           /* take away carry bit from T[i] */
-088           *tmpc++ &= MP_MASK;
-089         \}
-090       \}
-091
-092       /* add carry */
-093       *tmpc++ = u;
-094
-095       /* clear digits above oldused */
-096       for (i = c->used; i < olduse; i++) \{
-097         *tmpc++ = 0;
-098       \}
-099     \}
-100
-101     mp_clamp (c);
-102     return MP_OKAY;
-103   \}
-104   #endif
-105
-\end{alltt}
-\end{small}
-
-We first sort (lines 27 to 35) the inputs based on magnitude and determine the $min$ and $max$ variables.
-Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
-grow the destination (37 to 42) to ensure that it can accommodate the result of the addition.
-
-Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on
-lines 55, 58 and 61 represent the two inputs and destination variables respectively.  These aliases are used to ensure the
-compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
-
-The initial carry $u$ will be cleared (line 64), note that $u$ is of type mp\_digit which ensures type
-compatibility within the implementation.  The initial addition (line 65 to 74) adds digits from
-both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
-(line 80 to 90) adds the remaining digits from the larger of the two inputs.  The addition is finished
-with the final carry being stored in $tmpc$ (line 93).  Note the ``++'' operator within the same expression.
-After line 93, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop (line 96 to 99) which sets any old upper digits to zero.
-
-\subsection{Low Level Subtraction}
-The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principal difference is that the
-unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must
-be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.
-This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
-
-
-For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
-the range $0 \le x < 2\beta$ for the algorithms to work correctly.  However, it is allowable that a mp\_digit represent a larger range of values.  For
-this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a
-mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).
-
-For example, the default for LibTomMath is to use an ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
-
-\newpage\begin{figure}[!here]
-\begin{center}
-\begin{small}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_sub}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
-\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
-\hline \\
-1.  $min \leftarrow b.used$ \\
-2.  $max \leftarrow a.used$ \\
-3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{mp\_grow}) \\
-4.  $oldused \leftarrow c.used$ \\
-5.  $c.used \leftarrow max$ \\
-6.  $u \leftarrow 0$ \\
-7.  for $n$ from $0$ to $min - 1$ do \\
-\hspace{3mm}7.1  $c_n \leftarrow a_n - b_n - u$ \\
-\hspace{3mm}7.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
-\hspace{3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-8.  if $min < max$ then do \\
-\hspace{3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
-\hspace{6mm}8.1.1  $c_n \leftarrow a_n - u$ \\
-\hspace{6mm}8.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
-\hspace{6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-9. if $oldused > max$ then do \\
-\hspace{3mm}9.1  for $n$ from $max$ to $oldused - 1$ do \\
-\hspace{6mm}9.1.1  $c_n \leftarrow 0$ \\
-10. Clamp excess digits of $c$.  (\textit{mp\_clamp}). \\
-11. Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Algorithm s\_mp\_sub}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_sub.}
-This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
-passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
-algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
-of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
-
-The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$.  Steps 1 and 2
-set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at
-most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and
-set to the maximal count for the operation.
-
-The subtraction loop that begins on step seven is essentially the same as the addition loop of algorithm s\_mp\_add except single precision
-subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction
-loops.  Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry.
-
-For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$.  The least significant bit will force a carry upwards to
-the third bit which will be set to zero after the borrow.  After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain.  When the
-third bit of $0101_2$ is subtracted from the result it will cause another carry.  In this case though the carry will be forced to propagate all the
-way to the most significant bit.
-
-Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most
-significant bit.  Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
-is needed is a single zero or one bit for the carry.  Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the
-carry.  This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.
-
-If $b$ has a smaller magnitude than $a$ then step 8 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
-9 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
-018   int
-019   s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     int     olduse, res, min, max;
-022
-023     /* find sizes */
-024     min = b->used;
-025     max = a->used;
-026
-027     /* init result */
-028     if (c->alloc < max) \{
-029       if ((res = mp_grow (c, max)) != MP_OKAY) \{
-030         return res;
-031       \}
-032     \}
-033     olduse = c->used;
-034     c->used = max;
-035
-036     \{
-037       mp_digit u, *tmpa, *tmpb, *tmpc;
-038       int i;
-039
-040       /* alias for digit pointers */
-041       tmpa = a->dp;
-042       tmpb = b->dp;
-043       tmpc = c->dp;
-044
-045       /* set carry to zero */
-046       u = 0;
-047       for (i = 0; i < min; i++) \{
-048         /* T[i] = A[i] - B[i] - U */
-049         *tmpc = (*tmpa++ - *tmpb++) - u;
-050
-051         /* U = carry bit of T[i]
-052          * Note this saves performing an AND operation since
-053          * if a carry does occur it will propagate all the way to the
-054          * MSB.  As a result a single shift is enough to get the carry
-055          */
-056         u = *tmpc >> ((mp_digit)((CHAR_BIT * sizeof(mp_digit)) - 1));
-057
-058         /* Clear carry from T[i] */
-059         *tmpc++ &= MP_MASK;
-060       \}
-061
-062       /* now copy higher words if any, e.g. if A has more digits than B  */
-063       for (; i < max; i++) \{
-064         /* T[i] = A[i] - U */
-065         *tmpc = *tmpa++ - u;
-066
-067         /* U = carry bit of T[i] */
-068         u = *tmpc >> ((mp_digit)((CHAR_BIT * sizeof(mp_digit)) - 1));
-069
-070         /* Clear carry from T[i] */
-071         *tmpc++ &= MP_MASK;
-072       \}
-073
-074       /* clear digits above used (since we may not have grown result above) */
-
-075       for (i = c->used; i < olduse; i++) \{
-076         *tmpc++ = 0;
-077       \}
-078     \}
-079
-080     mp_clamp (c);
-081     return MP_OKAY;
-082   \}
-083
-084   #endif
-085
-\end{alltt}
-\end{small}
-
-Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded
-(lines 24 and 25).  In reality the $min$ and $max$ variables are only aliases and are only
-used to make the source code easier to read.  Again the pointer alias optimization is used
-within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
-(lines 41, 42 and 43) for $a$, $b$ and $c$ respectively.
-
-The first subtraction loop (lines 46 through 60) subtracts digits from both inputs until the smaller of
-the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward''
-method of extracting the carry (line 56).  The traditional method for extracting the carry would be to shift
-by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of
-the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry
-extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the
-most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This
-optimization only works on two's complement machines which is a safe assumption to make.
-
-If $a$ has a larger magnitude than $b$ an additional loop (lines 63 through 72) is required to propagate
-the carry through $a$ and copy the result to $c$.
-
-\subsection{High Level Addition}
-Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
-established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data
-types.
-
-Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign}
-flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
-
-\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_add}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
-\textbf{Output}.  The signed addition $c = a + b$. \\
-\hline \\
-1.  if $a.sign = b.sign$ then do \\
-\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
-\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
-2.  else do \\
-\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag})  \\
-\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
-\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
-\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_add}
-\end{figure}
-
-\textbf{Algorithm mp\_add.}
-This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from
-either \cite{TAOCPV2} or \cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly
-straightforward but restricted since subtraction can only produce positive results.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|}
-\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
-\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
-\hline &&&&\\
-
-\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
-\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
-
-\hline &&&&\\
-
-\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
-
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Addition Guide Chart}
-\label{fig:AddChart}
-\end{figure}
-
-Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three
-specific cases need to be handled.  The return codes of the unsigned operations at steps 1.2, 2.1.2 and 2.2.2 are
-forwarded to step three to check for errors.  This simplifies the description of the algorithm considerably and best
-follows how the implementation actually was achieved.
-
-Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
-s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
-to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.
-
-For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
-produce a result of $-0$.  However, since the sign is set first then the unsigned subtraction is performed the subsequent usage of algorithm mp\_clamp
-within algorithm s\_mp\_sub will force $-0$ to become $0$.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_add.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* high level addition (handles signs) */
-018   int mp_add (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     int     sa, sb, res;
-021
-022     /* get sign of both inputs */
-023     sa = a->sign;
-024     sb = b->sign;
-025
-026     /* handle two cases, not four */
-027     if (sa == sb) \{
-028       /* both positive or both negative */
-029       /* add their magnitudes, copy the sign */
-030       c->sign = sa;
-031       res = s_mp_add (a, b, c);
-032     \} else \{
-033       /* one positive, the other negative */
-034       /* subtract the one with the greater magnitude from */
-035       /* the one of the lesser magnitude.  The result gets */
-036       /* the sign of the one with the greater magnitude. */
-037       if (mp_cmp_mag (a, b) == MP_LT) \{
-038         c->sign = sb;
-039         res = s_mp_sub (b, a, c);
-040       \} else \{
-041         c->sign = sa;
-042         res = s_mp_sub (a, b, c);
-043       \}
-044     \}
-045     return res;
-046   \}
-047
-048   #endif
-049
-\end{alltt}
-\end{small}
-
-The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
-is used to pass result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
-explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is this algorithm will succeed or fail only if the lower
-level functions do so.  Returning their return code is sufficient.
-
-\subsection{High Level Subtraction}
-The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.
-
-\newpage\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_sub}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
-\textbf{Output}.  The signed subtraction $c = a - b$. \\
-\hline \\
-1.  if $a.sign \ne b.sign$ then do \\
-\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
-\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
-2.  else do \\
-\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
-\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
-\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{s\_mp\_sub}) \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
-                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
-                              MP\_NEG  &  \mbox{otherwise} \\
-                              \end{array} \right .$ \\
-\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_sub}
-\end{figure}
-
-\textbf{Algorithm mp\_sub.}
-This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or
-\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  Chart \ref{fig:SubChart} lists the eight possible inputs and
-the operations required.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|}
-\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
-\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
-\hline &&&& \\
-\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline &&&& \\
-\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
-\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Subtraction Guide Chart}
-\label{fig:SubChart}
-\end{figure}
-
-Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the
-algorithm from producing $-a - -a = -0$ as a result.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* high level subtraction (handles signs) */
-018   int
-019   mp_sub (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     int     sa, sb, res;
-022
-023     sa = a->sign;
-024     sb = b->sign;
-025
-026     if (sa != sb) \{
-027       /* subtract a negative from a positive, OR */
-028       /* subtract a positive from a negative. */
-029       /* In either case, ADD their magnitudes, */
-030       /* and use the sign of the first number. */
-031       c->sign = sa;
-032       res = s_mp_add (a, b, c);
-033     \} else \{
-034       /* subtract a positive from a positive, OR */
-035       /* subtract a negative from a negative. */
-036       /* First, take the difference between their */
-037       /* magnitudes, then... */
-038       if (mp_cmp_mag (a, b) != MP_LT) \{
-039         /* Copy the sign from the first */
-040         c->sign = sa;
-041         /* The first has a larger or equal magnitude */
-042         res = s_mp_sub (a, b, c);
-043       \} else \{
-044         /* The result has the *opposite* sign from */
-045         /* the first number. */
-046         c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-047         /* The second has a larger magnitude */
-048         res = s_mp_sub (b, a, c);
-049       \}
-050     \}
-051     return res;
-052   \}
-053
-054   #endif
-055
-\end{alltt}
-\end{small}
-
-Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
-and forward it to the end of the function.  On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a
-``greater than or equal to'' comparison.
-
-\section{Bit and Digit Shifting}
-It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.
-This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.
-
-In order to facilitate operations on polynomials in $x$ as above, a series of simple ``digit'' algorithms have to be established.  That is, to shift
-the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
-are on radix-$\beta$ digits.
-
-\subsection{Multiplication by Two}
-
-In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient
-operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_2}. \\
-\textbf{Input}.   One mp\_int $a$ \\
-\textbf{Output}.  $b = 2a$. \\
-\hline \\
-1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
-2.  $oldused \leftarrow b.used$ \\
-3.  $b.used \leftarrow a.used$ \\
-4.  $r \leftarrow 0$ \\
-5.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}5.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
-\hspace{3mm}5.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}5.3  $r \leftarrow rr$ \\
-6.  If $r \ne 0$ then do \\
-\hspace{3mm}6.1  $b_{n + 1} \leftarrow r$ \\
-\hspace{3mm}6.2  $b.used \leftarrow b.used + 1$ \\
-7.  If $b.used < oldused - 1$ then do \\
-\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
-\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
-8.  $b.sign \leftarrow a.sign$ \\
-9.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_2}
-\end{figure}
-
-\textbf{Algorithm mp\_mul\_2.}
-This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describes such
-an algorithm despite the fact it arises often in other algorithms.  The algorithm is set up much like the lower level algorithm s\_mp\_add since
-it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.
-
-Step 1 grows the destination as required to accommodate the maximum number of \textbf{used} digits in the result.  Step 2 saves the old \textbf{used} count
-of $b$ and the initial \textbf{used} count is set to $a.used$ at step 3.  Only if there is a final carry will the \textbf{used} count require adjustment.
-
-Step 5 is an optimized implementation of the addition loop for this specific case.  That is, since the two values being added together
-are the same there is no need to perform two reads from the digits of $a$.  Step 5.1 performs a single precision shift on the current digit $a_n$ to
-obtain what will be the carry for the next iteration.  Step 5.2 calculates the $n$'th digit of the result as a single precision shift of $a_n$ plus
-the previous carry.  Recall from section 4.1 that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with
-step 5.3 forwarding the carry to the next iteration.
-
-Step 6 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.
-Step 7 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* b = a*2 */
-018   int mp_mul_2(mp_int * a, mp_int * b)
-019   \{
-020     int     x, res, oldused;
-021
-022     /* grow to accomodate result */
-023     if (b->alloc < (a->used + 1)) \{
-024       if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{
-025         return res;
-026       \}
-027     \}
-028
-029     oldused = b->used;
-030     b->used = a->used;
-031
-032     \{
-033       mp_digit r, rr, *tmpa, *tmpb;
-034
-035       /* alias for source */
-036       tmpa = a->dp;
-037
-038       /* alias for dest */
-039       tmpb = b->dp;
-040
-041       /* carry */
-042       r = 0;
-043       for (x = 0; x < a->used; x++) \{
-044
-045         /* get what will be the *next* carry bit from the
-046          * MSB of the current digit
-047          */
-048         rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
-049
-050         /* now shift up this digit, add in the carry [from the previous] */
-051         *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
-052
-053         /* copy the carry that would be from the source
-054          * digit into the next iteration
-055          */
-056         r = rr;
-057       \}
-058
-059       /* new leading digit? */
-060       if (r != 0) \{
-061         /* add a MSB which is always 1 at this point */
-062         *tmpb = 1;
-063         ++(b->used);
-064       \}
-065
-066       /* now zero any excess digits on the destination
-067        * that we didn't write to
-068        */
-069       tmpb = b->dp + b->used;
-070       for (x = b->used; x < oldused; x++) \{
-071         *tmpb++ = 0;
-072       \}
-073     \}
-074     b->sign = a->sign;
-075     return MP_OKAY;
-076   \}
-077   #endif
-078
-\end{alltt}
-\end{small}
-
-This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
-is the use of the logical shift operator on line 51 to perform a single precision doubling.
-
-\subsection{Division by Two}
-A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_2}. \\
-\textbf{Input}.   One mp\_int $a$ \\
-\textbf{Output}.  $b = a/2$. \\
-\hline \\
-1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
-2.  If the reallocation failed return(\textit{MP\_MEM}). \\
-3.  $oldused \leftarrow b.used$ \\
-4.  $b.used \leftarrow a.used$ \\
-5.  $r \leftarrow 0$ \\
-6.  for $n$ from $b.used - 1$ to $0$ do \\
-\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
-\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}6.3  $r \leftarrow rr$ \\
-7.  If $b.used < oldused - 1$ then do \\
-\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
-\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
-8.  $b.sign \leftarrow a.sign$ \\
-9.  Clamp excess digits of $b$.  (\textit{mp\_clamp}) \\
-10.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_2}
-\end{figure}
-
-\textbf{Algorithm mp\_div\_2.}
-This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
-core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
-could be written to work from the trailing digit to the leading digit; however, it would have to stop one short of $a.used - 1$ digits to prevent
-reading past the end of the array of digits.
-
-Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the
-least significant bit not the most significant bit.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* b = a/2 */
-018   int mp_div_2(mp_int * a, mp_int * b)
-019   \{
-020     int     x, res, oldused;
-021
-022     /* copy */
-023     if (b->alloc < a->used) \{
-024       if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
-025         return res;
-026       \}
-027     \}
-028
-029     oldused = b->used;
-030     b->used = a->used;
-031     \{
-032       mp_digit r, rr, *tmpa, *tmpb;
-033
-034       /* source alias */
-035       tmpa = a->dp + b->used - 1;
-036
-037       /* dest alias */
-038       tmpb = b->dp + b->used - 1;
-039
-040       /* carry */
-041       r = 0;
-042       for (x = b->used - 1; x >= 0; x--) \{
-043         /* get the carry for the next iteration */
-044         rr = *tmpa & 1;
-045
-046         /* shift the current digit, add in carry and store */
-047         *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
-048
-049         /* forward carry to next iteration */
-050         r = rr;
-051       \}
-052
-053       /* zero excess digits */
-054       tmpb = b->dp + b->used;
-055       for (x = b->used; x < oldused; x++) \{
-056         *tmpb++ = 0;
-057       \}
-058     \}
-059     b->sign = a->sign;
-060     mp_clamp (b);
-061     return MP_OKAY;
-062   \}
-063   #endif
-064
-\end{alltt}
-\end{small}
-
-\section{Polynomial Basis Operations}
-Recall from section 4.3 that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
-the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single
-place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
-division and Karatsuba multiplication.
-
-Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
-$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
-polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.
-
-\subsection{Multiplication by $x$}
-
-Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one
-degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
-multiplying by the integer $\beta$.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_lshd}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
-\hline \\
-1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
-2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{mp\_grow}). \\
-3.  If the reallocation failed return(\textit{MP\_MEM}). \\
-4.  $a.used \leftarrow a.used + b$ \\
-5.  $i \leftarrow a.used - 1$ \\
-6.  $j \leftarrow a.used - 1 - b$ \\
-7.  for $n$ from $a.used - 1$ to $b$ do \\
-\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
-\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
-\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
-8.  for $n$ from 0 to $b - 1$ do \\
-\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_lshd}
-\end{figure}
-
-\textbf{Algorithm mp\_lshd.}
-This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs
-from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location.  The
-motivation behind this change is due to the way this function is typically used.  Algorithms such as mp\_add store the result in an optionally
-different third mp\_int because the original inputs are often still required.  Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
-typically used on values where the original value is no longer required.  The algorithm will return success immediately if
-$b \le 0$ since the rest of the algorithm is only valid when $b > 0$.
-
-First the destination $a$ is grown as required to accommodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
-the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).
-The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on
-step 8 sets the lower $b$ digits to zero.
-
-\newpage
-\begin{center}
-\begin{figure}[here]
-\includegraphics{pics/sliding_window.ps}
-\caption{Sliding Window Movement}
-\label{pic:sliding_window}
-\end{figure}
-\end{center}
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* shift left a certain amount of digits */
-018   int mp_lshd (mp_int * a, int b)
-019   \{
-020     int     x, res;
-021
-022     /* if its less than zero return */
-023     if (b <= 0) \{
-024       return MP_OKAY;
-025     \}
-026
-027     /* grow to fit the new digits */
-028     if (a->alloc < (a->used + b)) \{
-029        if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{
-030          return res;
-031        \}
-032     \}
-033
-034     \{
-035       mp_digit *top, *bottom;
-036
-037       /* increment the used by the shift amount then copy upwards */
-038       a->used += b;
-039
-040       /* top */
-041       top = a->dp + a->used - 1;
-042
-043       /* base */
-044       bottom = (a->dp + a->used - 1) - b;
-045
-046       /* much like mp_rshd this is implemented using a sliding window
-047        * except the window goes the otherway around.  Copying from
-048        * the bottom to the top.  see bn_mp_rshd.c for more info.
-049        */
-050       for (x = a->used - 1; x >= b; x--) \{
-051         *top-- = *bottom--;
-052       \}
-053
-054       /* zero the lower digits */
-055       top = a->dp;
-056       for (x = 0; x < b; x++) \{
-057         *top++ = 0;
-058       \}
-059     \}
-060     return MP_OKAY;
-061   \}
-062   #endif
-063
-\end{alltt}
-\end{small}
-
-The if statement (line 23) ensures that the $b$ variable is greater than zero since we do not interpret negative
-shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This eliminates
-the need for an additional variable in the for loop.  The variable $top$ (line 41) is an alias
-for the leading digit while $bottom$ (line 44) is an alias for the trailing edge.  The aliases form a
-window of exactly $b$ digits over the input.
-
-\subsection{Division by $x$}
-
-Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_rshd}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
-\hline \\
-1.  If $b \le 0$ then return. \\
-2.  If $a.used \le b$ then do \\
-\hspace{3mm}2.1  Zero $a$.  (\textit{mp\_zero}). \\
-\hspace{3mm}2.2  Return. \\
-3.  $i \leftarrow 0$ \\
-4.  $j \leftarrow b$ \\
-5.  for $n$ from 0 to $a.used - b - 1$ do \\
-\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
-\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
-\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
-6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
-\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
-7.  $a.used \leftarrow a.used - b$ \\
-8.  Return. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_rshd}
-\end{figure}
-
-\textbf{Algorithm mp\_rshd.}
-This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by $\beta^b$ but much quicker since
-it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.
-
-If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
-to the shift count $b$ then it will simply zero the input and return.
-
-After the trivial cases of inputs have been handled the sliding window is set up.  Much like the case of algorithm mp\_lshd a sliding window that
-is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.
-Also the digits are copied from the leading to the trailing edge.
-
-Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* shift right a certain amount of digits */
-018   void mp_rshd (mp_int * a, int b)
-019   \{
-020     int     x;
-021
-022     /* if b <= 0 then ignore it */
-023     if (b <= 0) \{
-024       return;
-025     \}
-026
-027     /* if b >= used then simply zero it and return */
-028     if (a->used <= b) \{
-029       mp_zero (a);
-030       return;
-031     \}
-032
-033     \{
-034       mp_digit *bottom, *top;
-035
-036       /* shift the digits down */
-037
-038       /* bottom */
-039       bottom = a->dp;
-040
-041       /* top [offset into digits] */
-042       top = a->dp + b;
-043
-044       /* this is implemented as a sliding window where
-045        * the window is b-digits long and digits from
-046        * the top of the window are copied to the bottom
-047        *
-048        * e.g.
-049
-050        b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
-051                    /\symbol{92}                   |      ---->
-052                     \symbol{92}-------------------/      ---->
-053        */
-054       for (x = 0; x < (a->used - b); x++) \{
-055         *bottom++ = *top++;
-056       \}
-057
-058       /* zero the top digits */
-059       for (; x < a->used; x++) \{
-060         *bottom++ = 0;
-061       \}
-062     \}
-063
-064     /* remove excess digits */
-065     a->used -= b;
-066   \}
-067   #endif
-068
-\end{alltt}
-\end{small}
-
-The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
-form a sliding window except we copy in the other direction.  After the window (line 59) we then zero
-the upper digits of the input to make sure the result is correct.
-
-\section{Powers of Two}
-
-Now that algorithms for moving single bits as well as whole digits exist, algorithms for moving the ``in between'' distances are required.  For
-example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
-shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.
-
-\subsection{Multiplication by Power of Two}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
-\hline \\
-1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
-2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
-3.  If the reallocation failed return(\textit{MP\_MEM}). \\
-4.  If $b \ge lg(\beta)$ then \\
-\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
-\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
-5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-6.  If $d \ne 0$ then do \\
-\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
-\hspace{3mm}6.2  $r \leftarrow 0$ \\
-\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
-\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
-\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
-\hspace{3mm}6.4  If $r > 0$ then do \\
-\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
-\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
-7.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_mul\_2d.}
-This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
-quickly compute the product.
-
-First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than
-$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$
-left.
-
-After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts
-required.  If it is non-zero a modified shift loop is used to calculate the remaining product.
-Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
-variable is used to extract the upper $d$ bits to form the carry for the next iteration.
-
-This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to
-complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* shift left by a certain bit count */
-018   int mp_mul_2d (mp_int * a, int b, mp_int * c)
-019   \{
-020     mp_digit d;
-021     int      res;
-022
-023     /* copy */
-024     if (a != c) \{
-025        if ((res = mp_copy (a, c)) != MP_OKAY) \{
-026          return res;
-027        \}
-028     \}
-029
-030     if (c->alloc < (int)(c->used + (b / DIGIT_BIT) + 1)) \{
-031        if ((res = mp_grow (c, c->used + (b / DIGIT_BIT) + 1)) != MP_OKAY) \{
-032          return res;
-033        \}
-034     \}
-035
-036     /* shift by as many digits in the bit count */
-037     if (b >= (int)DIGIT_BIT) \{
-038       if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{
-039         return res;
-040       \}
-041     \}
-042
-043     /* shift any bit count < DIGIT_BIT */
-044     d = (mp_digit) (b % DIGIT_BIT);
-045     if (d != 0) \{
-046       mp_digit *tmpc, shift, mask, r, rr;
-047       int x;
-048
-049       /* bitmask for carries */
-050       mask = (((mp_digit)1) << d) - 1;
-051
-052       /* shift for msbs */
-053       shift = DIGIT_BIT - d;
-054
-055       /* alias */
-056       tmpc = c->dp;
-057
-058       /* carry */
-059       r    = 0;
-060       for (x = 0; x < c->used; x++) \{
-061         /* get the higher bits of the current word */
-062         rr = (*tmpc >> shift) & mask;
-063
-064         /* shift the current word and OR in the carry */
-065         *tmpc = ((*tmpc << d) | r) & MP_MASK;
-066         ++tmpc;
-067
-068         /* set the carry to the carry bits of the current word */
-069         r = rr;
-070       \}
-071
-072       /* set final carry */
-073       if (r != 0) \{
-074          c->dp[(c->used)++] = r;
-075       \}
-076     \}
-077     mp_clamp (c);
-078     return MP_OKAY;
-079   \}
-080   #endif
-081
-\end{alltt}
-\end{small}
-
-The shifting is performed in--place which means the first step (line 24) is to copy the input to the
-destination.  The call to mp\_copy() is skipped when the source and destination are the same mp\_int.  The destination then
-has to be grown (line 31) to accommodate the result.
-
-If the shift count $b$ is greater than or equal to $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples
-of $lg(\beta)$, leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits.  Inside the actual shift
-loop (lines 45 to 76) we make use of pre--computed values $shift$ and $mask$.   These are used to
-extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a
-chain between consecutive iterations to propagate the carry.
-
-\subsection{Division by Power of Two}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then do \\
-\hspace{3mm}1.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
-\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{mp\_zero}) \\
-\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
-2.  $c \leftarrow a$ \\
-3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-4.  If $b \ge lg(\beta)$ then do \\
-\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
-5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-6.  If $k \ne 0$ then do \\
-\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
-\hspace{3mm}6.2  $r \leftarrow 0$ \\
-\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
-\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
-\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
-\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
-7.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_div\_2d.}
-This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm
-mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
-by using algorithm mp\_mod\_2d.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* shift right by a certain bit count (store quotient in c, optional remaind
-      er in d) */
-018   int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
-019   \{
-020     mp_digit D, r, rr;
-021     int     x, res;
-022     mp_int  t;
-023
-024
-025     /* if the shift count is <= 0 then we do no work */
-026     if (b <= 0) \{
-027       res = mp_copy (a, c);
-028       if (d != NULL) \{
-029         mp_zero (d);
-030       \}
-031       return res;
-032     \}
-033
-034     if ((res = mp_init (&t)) != MP_OKAY) \{
-035       return res;
-036     \}
-037
-038     /* get the remainder */
-039     if (d != NULL) \{
-040       if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{
-041         mp_clear (&t);
-042         return res;
-043       \}
-044     \}
-045
-046     /* copy */
-047     if ((res = mp_copy (a, c)) != MP_OKAY) \{
-048       mp_clear (&t);
-049       return res;
-050     \}
-051
-052     /* shift by as many digits in the bit count */
-053     if (b >= (int)DIGIT_BIT) \{
-054       mp_rshd (c, b / DIGIT_BIT);
-055     \}
-056
-057     /* shift any bit count < DIGIT_BIT */
-058     D = (mp_digit) (b % DIGIT_BIT);
-059     if (D != 0) \{
-060       mp_digit *tmpc, mask, shift;
-061
-062       /* mask */
-063       mask = (((mp_digit)1) << D) - 1;
-064
-065       /* shift for lsb */
-066       shift = DIGIT_BIT - D;
-067
-068       /* alias */
-069       tmpc = c->dp + (c->used - 1);
-070
-071       /* carry */
-072       r = 0;
-073       for (x = c->used - 1; x >= 0; x--) \{
-074         /* get the lower  bits of this word in a temp */
-075         rr = *tmpc & mask;
-076
-077         /* shift the current word and mix in the carry bits from the previous
-      word */
-078         *tmpc = (*tmpc >> D) | (r << shift);
-079         --tmpc;
-080
-081         /* set the carry to the carry bits of the current word found above */
-082         r = rr;
-083       \}
-084     \}
-085     mp_clamp (c);
-086     if (d != NULL) \{
-087       mp_exch (&t, d);
-088     \}
-089     mp_clear (&t);
-090     return MP_OKAY;
-091   \}
-092   #endif
-093
-\end{alltt}
-\end{small}
-
-The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally
-ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the
-result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
-the quotient is obtained.
-
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
-the direction of the shifts.
-
-\subsection{Remainder of Division by Power of Two}
-
-The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
-algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mod\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then do \\
-\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{mp\_zero}) \\
-\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $b > a.used \cdot lg(\beta)$ then do \\
-\hspace{3mm}2.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
-\hspace{3mm}2.2  Return the result of step 2.1. \\
-3.  $c \leftarrow a$ \\
-4.  If step 3 failed return(\textit{MP\_MEM}). \\
-5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\
-\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
-6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
-8.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mod\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_mod\_2d.}
-This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the
-result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$
-is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* calc a value mod 2**b */
-018   int
-019   mp_mod_2d (mp_int * a, int b, mp_int * c)
-020   \{
-021     int     x, res;
-022
-023     /* if b is <= 0 then zero the int */
-024     if (b <= 0) \{
-025       mp_zero (c);
-026       return MP_OKAY;
-027     \}
-028
-029     /* if the modulus is larger than the value than return */
-030     if (b >= (int) (a->used * DIGIT_BIT)) \{
-031       res = mp_copy (a, c);
-032       return res;
-033     \}
-034
-035     /* copy */
-036     if ((res = mp_copy (a, c)) != MP_OKAY) \{
-037       return res;
-038     \}
-039
-040     /* zero digits above the last digit of the modulus */
-041     for (x = (b / DIGIT_BIT) + (((b % DIGIT_BIT) == 0) ? 0 : 1); x < c->used;
-      x++) \{
-042       c->dp[x] = 0;
-043     \}
-044     /* clear the digit that is not completely outside/inside the modulus */
-045     c->dp[b / DIGIT_BIT] &=
-046       (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digi
-      t) 1));
-047     mp_clamp (c);
-048     return MP_OKAY;
-049   \}
-050   #endif
-051
-\end{alltt}
-\end{small}
-
-We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
-than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
-perform some work to produce the remainder.
-
-Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce
-the number.  First we zero any digits above the last digit in $2^b$ (line 41).  Next we reduce the
-leading digit of both (line 45) and then mp\_clamp().
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
-                      & in $O(n)$ time. \\
-                      &\\
-$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming  \\
-                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
-                      & up to $64$ with a hamming weight less than three. \\
-                      &\\
-$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
-                      & $2^k - 1$ as well. \\
-                      &\\
-$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
-                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
-                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
-                      & calculation.  \\
-                      & \\
-$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
-                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
-                      & the cost of addition. \\
-                      & \\
-$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
-                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
-                      & \\
-$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
-                      & calculating the result of a signed comparison. \\
-                      &
-\end{tabular}
-
-\chapter{Multiplication and Squaring}
-\section{The Multipliers}
-For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of
-algorithms of any multiple precision integer package.  The set of multiplier algorithms include integer multiplication, squaring and modular reduction
-where in each of the algorithms single precision multiplication is the dominant operation performed.  This chapter will discuss integer multiplication
-and squaring, leaving modular reductions for the subsequent chapter.
-
-The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular
-exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  During a modular
-exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions,
-35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision
-multiplications.
-
-For centuries general purpose multiplication has required a lengthy $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied
-against every digit of the other multiplicand.  Traditional long-hand multiplication is based on this process;  while the techniques can differ the
-overall algorithm used is essentially the same.  Only ``recently'' have faster algorithms been studied.  First Karatsuba multiplication was discovered in
-1962.  This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.
-This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subsequently Fourier Transform based solutions.
-
-\section{Multiplication}
-\subsection{The Baseline Multiplication}
-\label{sec:basemult}
-\index{baseline multiplication}
-Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
-algorithm that school children are taught.  The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision
-multiplications are required.  More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required.  To
-simplify most discussions, it will be assumed that the inputs have comparable number of digits.
-
-The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be
-used.  This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible.    One important
-facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution.  The importance of this
-modification will become evident during the discussion of Barrett modular reduction.  Recall that for a $n$ and $m$ digit input the product
-will be at most $n + m$ digits.  Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.
-
-Recall from sub-section 4.2.2 the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend the variable set to
-include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The
-constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see sub-section 5.2.2 for more information}).
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
-\hline \\
-1.  If min$(a.used, b.used) < \delta$ then do \\
-\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}).  \\
-\hspace{3mm}1.2  Return the result of step 1.1 \\
-\\
-Allocate and initialize a temporary mp\_int. \\
-2.  Init $t$ to be of size $digs$ \\
-3.  If step 2 failed return(\textit{MP\_MEM}). \\
-4.  $t.used \leftarrow digs$ \\
-\\
-Compute the product. \\
-5.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}5.1  $u \leftarrow 0$ \\
-\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
-\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
-\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}5.5  if $ix + pb < digs$ then do \\
-\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
-6.  Clamp excess digits of $t$. \\
-7.  Swap $c$ with $t$ \\
-8.  Clear $t$ \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_mul\_digs}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_mul\_digs.}
-This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits.  While it may seem
-a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent
-algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.
-Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the
-inputs.
-
-The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   If the minimum digit count of either
-input is less than $\delta$, then the Comba method may be used instead.    After the Comba method is ruled out, the baseline algorithm begins.  A
-temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to
-compute products when either $a = c$ or $b = c$ without overwriting the inputs.
-
-All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output.  The $pb$ variable
-is given the count of digits to read from $b$ inside the nested loop.  If $pb < 1$ then no more output digits can be produced and the algorithm
-will exit the loop.  The best way to think of the loops are as a series of $pb \times 1$ multiplications.    That is, in each pass of the
-innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.
-
-For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
-visualized in the following table.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|l|}
-\hline   &&          & 5 & 7 & 6 & \\
-\hline   $\times$&&  & 2 & 4 & 1 & \\
-\hline &&&&&&\\
-  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
-  &2 &   3    & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
-  1 & 3 & 8 & 8 & 1 & 6 &   $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Long-Hand Multiplication Diagram}
-\end{figure}
-
-Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate
-count.  That is, in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
-
-Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable.  The multiplication on that step
-is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
-double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
-5.4.1 is propagated through the nested loop.  If the carry was not propagated immediately it would overflow the single precision digit
-$t_{ix+iy}$ and the result would be lost.
-
-At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  The carry does not have to be added to the $ix+pb$'th
-digit since that digit is assumed to be zero at this point.  However, if $ix + pb \ge digs$ the carry is not set as it would make the result
-exceed the precision requested.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* multiplies |a| * |b| and only computes upto digs digits of result
-018    * HAC pp. 595, Algorithm 14.12  Modified so you can control how
-019    * many digits of output are created.
-020    */
-021   int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-022   \{
-023     mp_int  t;
-024     int     res, pa, pb, ix, iy;
-025     mp_digit u;
-026     mp_word r;
-027     mp_digit tmpx, *tmpt, *tmpy;
-028
-029     /* can we use the fast multiplier? */
-030     if (((digs) < MP_WARRAY) &&
-031         (MIN (a->used, b->used) <
-032             (1 << ((CHAR_BIT * sizeof(mp_word)) - (2 * DIGIT_BIT))))) \{
-033       return fast_s_mp_mul_digs (a, b, c, digs);
-034     \}
-035
-036     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
-037       return res;
-038     \}
-039     t.used = digs;
-040
-041     /* compute the digits of the product directly */
-042     pa = a->used;
-043     for (ix = 0; ix < pa; ix++) \{
-044       /* set the carry to zero */
-045       u = 0;
-046
-047       /* limit ourselves to making digs digits of output */
-048       pb = MIN (b->used, digs - ix);
-049
-050       /* setup some aliases */
-051       /* copy of the digit from a used within the nested loop */
-052       tmpx = a->dp[ix];
-053
-054       /* an alias for the destination shifted ix places */
-055       tmpt = t.dp + ix;
-056
-057       /* an alias for the digits of b */
-058       tmpy = b->dp;
-059
-060       /* compute the columns of the output and propagate the carry */
-061       for (iy = 0; iy < pb; iy++) \{
-062         /* compute the column as a mp_word */
-063         r       = (mp_word)*tmpt +
-064                   ((mp_word)tmpx * (mp_word)*tmpy++) +
-065                   (mp_word)u;
-066
-067         /* the new column is the lower part of the result */
-068         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-069
-070         /* get the carry word from the result */
-071         u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-072       \}
-073       /* set carry if it is placed below digs */
-074       if ((ix + iy) < digs) \{
-075         *tmpt = u;
-076       \}
-077     \}
-078
-079     mp_clamp (&t);
-080     mp_exch (&t, c);
-081
-082     mp_clear (&t);
-083     return MP_OKAY;
-084   \}
-085   #endif
-086
-\end{alltt}
-\end{small}
-
-First we determine (line 30) if the Comba method can be used first since it's faster.  The conditions for
-using the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than
-\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is
-set to $\delta$ but can be reduced when memory is at a premium.
-
-If we cannot use the Comba method we proceed to setup the baseline routine.  We allocate the destination mp\_int
-$t$ (line 36) to the exact size of the output to avoid further re--allocations.  At this point we now
-begin the $O(n^2)$ loop.
-
-This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
-digits as output.  In each iteration of the outer loop the $pb$ variable is set (line 48) to the maximum
-number of inner loop iterations.
-
-Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
-carry from the previous iteration.  A particularly important observation is that most modern optimizing
-C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that
-is required for the product.  In x86 terms for example, this means using the MUL instruction.
-
-Each digit of the product is stored in turn (line 68) and the carry propagated (line 71) to the
-next iteration.
-
-\subsection{Faster Multiplication by the ``Comba'' Method}
-
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be
-computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement
-in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G.
-Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an
-interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written
-five years before.
-
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight
-twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products
-are produced then added together to form the final result.  In the baseline algorithm the columns are added together
-after each iteration to get the result instantaneously.
-
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at
-the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated
-after the nested loop to reduce the amount of work required. Succinctly, the first step of the algorithm is to compute
-the product vector $\vec x$ as follows.
-
-\begin{equation}
-\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
-\end{equation}
-
-Where $\vec x_n$ is the $n'th$ column of the output vector.  Consider the following example which computes the vector $\vec x$ for the multiplication
-of $576$ and $241$.
-
-\newpage\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|}
-  \hline &          & 5 & 7 & 6 & First Input\\
-  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
-\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
-                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
-   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
-\hline 10 & 34 & 45 & 31 & 6 & Final Result \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Comba Multiplication Diagram}
-\end{figure}
-
-At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.
-Now the columns must be fixed by propagating the carry upwards.  The resultant vector will have one extra dimension over the input vector which is
-congruent to adding a leading zero digit.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Comba Fixup}. \\
-\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
-\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
-\hline \\
-1.  for $n$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
-\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
-2.  Return($\vec x$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Comba Fixup}
-\end{figure}
-
-With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case
-$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
-efficient than the baseline algorithm why not simply always use this algorithm?
-
-\subsubsection{Column Weight.}
-At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output
-independently.  A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix
-the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
-three single precision multiplications.  If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
-an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit inputs the maximum weight of any column is
-min$(m, n)$ which is fairly obvious.
-
-The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
-from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
-two quantities we must not violate the following
-
-\begin{equation}
-k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
-\end{equation}
-
-Which reduces to
-
-\begin{equation}
-k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
-\end{equation}
-
-Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
-found.
-
-\begin{equation}
-k  < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
-\end{equation}
-
-The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 64$ which means that $k$ is bounded by $k < 257$.  In this configuration
-the smaller input may not have more than $256$ digits if the Comba method is to be used.  This is quite satisfactory for most applications since
-$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which is much larger than most public key cryptographic algorithms require.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
-1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
-2.  If step 1 failed return(\textit{MP\_MEM}).\\
-\\
-3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
-\\
-4.  $\_ \hat W \leftarrow 0$ \\
-5.  for $ix$ from 0 to $pa - 1$ do \\
-\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
-\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
-\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
-\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
-\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iz}b_{ty-iz}$ \\
-\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
-\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
-\\
-6.  $oldused \leftarrow c.used$ \\
-7.  $c.used \leftarrow digs$ \\
-8.  for $ix$ from $0$ to $pa$ do \\
-\hspace{3mm}8.1  $c_{ix} \leftarrow W_{ix}$ \\
-9.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
-\hspace{3mm}9.1 $c_{ix} \leftarrow 0$ \\
-\\
-10.  Clamp $c$. \\
-11.  Return MP\_OKAY. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_s\_mp\_mul\_digs}
-\label{fig:COMBAMULT}
-\end{figure}
-
-\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
-
-The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the
-loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
-reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
-
-The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
-$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the distance past $b.used$ the variable
-$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.
-
-The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
-means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
-pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to
-move $ty$ downwards so the equality remains valid.  The $iy$ variable is the number of iterations until
-$tx \ge a.used$ or $ty < 0$ occurs.
-
-After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
-into the next round by dividing $\_ \hat W$ by $\beta$.
-
-To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the
-cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require
-$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method requires only $O(pn^2 + qn)$ time, however in practice,
-the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
-and addition operations in the nested loop in parallel.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* Fast (comba) multiplier
-018    *
-019    * This is the fast column-array [comba] multiplier.  It is
-020    * designed to compute the columns of the product first
-021    * then handle the carries afterwards.  This has the effect
-022    * of making the nested loops that compute the columns very
-023    * simple and schedulable on super-scalar processors.
-024    *
-025    * This has been modified to produce a variable number of
-026    * digits of output so if say only a half-product is required
-027    * you don't have to compute the upper half (a feature
-028    * required for fast Barrett reduction).
-029    *
-030    * Based on Algorithm 14.12 on pp.595 of HAC.
-031    *
-032    */
-033   int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-034   \{
-035     int     olduse, res, pa, ix, iz;
-036     mp_digit W[MP_WARRAY];
-037     mp_word  _W;
-038
-039     /* grow the destination as required */
-040     if (c->alloc < digs) \{
-041       if ((res = mp_grow (c, digs)) != MP_OKAY) \{
-042         return res;
-043       \}
-044     \}
-045
-046     /* number of output digits to produce */
-047     pa = MIN(digs, a->used + b->used);
-048
-049     /* clear the carry */
-050     _W = 0;
-051     for (ix = 0; ix < pa; ix++) \{
-052         int      tx, ty;
-053         int      iy;
-054         mp_digit *tmpx, *tmpy;
-055
-056         /* get offsets into the two bignums */
-057         ty = MIN(b->used-1, ix);
-058         tx = ix - ty;
-059
-060         /* setup temp aliases */
-061         tmpx = a->dp + tx;
-062         tmpy = b->dp + ty;
-063
-064         /* this is the number of times the loop will iterrate, essentially
-065            while (tx++ < a->used && ty-- >= 0) \{ ... \}
-066          */
-067         iy = MIN(a->used-tx, ty+1);
-068
-069         /* execute loop */
-070         for (iz = 0; iz < iy; ++iz) \{
-071            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-072
-073         \}
-074
-075         /* store term */
-076         W[ix] = ((mp_digit)_W) & MP_MASK;
-077
-078         /* make next carry */
-079         _W = _W >> ((mp_word)DIGIT_BIT);
-080     \}
-081
-082     /* setup dest */
-083     olduse  = c->used;
-084     c->used = pa;
-085
-086     \{
-087       mp_digit *tmpc;
-088       tmpc = c->dp;
-089       for (ix = 0; ix < (pa + 1); ix++) \{
-090         /* now extract the previous digit [below the carry] */
-091         *tmpc++ = W[ix];
-092       \}
-093
-094       /* clear unused digits [that existed in the old copy of c] */
-095       for (; ix < olduse; ix++) \{
-096         *tmpc++ = 0;
-097       \}
-098     \}
-099     mp_clamp (c);
-100     return MP_OKAY;
-101   \}
-102   #endif
-103
-\end{alltt}
-\end{small}
-
-As per the pseudo--code we first calculate $pa$ (line 47) as the number of digits to output.  Next we begin the outer loop
-to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines 61, 62) to point
-inside the two multiplicands quickly.
-
-The inner loop (lines 70 to 73) of this implementation is where the tradeoff comes into play.  Originally this comba
-implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix
-the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write
-one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth
-is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often
-slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the
-compiler has aliased $\_ \hat W$ to a CPU register.
-
-After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 76, 79) to forward it as
-a carry for the next pass.  After the outer loop we use the final carry (line 76) as the last digit of the product.
-
-\subsection{Polynomial Basis Multiplication}
-To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
-the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and
-$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required.  In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
-
-The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$.  The coefficients $w_i$ will
-directly yield the desired product when $\beta$ is substituted for $x$.  The direct solution to solve for the $2n + 1$ coefficients
-requires $O(n^2)$ time and would in practice be slower than the Comba technique.
-
-However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown
-coefficients.   This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with
-Gaussian elimination.  This technique is also occasionally referred to as the \textit{interpolation technique} (\textit{references please...}) since in
-effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.
-
-The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible.  However, since
-$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place.  The benefit of this technique stems from the
-fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively.  As a result finding the $2n + 1$ relations required
-by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
-
-When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$.  The $\zeta_0$ term
-is simply the product $W(0) = w_0 = a_0 \cdot b_0$.  The $\zeta_1$ term is the product
-$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$.  The third point $\zeta_{\infty}$ is less obvious but rather
-simple to explain.  The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.
-The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$.  Note that the
-points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
-
-If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points}
-$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ for small values of $q$.  The term ``mirror point'' stems from the fact that
-$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$.  For
-example, when $n = 2$ and $q = 1$ then the following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
-
-\begin{eqnarray}
-\zeta_{2}                  = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
-16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
-\end{eqnarray}
-
-Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts.  For example, when $n = 2$ the
-polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$.  This technique of polynomial representation is known as Horner's method.
-
-As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications.  Each multiplication is of
-multiplicands that have $n$ times fewer digits than the inputs.  The asymptotic running time of this algorithm is
-$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}).  Figure~\ref{fig:exponent}
-summarizes the exponents for various values of $n$.
-
-\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Split into $n$ Parts} & \textbf{Exponent}  & \textbf{Notes}\\
-\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
-\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
-\hline $4$ & $1.403677461$ &\\
-\hline $5$ & $1.365212389$ &\\
-\hline $10$ & $1.278753601$ &\\
-\hline $100$ & $1.149426538$ &\\
-\hline $1000$ & $1.100270931$ &\\
-\hline $10000$ & $1.075252070$ &\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
-\label{fig:exponent}
-\end{figure}
-
-At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$.  However, the overhead
-of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
-numbers.
-
-\subsubsection{Cutoff Point}
-The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach.  However,
-the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved.  This makes the
-polynomial basis approach more costly to use with small inputs.
-
-Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}).  There exists a
-point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and
-when $m > y$ the Comba methods are slower than the polynomial basis algorithms.
-
-The exact location of $y$ depends on several key architectural elements of the computer platform in question.
-
-\begin{enumerate}
-\item  The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc.  For example
-on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$.  The higher the ratio in favour of multiplication the lower
-the cutoff point $y$ will be.
-
-\item  The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}).  Generally speaking, as the number of splits
-grows the complexity grows substantially.  Ideally solving the system will only involve addition, subtraction and shifting of integers.  This
-directly reflects on the ratio previously mentioned.
-
-\item  To a lesser extent memory bandwidth and function call overheads.  Provided the values are in the processor cache this is less of an
-influence over the cutoff point.
-
-\end{enumerate}
-
-A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met.  For example, if the point
-is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster.  Finding the cutoff points is fairly simple when
-a high resolution timer is available.
-
-\subsection{Karatsuba Multiplication}
-Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for
-general purpose multiplication.  Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with
-light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
-
-\begin{equation}
-f(x) \cdot g(x) = acx^2 + ((a + b)(c + d) - (ac + bd))x + bd
-\end{equation}
-
-Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
-this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns
-out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points
-$\zeta_0$, $\zeta_{\infty}$ and $\zeta_{1}$.  Consider the resultant system of equations.
-
-\begin{center}
-\begin{tabular}{rcrcrcrc}
-$\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$\zeta_{1}$ &      $=$ & $w_2$ & $+$ & $w_1$ & $+$ & $w_0$ \\
-$\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
-\end{tabular}
-\end{center}
-
-By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
-of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
-\hline \\
-1.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
-2.  If step 1 failed then return(\textit{MP\_MEM}). \\
-\\
-Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
-3.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
-4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
-6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
-7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
-\\
-Calculate the three products. \\
-8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
-9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 + x0$ (\textit{mp\_add}) \\
-11.  $x0 \leftarrow y1 + y0$ \\
-12.  $t1 \leftarrow t1 \cdot x0$ \\
-\\
-Calculate the middle term. \\
-13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow t1 - x0$ (\textit{s\_mp\_sub}) \\
-\\
-Calculate the final product. \\
-15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
-16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
-17.  $t1 \leftarrow x0y0 + t1$ \\
-18.  $c \leftarrow t1 + x1y1$ \\
-19.  Clear all of the temporary variables. \\
-20.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_karatsuba\_mul}
-\end{figure}
-
-\textbf{Algorithm mp\_karatsuba\_mul.}
-This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm.  It is loosely based on the description
-from Knuth \cite[pp. 294-295]{TAOCPV2}.
-
-\index{radix point}
-In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen.  The radix point chosen must
-be used for both of the inputs meaning that it must be smaller than the smallest input.  Step 3 chooses the radix point $B$ as half of the
-smallest input \textbf{used} count.  After the radix point is chosen the inputs are split into lower and upper halves.  Step 4 and 5
-compute the lower halves.  Step 6 and 7 compute the upper halves.
-
-After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 + x0$ has been computed.  By using $x0$ instead
-of an additional temporary variable, the algorithm can avoid an additional memory allocation operation.
-
-The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_mul.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* c = |a| * |b| using Karatsuba Multiplication using
-018    * three half size multiplications
-019    *
-020    * Let B represent the radix [e.g. 2**DIGIT_BIT] and
-021    * let n represent half of the number of digits in
-022    * the min(a,b)
-023    *
-024    * a = a1 * B**n + a0
-025    * b = b1 * B**n + b0
-026    *
-027    * Then, a * b =>
-028      a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B + a0b0
-029    *
-030    * Note that a1b1 and a0b0 are used twice and only need to be
-031    * computed once.  So in total three half size (half # of
-032    * digit) multiplications are performed, a0b0, a1b1 and
-033    * (a1+b1)(a0+b0)
-034    *
-035    * Note that a multiplication of half the digits requires
-036    * 1/4th the number of single precision multiplications so in
-037    * total after one call 25% of the single precision multiplications
-038    * are saved.  Note also that the call to mp_mul can end up back
-039    * in this function if the a0, a1, b0, or b1 are above the threshold.
-040    * This is known as divide-and-conquer and leads to the famous
-041    * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than
-042    * the standard O(N**2) that the baseline/comba methods use.
-043    * Generally though the overhead of this method doesn't pay off
-044    * until a certain size (N ~ 80) is reached.
-045    */
-046   int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
-047   \{
-048     mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
-049     int     B, err;
-050
-051     /* default the return code to an error */
-052     err = MP_MEM;
-053
-054     /* min # of digits */
-055     B = MIN (a->used, b->used);
-056
-057     /* now divide in two */
-058     B = B >> 1;
-059
-060     /* init copy all the temps */
-061     if (mp_init_size (&x0, B) != MP_OKAY)
-062       goto ERR;
-063     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-064       goto X0;
-065     if (mp_init_size (&y0, B) != MP_OKAY)
-066       goto X1;
-067     if (mp_init_size (&y1, b->used - B) != MP_OKAY)
-068       goto Y0;
-069
-070     /* init temps */
-071     if (mp_init_size (&t1, B * 2) != MP_OKAY)
-072       goto Y1;
-073     if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
-074       goto T1;
-075     if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
-076       goto X0Y0;
-077
-078     /* now shift the digits */
-079     x0.used = y0.used = B;
-080     x1.used = a->used - B;
-081     y1.used = b->used - B;
-082
-083     \{
-084       int x;
-085       mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
-086
-087       /* we copy the digits directly instead of using higher level functions
-088        * since we also need to shift the digits
-089        */
-090       tmpa = a->dp;
-091       tmpb = b->dp;
-092
-093       tmpx = x0.dp;
-094       tmpy = y0.dp;
-095       for (x = 0; x < B; x++) \{
-096         *tmpx++ = *tmpa++;
-097         *tmpy++ = *tmpb++;
-098       \}
-099
-100       tmpx = x1.dp;
-101       for (x = B; x < a->used; x++) \{
-102         *tmpx++ = *tmpa++;
-103       \}
-104
-105       tmpy = y1.dp;
-106       for (x = B; x < b->used; x++) \{
-107         *tmpy++ = *tmpb++;
-108       \}
-109     \}
-110
-111     /* only need to clamp the lower words since by definition the
-112      * upper words x1/y1 must have a known number of digits
-113      */
-114     mp_clamp (&x0);
-115     mp_clamp (&y0);
-116
-117     /* now calc the products x0y0 and x1y1 */
-118     /* after this x0 is no longer required, free temp [x0==t2]! */
-119     if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
-120       goto X1Y1;          /* x0y0 = x0*y0 */
-121     if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
-122       goto X1Y1;          /* x1y1 = x1*y1 */
-123
-124     /* now calc x1+x0 and y1+y0 */
-125     if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
-126       goto X1Y1;          /* t1 = x1 - x0 */
-127     if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
-128       goto X1Y1;          /* t2 = y1 - y0 */
-129     if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-130       goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
-131
-132     /* add x0y0 */
-133     if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
-134       goto X1Y1;          /* t2 = x0y0 + x1y1 */
-135     if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
-136       goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
-137
-138     /* shift by B */
-139     if (mp_lshd (&t1, B) != MP_OKAY)
-140       goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-141     if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
-142       goto X1Y1;          /* x1y1 = x1y1 << 2*B */
-143
-144     if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
-145       goto X1Y1;          /* t1 = x0y0 + t1 */
-146     if (mp_add (&t1, &x1y1, c) != MP_OKAY)
-147       goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
-148
-149     /* Algorithm succeeded set the return code to MP_OKAY */
-150     err = MP_OKAY;
-151
-152   X1Y1:mp_clear (&x1y1);
-153   X0Y0:mp_clear (&x0y0);
-154   T1:mp_clear (&t1);
-155   Y1:mp_clear (&y1);
-156   Y0:mp_clear (&y0);
-157   X1:mp_clear (&x1);
-158   X0:mp_clear (&x0);
-159   ERR:
-160     return err;
-161   \}
-162   #endif
-163
-\end{alltt}
-\end{small}
-
-The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
-wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
-to handle error recovery with a single piece of code.  Lines 61 to 75 handle initializing all of the temporary variables
-required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
-the temporaries that have been successfully allocated so far.
-
-The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large.  This saves the
-additional reallocation that would have been necessary.  Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
-number of digits for the next section of code.
-
-The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
-to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and
-\textbf{sign} members are copied first.  The first for loop on line 95 copies the lower halves.  Since they are both the same magnitude it
-is simpler to calculate both lower halves in a single loop.  The for loops on lines 101 and 106 calculate the upper halves $x1$ and
-$y1$ respectively.
-
-By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
-
-When line 150 is reached, the algorithm has completed successfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
-the same code that handles errors can be used to clear the temporary variables and return.
-
-\subsection{Toom-Cook $3$-Way Multiplication}
-Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 2$ except that the points  are
-chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce.  Here, the points $\zeta_{0}$,
-$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients
-of the $W(x)$.
-
-With the five relations that Toom-Cook specifies, the following system of equations is formed.
-
-\begin{center}
-\begin{tabular}{rcrcrcrcrcr}
-$\zeta_0$                    & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$  \\
-$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$  \\
-$\zeta_1$                    & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$  \\
-$\zeta_2$                    & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$  \\
-$\zeta_{\infty}$             & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$  \\
-\end{tabular}
-\end{center}
-
-A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
-of two, two divisions by three and one multiplication by three.  All of these $19$ sub-operations require less than quadratic time, meaning that
-the algorithm can be faster than a baseline multiplication.  However, the greater complexity of this algorithm places the cutoff point
-(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toom\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow  a  \cdot  b $ \\
-\hline \\
-Split $a$ and $b$ into three pieces.  E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
-1.  $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
-2.  $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-3.  $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-4.  $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-5.  $b_0 \leftarrow b \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-6.  $b_1 \leftarrow \lfloor b / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-7.  $b_2 \leftarrow \lfloor b / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-\\
-Find the five equations for $w_0, w_1, ..., w_4$. \\
-8.  $w_0 \leftarrow a_0 \cdot b_0$ \\
-9.  $w_4 \leftarrow a_2 \cdot b_2$ \\
-10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
-11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
-12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
-13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
-14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
-15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
-16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
-17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
-\\
-Continued on the next page.\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toom\_mul}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot  b $ \\
-\hline \\
-Now solve the system of equations. \\
-18. $w_1 \leftarrow w_1 - w_4$, $w_3 \leftarrow w_3 - w_0$ \\
-19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
-20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
-21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
-22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
-23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
-24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
-25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
-\\
-Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
-26. for $n$ from $1$ to $4$ do \\
-\hspace{3mm}26.1  $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
-27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
-28. Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toom\_mul (continued)}
-\end{figure}
-
-\textbf{Algorithm mp\_toom\_mul.}
-This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach.  Compared to the Karatsuba multiplication, this
-algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead.  In this
-description, several statements have been compounded to save space.  The intention is that the statements are executed from left to right across
-any given step.
-
-The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively.  From these smaller
-integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
-
-The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively.  The relations $w_1, w_2$ and $w_3$ correspond
-to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{1}$ and $\zeta_{2}$ respectively.  These are found using logical shifts to independently find
-$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
-
-After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients
-$w_1, w_2$ and $w_3$ to be isolated.  The steps 18 through 25 perform the system reduction required as previously described.  Each step of
-the reduction represents the comparable matrix operation that would be performed had this been performed by pencil.  For example, step 18 indicates
-that row $4$ must be subtracted from row $1$ and simultaneously row $0$ subtracted from row $3$.
-
-Once the coefficients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known.  By substituting $\beta^{k}$ for $x$, the integer
-result $a \cdot b$ is produced.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_toom\_mul.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* multiplication using the Toom-Cook 3-way algorithm
-018    *
-019    * Much more complicated than Karatsuba but has a lower
-020    * asymptotic running time of O(N**1.464).  This algorithm is
-021    * only particularly useful on VERY large inputs
-022    * (we're talking 1000s of digits here...).
-023   */
-024   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
-025   \{
-026       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
-027       int res, B;
-028
-029       /* init temps */
-030       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4,
-031                                &a0, &a1, &a2, &b0, &b1,
-032                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
-033          return res;
-034       \}
-035
-036       /* B */
-037       B = MIN(a->used, b->used) / 3;
-038
-039       /* a = a2 * B**2 + a1 * B + a0 */
-040       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
-041          goto ERR;
-042       \}
-043
-044       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
-045          goto ERR;
-046       \}
-047       mp_rshd(&a1, B);
-048       if ((res = mp_mod_2d(&a1, DIGIT_BIT * B, &a1)) != MP_OKAY) \{
-049          goto ERR;
-050       \}
-051
-052       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
-053          goto ERR;
-054       \}
-055       mp_rshd(&a2, B*2);
-056
-057       /* b = b2 * B**2 + b1 * B + b0 */
-058       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
-059          goto ERR;
-060       \}
-061
-062       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
-063          goto ERR;
-064       \}
-065       mp_rshd(&b1, B);
-066       (void)mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
-067
-068       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
-069          goto ERR;
-070       \}
-071       mp_rshd(&b2, B*2);
-072
-073       /* w0 = a0*b0 */
-074       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
-075          goto ERR;
-076       \}
-077
-078       /* w4 = a2 * b2 */
-079       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
-080          goto ERR;
-081       \}
-082
-083       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
-084       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
-085          goto ERR;
-086       \}
-087       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-088          goto ERR;
-089       \}
-090       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-091          goto ERR;
-092       \}
-093       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
-094          goto ERR;
-095       \}
-096
-097       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
-098          goto ERR;
-099       \}
-100       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-101          goto ERR;
-102       \}
-103       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-104          goto ERR;
-105       \}
-106       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
-107          goto ERR;
-108       \}
-109
-110       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
-111          goto ERR;
-112       \}
-113
-114       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
-115       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
-116          goto ERR;
-117       \}
-118       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-119          goto ERR;
-120       \}
-121       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-122          goto ERR;
-123       \}
-124       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-125          goto ERR;
-126       \}
-127
-128       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
-129          goto ERR;
-130       \}
-131       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-132          goto ERR;
-133       \}
-134       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-135          goto ERR;
-136       \}
-137       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-138          goto ERR;
-139       \}
-140
-141       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
-142          goto ERR;
-143       \}
-144
-145
-146       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
-147       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
-148          goto ERR;
-149       \}
-150       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-151          goto ERR;
-152       \}
-153       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
-154          goto ERR;
-155       \}
-156       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-157          goto ERR;
-158       \}
-159       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
-160          goto ERR;
-161       \}
-162
-163       /* now solve the matrix
-164
-165          0  0  0  0  1
-166          1  2  4  8  16
-167          1  1  1  1  1
-168          16 8  4  2  1
-169          1  0  0  0  0
-170
-171          using 12 subtractions, 4 shifts,
-172                 2 small divisions and 1 small multiplication
-173        */
-174
-175       /* r1 - r4 */
-176       if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
-177          goto ERR;
-178       \}
-179       /* r3 - r0 */
-180       if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
-181          goto ERR;
-182       \}
-183       /* r1/2 */
-184       if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
-185          goto ERR;
-186       \}
-187       /* r3/2 */
-188       if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
-189          goto ERR;
-190       \}
-191       /* r2 - r0 - r4 */
-192       if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
-193          goto ERR;
-194       \}
-195       if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
-196          goto ERR;
-197       \}
-198       /* r1 - r2 */
-199       if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-200          goto ERR;
-201       \}
-202       /* r3 - r2 */
-203       if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-204          goto ERR;
-205       \}
-206       /* r1 - 8r0 */
-207       if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
-208          goto ERR;
-209       \}
-210       if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
-211          goto ERR;
-212       \}
-213       /* r3 - 8r4 */
-214       if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
-215          goto ERR;
-216       \}
-217       if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
-218          goto ERR;
-219       \}
-220       /* 3r2 - r1 - r3 */
-221       if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
-222          goto ERR;
-223       \}
-224       if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
-225          goto ERR;
-226       \}
-227       if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
-228          goto ERR;
-229       \}
-230       /* r1 - r2 */
-231       if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-232          goto ERR;
-233       \}
-234       /* r3 - r2 */
-235       if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-236          goto ERR;
-237       \}
-238       /* r1/3 */
-239       if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
-240          goto ERR;
-241       \}
-242       /* r3/3 */
-243       if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
-244          goto ERR;
-245       \}
-246
-247       /* at this point shift W[n] by B*n */
-248       if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
-249          goto ERR;
-250       \}
-251       if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
-252          goto ERR;
-253       \}
-254       if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
-255          goto ERR;
-256       \}
-257       if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
-258          goto ERR;
-259       \}
-260
-261       if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
-262          goto ERR;
-263       \}
-264       if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
-265          goto ERR;
-266       \}
-267       if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
-268          goto ERR;
-269       \}
-270       if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
-271          goto ERR;
-272       \}
-273
-274   ERR:
-275       mp_clear_multi(&w0, &w1, &w2, &w3, &w4,
-276                      &a0, &a1, &a2, &b0, &b1,
-277                      &b2, &tmp1, &tmp2, NULL);
-278       return res;
-279   \}
-280
-281   #endif
-282
-\end{alltt}
-\end{small}
-
-The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very
-large numbers.  For example, a 10,000 digit multiplication takes approximately 99,282,205 fewer single precision multiplications with
-Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
-algorithm is not practical as Karatsuba has a much lower cutoff point.
-
-First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines 40 to 71) with
-combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similarly
-for $b$.
-
-Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
-we get those out of the way first (lines 74 and 79).  Next we compute $w1, w2$ and $w3$ using Horner's method.
-
-After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
-straightforward.
-
-\subsection{Signed Multiplication}
-Now that algorithms to handle multiplications of every useful dimension have been developed, a rather simple finishing touch is required.  So far all
-of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot b$ \\
-\hline \\
-1.  If $a.sign = b.sign$ then \\
-\hspace{3mm}1.1  $sign = MP\_ZPOS$ \\
-2.  else \\
-\hspace{3mm}2.1  $sign = MP\_NEG$ \\
-3.  If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then  \\
-\hspace{3mm}3.1  $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
-4.  else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
-\hspace{3mm}4.1  $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
-5.  else \\
-\hspace{3mm}5.1  $digs \leftarrow a.used + b.used + 1$ \\
-\hspace{3mm}5.2  If $digs < MP\_WARRAY$ and min$(a.used, b.used) \le \delta$ then \\
-\hspace{6mm}5.2.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs.  \\
-\hspace{3mm}5.3  else \\
-\hspace{6mm}5.3.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs.  \\
-6.  $c.sign \leftarrow sign$ \\
-7.  Return the result of the unsigned multiplication performed. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul}
-\end{figure}
-
-\textbf{Algorithm mp\_mul.}
-This algorithm performs the signed multiplication of two inputs.  It will make use of any of the three unsigned multiplication algorithms
-available when the input is of appropriate size.  The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
-s\_mp\_mul\_digs will clear it.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mul.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* high level multiplication (handles sign) */
-018   int mp_mul (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     int     res, neg;
-021     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-022
-023     /* use Toom-Cook? */
-024   #ifdef BN_MP_TOOM_MUL_C
-025     if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) \{
-026       res = mp_toom_mul(a, b, c);
-027     \} else
-028   #endif
-029   #ifdef BN_MP_KARATSUBA_MUL_C
-030     /* use Karatsuba? */
-031     if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) \{
-032       res = mp_karatsuba_mul (a, b, c);
-033     \} else
-034   #endif
-035     \{
-036       /* can we use the fast multiplier?
-037        *
-038        * The fast multiplier can be used if the output will
-039        * have less than MP_WARRAY digits and the number of
-040        * digits won't affect carry propagation
-041        */
-042       int     digs = a->used + b->used + 1;
-043
-044   #ifdef BN_FAST_S_MP_MUL_DIGS_C
-045       if ((digs < MP_WARRAY) &&
-046           (MIN(a->used, b->used) <=
-047            (1 << ((CHAR_BIT * sizeof(mp_word)) - (2 * DIGIT_BIT))))) \{
-048         res = fast_s_mp_mul_digs (a, b, c, digs);
-049       \} else
-050   #endif
-051       \{
-052   #ifdef BN_S_MP_MUL_DIGS_C
-053         res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
-054   #else
-055         res = MP_VAL;
-056   #endif
-057       \}
-058     \}
-059     c->sign = (c->used > 0) ? neg : MP_ZPOS;
-060     return res;
-061   \}
-062   #endif
-063
-\end{alltt}
-\end{small}
-
-The implementation is rather simplistic and is not particularly noteworthy.  Line 21 computes the sign of the result using the ``?''
-operator from the C programming language.  Line 47 computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.
-
-\section{Squaring}
-\label{sec:basesquare}
-
-Squaring is a special case of multiplication where both multiplicands are equal.  At first it may seem like there is no significant optimization
-available but in fact there is.  Consider the multiplication of $576$ against $241$.  In total there will be nine single precision multiplications
-performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot  6$, $2 \cdot 7$ and $2 \cdot 5$.  Now consider
-the multiplication of $123$ against $123$.  The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$,
-$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$.  On closer inspection some of the products are equivalent.  For example, $3 \cdot 2 = 2 \cdot 3$
-and $3 \cdot 1 = 1 \cdot 3$.
-
-For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
-required for multiplication.  The following diagram gives an example of the operations required.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{ccccc|c}
-&&1&2&3&\\
-$\times$ &&1&2&3&\\
-\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
-       & $2 \cdot 1$  & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
-         $1 \cdot 1$  & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
-\end{tabular}
-\end{center}
-\caption{Squaring Optimization Diagram}
-\end{figure}
-
-Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious.  For the purposes of this discussion let $x$
-represent the number being squared.  The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.
-
-The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product.  Every non-square term of a column will
-appear twice hence the name ``double product''.  Every odd column is made up entirely of double products.  In fact every column is made up of double
-products and at most one square (\textit{see the exercise section}).
-
-The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row,
-occurs at column $2k + 1$.  For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero.
-Column two of row one is a square and column three is the first unique column.
-
-\subsection{The Baseline Squaring Algorithm}
-The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
-will not handle.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits.  (\textit{mp\_init\_size}) \\
-2.  If step 1 failed return(\textit{MP\_MEM}) \\
-3.  $t.used \leftarrow 2 \cdot a.used + 1$ \\
-4.  For $ix$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}Calculate the square. \\
-\hspace{3mm}4.1  $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\
-\hspace{3mm}4.2  $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}Calculate the double products after the square. \\
-\hspace{3mm}4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}4.4  For $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.4.1  $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\
-\hspace{6mm}4.4.2  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}4.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}Set the last carry. \\
-\hspace{3mm}4.5  While $u > 0$ do \\
-\hspace{6mm}4.5.1  $iy \leftarrow iy + 1$ \\
-\hspace{6mm}4.5.2  $\hat r \leftarrow t_{ix + iy} + u$ \\
-\hspace{6mm}4.5.3  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}4.5.4  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-5.  Clamp excess digits of $t$.  (\textit{mp\_clamp}) \\
-6.  Exchange $b$ and $t$. \\
-7.  Clear $t$ (\textit{mp\_clear}) \\
-8.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_sqr.}
-This algorithm computes the square of an input using the three observations on squaring.  It is based fairly faithfully on  algorithm 14.16 of HAC
-\cite[pp.596-597]{HAC}.  Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring.  This allows the
-destination mp\_int to be the same as the source mp\_int.
-
-The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while
-the inner loop computes the columns of the partial result.  Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate
-the carry and compute the double products.
-
-The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this
-very algorithm.  The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that
-when it is multiplied by two, it can be properly represented by a mp\_word.
-
-Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial
-results calculated so far.  This involves expensive carry propagation which will be eliminated in the next algorithm.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sqr.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-018   int s_mp_sqr (mp_int * a, mp_int * b)
-019   \{
-020     mp_int  t;
-021     int     res, ix, iy, pa;
-022     mp_word r;
-023     mp_digit u, tmpx, *tmpt;
-024
-025     pa = a->used;
-026     if ((res = mp_init_size (&t, (2 * pa) + 1)) != MP_OKAY) \{
-027       return res;
-028     \}
-029
-030     /* default used is maximum possible size */
-031     t.used = (2 * pa) + 1;
-032
-033     for (ix = 0; ix < pa; ix++) \{
-034       /* first calculate the digit at 2*ix */
-035       /* calculate double precision result */
-036       r = (mp_word)t.dp[2*ix] +
-037           ((mp_word)a->dp[ix] * (mp_word)a->dp[ix]);
-038
-039       /* store lower part in result */
-040       t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-041
-042       /* get the carry */
-043       u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-044
-045       /* left hand side of A[ix] * A[iy] */
-046       tmpx        = a->dp[ix];
-047
-048       /* alias for where to store the results */
-049       tmpt        = t.dp + ((2 * ix) + 1);
-050
-051       for (iy = ix + 1; iy < pa; iy++) \{
-052         /* first calculate the product */
-053         r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
-054
-055         /* now calculate the double precision result, note we use
-056          * addition instead of *2 since it's easier to optimize
-057          */
-058         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
-059
-060         /* store lower part */
-061         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-062
-063         /* get carry */
-064         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-065       \}
-066       /* propagate upwards */
-067       while (u != ((mp_digit) 0)) \{
-068         r       = ((mp_word) *tmpt) + ((mp_word) u);
-069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-070         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-071       \}
-072     \}
-073
-074     mp_clamp (&t);
-075     mp_exch (&t, b);
-076     mp_clear (&t);
-077     return MP_OKAY;
-078   \}
-079   #endif
-080
-\end{alltt}
-\end{small}
-
-Inside the outer loop (line 33) the square term is calculated on line 36.  The carry (line 43) has been
-extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized
-(lines 46 and 49) to simplify the inner loop.  The doubling is performed using two
-additions (line 58) since it is usually faster than shifting, if not at least as fast.
-
-The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
-get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
-square a number.
-
-\subsection{Faster Squaring by the ``Comba'' Method}
-A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
-drawback that it must double the product inside the inner loop as well.  As for multiplication, the Comba technique can be used to eliminate these
-performance hazards.
-
-The first obvious solution is to make an array of mp\_words which will hold all of the columns.  This will indeed eliminate all of the carry
-propagation operations from the inner loop.  However, the inner product must still be doubled $O(n^2)$ times.  The solution stems from the simple fact
-that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
-$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.
-
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two
-mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and
-carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
-1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
-2.  If step 1 failed return(\textit{MP\_MEM}). \\
-\\
-3.  $pa \leftarrow 2 \cdot a.used$ \\
-4.  $\hat W1 \leftarrow 0$ \\
-5.  for $ix$ from $0$ to $pa - 1$ do \\
-\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
-\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
-\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
-\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
-\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
-\hspace{3mm}5.6  for $iz$ from $0$ to $iy - 1$ do \\
-\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
-\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
-\hspace{3mm}5.8  if $ix$ is even then \\
-\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
-\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
-\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
-\\
-6.  $oldused \leftarrow b.used$ \\
-7.  $b.used \leftarrow 2 \cdot a.used$ \\
-8.  for $ix$ from $0$ to $pa - 1$ do \\
-\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
-9.  for $ix$ from $pa$ to $oldused - 1$ do \\
-\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
-10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_s\_mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm
-s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.
-This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
-
-First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
-products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
-additional MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal to
-$a_5 \cdot a_3$.  In the multiplication case both products would be accumulated, as long as $5 < a.used$ and $3 \ge 0$ are maintained.  Since we double the sum
-of the products just outside the inner loop we have to avoid doing this.  This is also a good thing since we perform
-fewer multiplications and the routine ends up being faster.
-
-Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
-only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* the jist of squaring...
-018    * you do like mult except the offset of the tmpx [one that
-019    * starts closer to zero] can't equal the offset of tmpy.
-020    * So basically you set up iy like before then you min it with
-021    * (ty-tx) so that it never happens.  You double all those
-022    * you add in the inner loop
-023
-024   After that loop you do the squares and add them in.
-025   */
-026
-027   int fast_s_mp_sqr (mp_int * a, mp_int * b)
-028   \{
-029     int       olduse, res, pa, ix, iz;
-030     mp_digit   W[MP_WARRAY], *tmpx;
-031     mp_word   W1;
-032
-033     /* grow the destination as required */
-034     pa = a->used + a->used;
-035     if (b->alloc < pa) \{
-036       if ((res = mp_grow (b, pa)) != MP_OKAY) \{
-037         return res;
-038       \}
-039     \}
-040
-041     /* number of output digits to produce */
-042     W1 = 0;
-043     for (ix = 0; ix < pa; ix++) \{
-044         int      tx, ty, iy;
-045         mp_word  _W;
-046         mp_digit *tmpy;
-047
-048         /* clear counter */
-049         _W = 0;
-050
-051         /* get offsets into the two bignums */
-052         ty = MIN(a->used-1, ix);
-053         tx = ix - ty;
-054
-055         /* setup temp aliases */
-056         tmpx = a->dp + tx;
-057         tmpy = a->dp + ty;
-058
-059         /* this is the number of times the loop will iterrate, essentially
-060            while (tx++ < a->used && ty-- >= 0) \{ ... \}
-061          */
-062         iy = MIN(a->used-tx, ty+1);
-063
-064         /* now for squaring tx can never equal ty
-065          * we halve the distance since they approach at a rate of 2x
-066          * and we have to round because odd cases need to be executed
-067          */
-068         iy = MIN(iy, ((ty-tx)+1)>>1);
-069
-070         /* execute loop */
-071         for (iz = 0; iz < iy; iz++) \{
-072            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-073         \}
-074
-075         /* double the inner product and add carry */
-076         _W = _W + _W + W1;
-077
-078         /* even columns have the square term in them */
-079         if ((ix&1) == 0) \{
-080            _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
-081         \}
-082
-083         /* store it */
-084         W[ix] = (mp_digit)(_W & MP_MASK);
-085
-086         /* make next carry */
-087         W1 = _W >> ((mp_word)DIGIT_BIT);
-088     \}
-089
-090     /* setup dest */
-091     olduse  = b->used;
-092     b->used = a->used+a->used;
-093
-094     \{
-095       mp_digit *tmpb;
-096       tmpb = b->dp;
-097       for (ix = 0; ix < pa; ix++) \{
-098         *tmpb++ = W[ix] & MP_MASK;
-099       \}
-100
-101       /* clear unused digits [that existed in the old copy of c] */
-102       for (; ix < olduse; ix++) \{
-103         *tmpb++ = 0;
-104       \}
-105     \}
-106     mp_clamp (b);
-107     return MP_OKAY;
-108   \}
-109   #endif
-110
-\end{alltt}
-\end{small}
-
-This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for
-the special case of squaring.
-
-\subsection{Polynomial Basis Squaring}
-The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
-is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$.  Instead of performing $2n + 1$
-multiplications to find the $\zeta$ relations, squaring operations are performed instead.
-
-\subsection{Karatsuba Squaring}
-Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.
-Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  The Karatsuba equation can be modified to square a
-number with the following equation.
-
-\begin{equation}
-h(x) = a^2x^2 + \left ((a + b)^2 - (a^2 + b^2) \right )x + b^2
-\end{equation}
-
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a + b)^2$.  As in
-Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of
-$O \left ( n^{lg(3)} \right )$.
-
-If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm
-instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the
-time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff
-point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.
-
-Consider squaring a 200 digit number with this technique.  It will be split into two 100 digit halves which are subsequently squared.
-The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm.  If Karatsuba multiplication
-were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  Initialize the following temporary mp\_ints:  $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
-2.  If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
-\\
-Split the input.  e.g. $a = x1\beta^B + x0$ \\
-3.  $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
-4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-5.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
-\\
-Calculate the three squares. \\
-6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
-7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 + x0$ (\textit{s\_mp\_add}) \\
-9.  $t1 \leftarrow t1^2$ \\
-\\
-Compute the middle term. \\
-10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t1 - t2$ \\
-\\
-Compute final product. \\
-12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
-13.  $x1x1 \leftarrow x1x1\beta^{2B}$ \\
-14.  $t1 \leftarrow t1 + x0x0$ \\
-15.  $b \leftarrow t1 + x1x1$ \\
-16.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_karatsuba\_sqr}
-\end{figure}
-
-\textbf{Algorithm mp\_karatsuba\_sqr.}
-This algorithm computes the square of an input $a$ using the Karatsuba technique.  This algorithm is very similar to the Karatsuba based
-multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
-
-The radix point for squaring is simply placed exactly in the middle of the digits when the input has an even number of digits, otherwise it is
-placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
-as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
-
-By expanding $\left (x1 + x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $(x0 + x1)^2 - (x1^2 + x0^2)  = 2 \cdot x0 \cdot x1$.
-Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
-this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
-
-Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
-machine clock cycles.}.
-
-\begin{equation}
-5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
-\end{equation}
-
-For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$.  This implies that the following inequality should hold.
-\begin{center}
-\begin{tabular}{rcl}
-${5n \over 3} + 3n^2 + 3n$     & $<$ & ${n \over 3} + 6n^2$ \\
-${5 \over 3} + 3n + 3$     & $<$ & ${1 \over 3} + 6n$ \\
-${13 \over 9}$     & $<$ & $n$ \\
-\end{tabular}
-\end{center}
-
-This results in a cutoff point around $n = 2$.  As a consequence it is actually faster to compute the middle term the ``long way'' on processors
-where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication.  On
-the Intel P4 processor this ratio is 1:29 making this method even more beneficial.  The only common exception is the ARMv4 processor which has a
-ratio of 1:7.  } than simpler operations such as addition.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_sqr.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* Karatsuba squaring, computes b = a*a using three
-018    * half size squarings
-019    *
-020    * See comments of karatsuba_mul for details.  It
-021    * is essentially the same algorithm but merely
-022    * tuned to perform recursive squarings.
-023    */
-024   int mp_karatsuba_sqr (mp_int * a, mp_int * b)
-025   \{
-026     mp_int  x0, x1, t1, t2, x0x0, x1x1;
-027     int     B, err;
-028
-029     err = MP_MEM;
-030
-031     /* min # of digits */
-032     B = a->used;
-033
-034     /* now divide in two */
-035     B = B >> 1;
-036
-037     /* init copy all the temps */
-038     if (mp_init_size (&x0, B) != MP_OKAY)
-039       goto ERR;
-040     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-041       goto X0;
-042
-043     /* init temps */
-044     if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
-045       goto X1;
-046     if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
-047       goto T1;
-048     if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
-049       goto T2;
-050     if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
-051       goto X0X0;
-052
-053     \{
-054       int x;
-055       mp_digit *dst, *src;
-056
-057       src = a->dp;
-058
-059       /* now shift the digits */
-060       dst = x0.dp;
-061       for (x = 0; x < B; x++) \{
-062         *dst++ = *src++;
-063       \}
-064
-065       dst = x1.dp;
-066       for (x = B; x < a->used; x++) \{
-067         *dst++ = *src++;
-068       \}
-069     \}
-070
-071     x0.used = B;
-072     x1.used = a->used - B;
-073
-074     mp_clamp (&x0);
-075
-076     /* now calc the products x0*x0 and x1*x1 */
-077     if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-078       goto X1X1;           /* x0x0 = x0*x0 */
-079     if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-080       goto X1X1;           /* x1x1 = x1*x1 */
-081
-082     /* now calc (x1+x0)**2 */
-083     if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
-084       goto X1X1;           /* t1 = x1 - x0 */
-085     if (mp_sqr (&t1, &t1) != MP_OKAY)
-086       goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
-087
-088     /* add x0y0 */
-089     if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-090       goto X1X1;           /* t2 = x0x0 + x1x1 */
-091     if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
-092       goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
-093
-094     /* shift by B */
-095     if (mp_lshd (&t1, B) != MP_OKAY)
-096       goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
-097     if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-098       goto X1X1;           /* x1x1 = x1x1 << 2*B */
-099
-100     if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-101       goto X1X1;           /* t1 = x0x0 + t1 */
-102     if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-103       goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
-104
-105     err = MP_OKAY;
-106
-107   X1X1:mp_clear (&x1x1);
-108   X0X0:mp_clear (&x0x0);
-109   T2:mp_clear (&t2);
-110   T1:mp_clear (&t1);
-111   X1:mp_clear (&x1);
-112   X0:mp_clear (&x0);
-113   ERR:
-114     return err;
-115   \}
-116   #endif
-117
-\end{alltt}
-\end{small}
-
-This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and
-shift the input into the two halves.  The loop from line 53 to line 69 has been modified since only one input exists.  The \textbf{used}
-count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
-to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.
-
-By inlining the copy and shift operations the cutoff point for Karatsuba multiplication can be lowered.  On the Athlon the cutoff point
-is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
-it is actually below the Comba limit (\textit{at 110 digits}).
-
-This routine uses the same error trap coding style as mp\_karatsuba\_mul.  As the temporary variables are initialized errors are
-redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and
-mp\_clears are executed normally.
-
-\subsection{Toom-Cook Squaring}
-The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to
-derive their own Toom-Cook squaring algorithm.
-
-\subsection{High Level Squaring}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  If $a.used \ge TOOM\_SQR\_CUTOFF$ then  \\
-\hspace{3mm}1.1  $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
-2.  else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
-\hspace{3mm}2.1  $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
-3.  else \\
-\hspace{3mm}3.1  $digs \leftarrow 2 \cdot a.used + 1$ \\
-\hspace{3mm}3.2  If $digs < MP\_ARRAY$ and $a.used \le \delta$ then \\
-\hspace{6mm}3.2.1  $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr.  \\
-\hspace{3mm}3.3  else \\
-\hspace{6mm}3.3.1  $b \leftarrow a^2$ using algorithm s\_mp\_sqr.  \\
-4.  $b.sign \leftarrow MP\_ZPOS$ \\
-5.  Return the result of the unsigned squaring performed. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm mp\_sqr.}
-This algorithm computes the square of the input using one of four different algorithms.  If the input is very large and has at least
-\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used.  If
-neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_sqr.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* computes b = a*a */
-018   int
-019   mp_sqr (mp_int * a, mp_int * b)
-020   \{
-021     int     res;
-022
-023   #ifdef BN_MP_TOOM_SQR_C
-024     /* use Toom-Cook? */
-025     if (a->used >= TOOM_SQR_CUTOFF) \{
-026       res = mp_toom_sqr(a, b);
-027     /* Karatsuba? */
-028     \} else
-029   #endif
-030   #ifdef BN_MP_KARATSUBA_SQR_C
-031     if (a->used >= KARATSUBA_SQR_CUTOFF) \{
-032       res = mp_karatsuba_sqr (a, b);
-033     \} else
-034   #endif
-035     \{
-036   #ifdef BN_FAST_S_MP_SQR_C
-037       /* can we use the fast comba multiplier? */
-038       if ((((a->used * 2) + 1) < MP_WARRAY) &&
-039            (a->used <
-040            (1 << (((sizeof(mp_word) * CHAR_BIT) - (2 * DIGIT_BIT)) - 1)))) \{
-041         res = fast_s_mp_sqr (a, b);
-042       \} else
-043   #endif
-044       \{
-045   #ifdef BN_S_MP_SQR_C
-046         res = s_mp_sqr (a, b);
-047   #else
-048         res = MP_VAL;
-049   #endif
-050       \}
-051     \}
-052     b->sign = MP_ZPOS;
-053     return res;
-054   \}
-055   #endif
-056
-\end{alltt}
-\end{small}
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
-                      & that have different number of digits in Karatsuba multiplication. \\
-                      & \\
-$\left [ 2 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
-                      & of double products and at most one square is stated.  Prove this statement. \\
-                      & \\
-$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
-                      & \\
-$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
-                      & \\
-$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
-                      & required for equation $6.7$ to be true.  \\
-                      & \\
-$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
-                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
-                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
-                      &\\
-$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
-                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
-                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
-                      & \\
-\end{tabular}
-
-\chapter{Modular Reduction}
-\section{Basics of Modular Reduction}
-\index{modular residue}
-Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms,
-such as factoring.  Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set.  A number $a$ is said to be \textit{reduced}
-modulo another number $b$ by finding the remainder of the division $a/b$.  Full integer division with remainder is a topic to be covered
-in~\ref{sec:division}.
-
-Modular reduction is equivalent to solving for $r$ in the following equation.  $a = bq + r$ where $q = \lfloor a/b \rfloor$.  The result
-$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$.  In other vernacular $r$ is known as the
-``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
-other forms of residues.
-
-Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions
-is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the
-RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in
-elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular
-exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the
-range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check
-algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoretic problems.
-
-\section{The Barrett Reduction}
-The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
-division.  Barrett's observation was that the residue $c$ of $a$ modulo $b$ is equal to
-
-\begin{equation}
-c = a - b \cdot \lfloor a/b \rfloor
-\end{equation}
-
-Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP\footnote{It is worth noting that Barrett's paper
-targeted the DSP56K processor.}  intuition would indicate the next step would be to replace $a/b$ by a multiplication by the reciprocal.  However,
-DSP intuition on its own will not work as these numbers are considerably larger than the precision of common DSP floating point data types.
-It would take another common optimization to optimize the algorithm.
-
-\subsection{Fixed Point Arithmetic}
-The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers.  Fixed
-point arithmetic would become very popular as it greatly optimized the ``3d-shooter'' genre of games in the mid 1990s when floating point units were
-fairly slow if not unavailable.   The idea behind fixed point arithmetic is to take a normal $k$-bit integer data type and break it into a $p$-bit
-integer part and a $q$-bit fraction part (\textit{where $p+q = k$}).
-
-In this system a $k$-bit integer $n$ would actually represent $n/2^q$.  For example, with $q = 4$ the integer $n = 37$ would actually represent the
-value $2.3125$.  To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized by
-moving the implied decimal point back to where it should be.  For example, with $q = 4$ to multiply the integers $9$ and $5$ they must be converted
-to fixed point first by multiplying by $2^q$.  Let $a = 9(2^q)$ represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the
-fixed point representation of $5$.  The product $ab$ is equal to $45(2^{2q})$ which when normalized by dividing by $2^q$ produces $45(2^q)$.
-
-This technique became popular since a normal integer multiplication and logical shift right are the only required operations to perform a multiplication
-of two fixed point numbers.  Using fixed point arithmetic, division can be easily approximated by multiplying by the reciprocal.  If $2^q$ is
-equivalent to one then $2^q/b$ is equivalent to the fixed point approximation of $1/b$ using real arithmetic.  Using this fact dividing an integer
-$a$ by another integer $b$ can be achieved with the following expression.
-
-\begin{equation}
-\lfloor a / b \rfloor \mbox{ }\approx\mbox{ } \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
-\end{equation}
-
-The precision of the division is proportional to the value of $q$.  If the divisor $b$ is used frequently as is the case with
-modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift.  Both operations
-are considerably faster than division on most processors.
-
-Consider dividing $19$ by $5$.  The correct result is $\lfloor 19/5 \rfloor = 3$.  With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
-leads to a product of $19$ which when divided by $2^q$ produces $2$.  However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
-the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct.  The value of $2^q$ must be close to or ideally
-larger than the dividend.  In effect if $a$ is the dividend then $q$ should allow $0 \le \lfloor a/2^q \rfloor \le 1$ in order for this approach
-to work correctly.  Plugging this form of division into the original equation the following modular residue equation arises.
-
-\begin{equation}
-c = a - b \cdot \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
-\end{equation}
-
-Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol.  Using the $\mu$
-variable also helps re-inforce the idea that it is meant to be computed once and re-used.
-
-\begin{equation}
-c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
-\end{equation}
-
-Provided that $2^q \ge a$ this algorithm will produce a quotient that is either exactly correct or off by a value of one.  In the context of Barrett
-reduction the value of $a$ is bound by $0 \le a \le (b - 1)^2$ meaning that $2^q \ge b^2$ is sufficient to ensure the reciprocal will have enough
-precision.
-
-Let $n$ represent the number of digits in $b$.  This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and
-another $n^2$ single precision multiplications to find the residue.  In total $3n^2$ single precision multiplications are required to
-reduce the number.
-
-For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$.  Consider reducing
-$a = 180388626447$ modulo $b$ using the above reduction equation.  The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
-By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
-
-\subsection{Choosing a Radix Point}
-Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications.  If that were the best
-that could be achieved a full division\footnote{A division requires approximately $O(2cn^2)$ single precision multiplications for a small value of $c$.
-See~\ref{sec:division} for further details.} might as well be used in its place.  The key to optimizing the reduction is to reduce the precision of
-the initial multiplication that finds the quotient.
-
-Let $a$ represent the number of which the residue is sought.  Let $b$ represent the modulus used to find the residue.  Let $m$ represent
-the number of digits in $b$.  For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$, which is generally true if
-two $m$-digit numbers have been multiplied.  Dividing $a$ by $b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer.  Digits below the
-$m - 1$'th digit of $a$ will contribute at most a value of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$.  Another way to
-express this is by re-writing $a$ as two parts.  If $a' \equiv a \mbox{ (mod }\beta^{m-1}\mbox{)}$ and $a'' = a - a'$ then
-${a \over b} \equiv {{a' + a''} \over b}$ which is equivalent to ${a' \over b} + {a'' \over b}$.  Since $a'$ is bound to be less than $b$ the quotient
-is bound by $0 \le {a' \over b} < 1$.
-
-Since the digits of $a'$ do not contribute much to the quotient the observation is that they might as well be zero.  However, if the digits
-``might as well be zero'' they might as well not be there in the first place.  Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
-with the irrelevant digits trimmed.  Now the modular reduction is trimmed to the almost equivalent equation
-
-\begin{equation}
-c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
-\end{equation}
-
-Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$ where in this case $q$ is a multiple of $lg(\beta)$. Also note that the
-exponent on the divisor when added to the amount $q_0$ was shifted by equals $2m$.  If the optimization had not been performed the divisor
-would have the exponent $2m$ so in the end the exponents do ``add up''. Using the above equation the quotient
-$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most two.  The original fixed point quotient can be off
-by as much as one (\textit{provided the radix point is chosen suitably}) and now that the lower irrelevant digits have been trimmed the quotient
-can be off by an additional value of one for a total of at most two.  This implies that
-$0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  By first subtracting $b$ times the quotient and then conditionally subtracting
-$b$ once or twice the residue is found.
-
-The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
-precision multiplications, ignoring the subtractions required.  In total $2m^2 + m$ single precision multiplications are required to find the residue.
-This is considerably faster than the original attempt.
-
-For example, let $\beta = 10$ represent the radix of the digits.  Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$
-represent the value of which the residue is desired.  In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.
-With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$.  The quotient is then
-$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$.  Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$
-is found.
-
-\subsection{Trimming the Quotient}
-So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications.  As
-it stands now the algorithm is already fairly fast compared to a full integer division algorithm.  However, there is still room for
-optimization.
-
-After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
-half of the product.  It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision
-multiplications.  If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.
-In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.
-
-The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number.  Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
-multiplications would be required.  Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
-of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.
-
-\subsection{Trimming the Residue}
-After the quotient has been calculated it is used to reduce the input.  As previously noted the algorithm is not exact and it can be off by a small
-multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  If $b$ is $m$ digits then the
-result of the reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
-implicitly zero.
-
-The next optimization arises from this very fact.  Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
-$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed.  Similarly the value of $a$ can
-be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifies the subtraction as well.  A multiplication that produces
-only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.
-
-With both optimizations in place the algorithm is the algorithm Barrett proposed.  It requires $m^2 + 2m - 1$ single precision multiplications which
-is considerably faster than the straightforward $3m^2$ method.
-
-\subsection{The Barrett Algorithm}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor, m = \lceil lg_{\beta}(b) \rceil, (0 \le a < b^2, b > 1)$ \\
-\textbf{Output}.  $a \mbox{ (mod }b\mbox{)}$ \\
-\hline \\
-Let $m$ represent the number of digits in $b$.  \\
-1.  Make a copy of $a$ and store it in $q$.  (\textit{mp\_init\_copy}) \\
-2.  $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
-\\
-Produce the quotient. \\
-3.  $q \leftarrow q \cdot \mu$  (\textit{note: only produce digits at or above $m-1$}) \\
-4.  $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
-\\
-Subtract the multiple of modulus from the input. \\
-5.  $a \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-6.  $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
-7.  $a \leftarrow a - q$ (\textit{mp\_sub}) \\
-\\
-Add $\beta^{m+1}$ if a carry occurred. \\
-8.  If $a < 0$ then (\textit{mp\_cmp\_d}) \\
-\hspace{3mm}8.1  $q \leftarrow 1$ (\textit{mp\_set}) \\
-\hspace{3mm}8.2  $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
-\hspace{3mm}8.3  $a \leftarrow a + q$ \\
-\\
-Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
-9.  While $a \ge b$ do (\textit{mp\_cmp}) \\
-\hspace{3mm}9.1  $a \leftarrow a - b$ \\
-10.  Clear $q$. \\
-11.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce.}
-This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm.  It is loosely based on algorithm 14.42 of HAC
-\cite[pp.  602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}.  The algorithm has several restrictions and assumptions which must
-be adhered to for the algorithm to work.
-
-First the modulus $b$ is assumed to be positive and greater than one.  If the modulus were less than or equal to one then subtracting
-a multiple of it would either accomplish nothing or actually enlarge the input.  The input $a$ must be in the range $0 \le a < b^2$ in order
-for the quotient to have enough precision.  If $a$ is the product of two numbers that were already reduced modulo $b$, this will not be a problem.
-Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish.  The value of $\mu$ is passed as an argument to this
-algorithm and is assumed to be calculated and stored before the algorithm is used.
-
-Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position.  An algorithm called
-$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task.  The algorithm is based on $s\_mp\_mul\_digs$ except that
-instead of stopping at a given level of precision it starts at a given level of precision.  This optimal algorithm can only be used if the number
-of digits in $b$ is very much smaller than $\beta$.
-
-While it is known that
-$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue, so an implied
-``borrow'' from the higher digits might leave a negative result.  After the multiple of the modulus has been subtracted from $a$ the residue must be
-fixed up in case it is negative.  The invariant $\beta^{m+1}$ must be added to the residue to make it positive again.
-
-The while loop at step 9 will subtract $b$ until the residue is less than $b$.  If the algorithm is performed correctly this step is
-performed at most twice, and on average once. However, if $a \ge b^2$ then it will iterate substantially more times than it should.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* reduces x mod m, assumes 0 < x < m**2, mu is
-018    * precomputed via mp_reduce_setup.
-019    * From HAC pp.604 Algorithm 14.42
-020    */
-021   int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
-022   \{
-023     mp_int  q;
-024     int     res, um = m->used;
-025
-026     /* q = x */
-027     if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{
-028       return res;
-029     \}
-030
-031     /* q1 = x / b**(k-1)  */
-032     mp_rshd (&q, um - 1);
-033
-034     /* according to HAC this optimization is ok */
-035     if (((mp_digit) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{
-036       if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{
-037         goto CLEANUP;
-038       \}
-039     \} else \{
-040   #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-041       if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) \{
-042         goto CLEANUP;
-043       \}
-044   #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-045       if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) \{
-046         goto CLEANUP;
-047       \}
-048   #else
-049       \{
-050         res = MP_VAL;
-051         goto CLEANUP;
-052       \}
-053   #endif
-054     \}
-055
-056     /* q3 = q2 / b**(k+1) */
-057     mp_rshd (&q, um + 1);
-058
-059     /* x = x mod b**(k+1), quick (no division) */
-060     if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{
-061       goto CLEANUP;
-062     \}
-063
-064     /* q = q * m mod b**(k+1), quick (no division) */
-065     if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{
-066       goto CLEANUP;
-067     \}
-068
-069     /* x = x - q */
-070     if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{
-071       goto CLEANUP;
-072     \}
-073
-074     /* If x < 0, add b**(k+1) to it */
-075     if (mp_cmp_d (x, 0) == MP_LT) \{
-076       mp_set (&q, 1);
-077       if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
-078         goto CLEANUP;
-079       if ((res = mp_add (x, &q, x)) != MP_OKAY)
-080         goto CLEANUP;
-081     \}
-082
-083     /* Back off if it's too big */
-084     while (mp_cmp (x, m) != MP_LT) \{
-085       if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{
-086         goto CLEANUP;
-087       \}
-088     \}
-089
-090   CLEANUP:
-091     mp_clear (&q);
-092
-093     return res;
-094   \}
-095   #endif
-096
-\end{alltt}
-\end{small}
-
-The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
-the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
-in the modulus.  In the source code this is evaluated on lines 36 to 43 where algorithm s\_mp\_mul\_high\_digs is used when it is
-safe to do so.
-
-\subsection{The Barrett Setup Algorithm}
-In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
-future use so that the Barrett algorithm can be used without delay.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_setup}. \\
-\textbf{Input}.   mp\_int $b$ ($b > 1$)  \\
-\textbf{Output}.  $\mu \leftarrow \lfloor \beta^{2m}/b \rfloor$ \\
-\hline \\
-1.  $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot  m}$ (\textit{mp\_2expt}) \\
-2.  $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\
-3.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_setup.}
-This algorithm computes the reciprocal $\mu$ required for Barrett reduction.  First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot  m}$ which
-is equivalent and much faster.  The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_setup.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* pre-calculate the value required for Barrett reduction
-018    * For a given modulus "b" it calulates the value required in "a"
-019    */
-020   int mp_reduce_setup (mp_int * a, mp_int * b)
-021   \{
-022     int     res;
-023
-024     if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) \{
-025       return res;
-026     \}
-027     return mp_div (a, b, a, NULL);
-028   \}
-029   #endif
-030
-\end{alltt}
-\end{small}
-
-This simple routine calculates the reciprocal $\mu$ required by Barrett reduction.  Note the extended usage of algorithm mp\_div where the variable
-which would receive the remainder is passed as NULL.  As will be discussed in~\ref{sec:division} the division routine allows both the quotient and the
-remainder to be passed as NULL meaning to ignore the value.
-
-\section{The Montgomery Reduction}
-Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting
-form of reduction in common use.  It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a
-residue times a constant.  However, as perplexing as this may sound the algorithm is relatively simple and very efficient.
-
-Throughout this entire section the variable $n$ will represent the modulus used to form the residue.  As will be discussed shortly the value of
-$n$ must be odd.  The variable $x$ will represent the quantity of which the residue is sought.  Similar to the Barrett algorithm the input
-is restricted to $0 \le x < n^2$.  To begin the description some simple number theory facts must be established.
-
-\textbf{Fact 1.}  Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$.  Another way
-to explain this is that $n$ is (\textit{or multiples of $n$ are}) congruent to zero modulo $n$.  Adding zero will not change the value of the residue.
-
-\textbf{Fact 2.}  If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$.  Actually
-this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to
-multiplication by $k^{-1}$ modulo $n$.
-
-From these two simple facts the following simple algorithm can be derived.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction}. \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
-\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $1$ to $k$ do \\
-\hspace{3mm}1.1  If $x$ is odd then \\
-\hspace{6mm}1.1.1  $x \leftarrow x + n$ \\
-\hspace{3mm}1.2  $x \leftarrow x/2$ \\
-2.  Return $x$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction}
-\end{figure}
-
-The algorithm reduces the input one bit at a time using the two congruencies stated previously.  Inside the loop $n$, which is odd, is
-added to $x$ if $x$ is odd.  This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two.  Since
-$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$.  Let $r$ represent the
-final result of the Montgomery algorithm.  If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to
-$0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction is required to get the residue desired.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|l|}
-\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
-\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
-\hline $2$ & $x/2 = 1453$ \\
-\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
-\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
-\hline $5$ & $x/2 = 278$ \\
-\hline $6$ & $x/2 = 139$ \\
-\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
-\hline $8$ & $x/2 = 99$ \\
-\hline $9$ & $x + n = 356$, $x/2 = 178$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example of Montgomery Reduction (I)}
-\label{fig:MONT1}
-\end{figure}
-
-Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 9$ (note $2^k = 512$ which is larger than $n$).  The result of
-the algorithm $r = 178$ is congruent to the value of $2^{-9} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^9$ modulo $257$ the correct residue
-$r \equiv 158$ is produced.
-
-Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
-and $k^2$ single precision additions.  At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.
-Fortunately there exists an alternative representation of the algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ ($2^k > n$) \\
-\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $1$ to $k$ do \\
-\hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
-\hspace{6mm}1.1.1  $x \leftarrow x + 2^{t-1}n$ \\
-2.  Return $x/2^k$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction (modified I)}
-\end{figure}
-
-This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2.  The number of single
-precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|l|r|}
-\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} & \textbf{Result ($x$) in Binary} \\
-\hline -- & $5555$ & $1010110110011$ \\
-\hline $1$ & $x + 2^{0}n = 5812$ &  $1011010110100$ \\
-\hline $2$ & $5812$ & $1011010110100$ \\
-\hline $3$ & $x + 2^{2}n = 6840$ & $1101010111000$ \\
-\hline $4$ & $x + 2^{3}n = 8896$ & $10001011000000$ \\
-\hline $5$ & $8896$ & $10001011000000$ \\
-\hline $6$ & $8896$ & $10001011000000$ \\
-\hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
-\hline $8$ & $25344$ & $110001100000000$ \\
-\hline $9$ & $x + 2^{8}n = 91136$ & $10110010000000000$ \\
-\hline -- & $x/2^k = 178$ & \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example of Montgomery Reduction (II)}
-\label{fig:MONT2}
-\end{figure}
-
-Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 9$.
-With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the
-loop.  Note that for the iterations $t = 2, 5, 6$ and $8$ the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is
-zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.
-
-\subsection{Digit Based Montgomery Reduction}
-Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis.  Consider the
-previous algorithm re-written to compute the Montgomery reduction in this new fashion.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ ($\beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1  $x \leftarrow x + \mu n \beta^t$ \\
-2.  Return $x/\beta^k$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction (modified II)}
-\end{figure}
-
-The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue.  If the first digit of
-the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit.  This
-problem breaks down to solving the following congruency.
-
-\begin{center}
-\begin{tabular}{rcl}
-$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
-$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
-$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
-\end{tabular}
-\end{center}
-
-In each iteration of the loop on step 1 a new value of $\mu$ must be calculated.  The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used
-extensively in this algorithm and should be precomputed.  Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.
-
-For example, let $\beta = 10$ represent the radix.  Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$.  Let $x = 33$
-represent the value to reduce.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
-\hline --                 & $33$ & --\\
-\hline $0$                 & $33 + \mu n = 50$ & $1$ \\
-\hline $1$                 & $50 + \mu n \beta = 900$ & $5$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Montgomery Reduction}
-\end{figure}
-
-The final result $900$ is then divided by $\beta^k$ to produce the final result $9$.  The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$
-which implies the result is not the modular residue of $x$ modulo $n$.  However, recall that the residue is actually multiplied by $\beta^{-k}$ in
-the algorithm.  To get the true residue the value must be multiplied by $\beta^k$.  In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
-the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.
-
-\subsection{Baseline Montgomery Reduction}
-The baseline Montgomery reduction algorithm will produce the residue for any size input.  It is designed to be a catch-all algorithm for
-Montgomery reductions.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
-\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  $digs \leftarrow 2n.used + 1$ \\
-2.  If $digs < MP\_WARRAY$ and $n.used < \delta$ then \\
-\hspace{3mm}2.1  Use algorithm fast\_mp\_montgomery\_reduce instead. \\
-\\
-Setup $x$ for the reduction. \\
-3.  If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
-4.  $x.used \leftarrow digs$ \\
-\\
-Eliminate the lower $k$ digits. \\
-5.  For $ix$ from $0$ to $k - 1$ do \\
-\hspace{3mm}5.1  $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}5.2  $u \leftarrow 0$ \\
-\hspace{3mm}5.3  For $iy$ from $0$ to $k - 1$ do \\
-\hspace{6mm}5.3.1  $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
-\hspace{6mm}5.3.2  $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}5.3.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}5.4  While $u > 0$ do \\
-\hspace{6mm}5.4.1  $iy \leftarrow iy + 1$ \\
-\hspace{6mm}5.4.2  $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
-\hspace{6mm}5.4.3  $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
-\hspace{6mm}5.4.4  $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
-\\
-Divide by $\beta^k$ and fix up as required. \\
-6.  $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
-7.  If $x \ge n$ then \\
-\hspace{3mm}7.1  $x \leftarrow x - n$ \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_montgomery\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_montgomery\_reduce.}
-This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm.  The algorithm is loosely based
-on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop.  The
-restrictions on this algorithm are fairly easy to adapt to.  First $0 \le x < n^2$ bounds the input to numbers in the same range as
-for the Barrett algorithm.  Additionally if $n > 1$ and $n$ is odd there will exist a modular inverse $\rho$.  $\rho$ must be calculated in
-advance of this algorithm.  Finally the variable $k$ is fixed and a pseudonym for $n.used$.
-
-Step 2 decides whether a faster Montgomery algorithm can be used.  It is based on the Comba technique meaning that there are limits on
-the size of the input.  This algorithm is discussed in sub-section 6.3.3.
-
-Step 5 is the main reduction loop of the algorithm.  The value of $\mu$ is calculated once per iteration in the outer loop.  The inner loop
-calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits.  Both the addition and
-multiplication are performed in the same loop to save time and memory.  Step 5.4 will handle any additional carries that escape the inner loop.
-
-Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications
-in the inner loop.  In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision
-multiplications.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_reduce.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* computes xR**-1 == x (mod N) via Montgomery Reduction */
-018   int
-019   mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-020   \{
-021     int     ix, res, digs;
-022     mp_digit mu;
-023
-024     /* can the fast reduction [comba] method be used?
-025      *
-026      * Note that unlike in mul you're safely allowed *less*
-027      * than the available columns [255 per default] since carries
-028      * are fixed up in the inner loop.
-029      */
-030     digs = (n->used * 2) + 1;
-031     if ((digs < MP_WARRAY) &&
-032         (n->used <
-033         (1 << ((CHAR_BIT * sizeof(mp_word)) - (2 * DIGIT_BIT))))) \{
-034       return fast_mp_montgomery_reduce (x, n, rho);
-035     \}
-036
-037     /* grow the input as required */
-038     if (x->alloc < digs) \{
-039       if ((res = mp_grow (x, digs)) != MP_OKAY) \{
-040         return res;
-041       \}
-042     \}
-043     x->used = digs;
-044
-045     for (ix = 0; ix < n->used; ix++) \{
-046       /* mu = ai * rho mod b
-047        *
-048        * The value of rho must be precalculated via
-049        * montgomery_setup() such that
-050        * it equals -1/n0 mod b this allows the
-051        * following inner loop to reduce the
-052        * input one digit at a time
-053        */
-054       mu = (mp_digit) (((mp_word)x->dp[ix] * (mp_word)rho) & MP_MASK);
-055
-056       /* a = a + mu * m * b**i */
-057       \{
-058         int iy;
-059         mp_digit *tmpn, *tmpx, u;
-060         mp_word r;
-061
-062         /* alias for digits of the modulus */
-063         tmpn = n->dp;
-064
-065         /* alias for the digits of x [the input] */
-066         tmpx = x->dp + ix;
-067
-068         /* set the carry to zero */
-069         u = 0;
-070
-071         /* Multiply and add in place */
-072         for (iy = 0; iy < n->used; iy++) \{
-073           /* compute product and sum */
-074           r       = ((mp_word)mu * (mp_word)*tmpn++) +
-075                      (mp_word) u + (mp_word) *tmpx;
-076
-077           /* get carry */
-078           u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-079
-080           /* fix digit */
-081           *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
-082         \}
-083         /* At this point the ix'th digit of x should be zero */
-084
-085
-086         /* propagate carries upwards as required*/
-087         while (u != 0) \{
-088           *tmpx   += u;
-089           u        = *tmpx >> DIGIT_BIT;
-090           *tmpx++ &= MP_MASK;
-091         \}
-092       \}
-093     \}
-094
-095     /* at this point the n.used'th least
-096      * significant digits of x are all zero
-097      * which means we can shift x to the
-098      * right by n.used digits and the
-099      * residue is unchanged.
-100      */
-101
-102     /* x = x/b**n.used */
-103     mp_clamp(x);
-104     mp_rshd (x, n->used);
-105
-106     /* if x >= n then x = x - n */
-107     if (mp_cmp_mag (x, n) != MP_LT) \{
-108       return s_mp_sub (x, n, x);
-109     \}
-110
-111     return MP_OKAY;
-112   \}
-113   #endif
-114
-\end{alltt}
-\end{small}
-
-This is the baseline implementation of the Montgomery reduction algorithm.  Lines 30 to 35 determine if the Comba based
-routine can be used instead.  Line 54 computes the value of $\mu$ for that particular iteration of the outer loop.
-
-The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
-the alias $tmpn$ refers to the modulus $n$.
-
-\subsection{Faster ``Comba'' Montgomery Reduction}
-
-The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial
-nature of the inner loop.  The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba
-technique.  The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates
-a $k \times 1$ product $k$ times.
-
-The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$.  This means the
-carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit.  The solution as it turns out is very simple.
-Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry.
-
-With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
-the speed of the algorithm.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
-\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
-1.  if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
-Copy the digits of $x$ into the array $\hat W$ \\
-2.  For $ix$ from $0$ to $x.used - 1$ do \\
-\hspace{3mm}2.1  $\hat W_{ix} \leftarrow x_{ix}$ \\
-3.  For $ix$ from $x.used$ to $2n.used - 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-Eliminate the lower $k$ digits. \\
-4.  for $ix$ from $0$ to $n.used - 1$ do \\
-\hspace{3mm}4.1  $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}4.2  For $iy$ from $0$ to $n.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
-\hspace{3mm}4.3  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
-Propagate carries upwards. \\
-5.  for $ix$ from $n.used$ to $2n.used + 1$ do \\
-\hspace{3mm}5.1  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
-Shift right and reduce modulo $\beta$ simultaneously. \\
-6.  for $ix$ from $0$ to $n.used + 1$ do \\
-\hspace{3mm}6.1  $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
-Zero excess digits and fixup $x$. \\
-7.  if $x.used > n.used + 1$ then do \\
-\hspace{3mm}7.1  for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
-\hspace{6mm}7.1.1  $x_{ix} \leftarrow 0$ \\
-8.  $x.used \leftarrow n.used + 1$ \\
-9.  Clamp excessive digits of $x$. \\
-10.  If $x \ge n$ then \\
-\hspace{3mm}10.1  $x \leftarrow x - n$ \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_mp\_montgomery\_reduce}
-\end{figure}
-
-\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
-This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique.  It is on most computer platforms significantly
-faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}).  The algorithm has the same restrictions
-on the input as the baseline reduction algorithm.  An additional two restrictions are imposed on this algorithm.  The number of digits $k$ in the
-modulus $n$ must not violate $MP\_WARRAY > 2k + 1$ and $k < \delta$.   When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
-a modulus of at most $3,556$ bits in length.
-
-As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product.  It is initially filled with the
-contents of $x$ with the excess digits zeroed.  The reduction loop is very similar to the baseline loop at heart.  The multiplication on step
-4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$.  Some multipliers such
-as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce.  By performing
-a single precision multiplication instead half the amount of time is spent.
-
-Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work.  That is what step
-4.3 will do.  In effect over the $n.used$ iterations of the outer loop the lower $n.used$ columns all have their carries propagated forwards.  Note
-how the upper bits of those same words are not reduced modulo $\beta$.  This is because those values will be discarded shortly and there is no
-point.
-
-Step 5 will propagate the remainder of the carries upwards.  On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
-stored in the destination $x$.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_fast\_mp\_montgomery\_reduce.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* computes xR**-1 == x (mod N) via Montgomery Reduction
-018    *
-019    * This is an optimized implementation of montgomery_reduce
-020    * which uses the comba method to quickly calculate the columns of the
-021    * reduction.
-022    *
-023    * Based on Algorithm 14.32 on pp.601 of HAC.
-024   */
-025   int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-026   \{
-027     int     ix, res, olduse;
-028     mp_word W[MP_WARRAY];
-029
-030     /* get old used count */
-031     olduse = x->used;
-032
-033     /* grow a as required */
-034     if (x->alloc < (n->used + 1)) \{
-035       if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{
-036         return res;
-037       \}
-038     \}
-039
-040     /* first we have to get the digits of the input into
-041      * an array of double precision words W[...]
-042      */
-043     \{
-044       mp_word *_W;
-045       mp_digit *tmpx;
-046
-047       /* alias for the W[] array */
-048       _W   = W;
-049
-050       /* alias for the digits of  x*/
-051       tmpx = x->dp;
-052
-053       /* copy the digits of a into W[0..a->used-1] */
-054       for (ix = 0; ix < x->used; ix++) \{
-055         *_W++ = *tmpx++;
-056       \}
-057
-058       /* zero the high words of W[a->used..m->used*2] */
-059       for (; ix < ((n->used * 2) + 1); ix++) \{
-060         *_W++ = 0;
-061       \}
-062     \}
-063
-064     /* now we proceed to zero successive digits
-065      * from the least significant upwards
-066      */
-067     for (ix = 0; ix < n->used; ix++) \{
-068       /* mu = ai * m' mod b
-069        *
-070        * We avoid a double precision multiplication (which isn't required)
-071        * by casting the value down to a mp_digit.  Note this requires
-072        * that W[ix-1] have  the carry cleared (see after the inner loop)
-073        */
-074       mp_digit mu;
-075       mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
-076
-077       /* a = a + mu * m * b**i
-078        *
-079        * This is computed in place and on the fly.  The multiplication
-080        * by b**i is handled by offseting which columns the results
-081        * are added to.
-082        *
-083        * Note the comba method normally doesn't handle carries in the
-084        * inner loop In this case we fix the carry from the previous
-085        * column since the Montgomery reduction requires digits of the
-086        * result (so far) [see above] to work.  This is
-087        * handled by fixing up one carry after the inner loop.  The
-088        * carry fixups are done in order so after these loops the
-089        * first m->used words of W[] have the carries fixed
-090        */
-091       \{
-092         int iy;
-093         mp_digit *tmpn;
-094         mp_word *_W;
-095
-096         /* alias for the digits of the modulus */
-097         tmpn = n->dp;
-098
-099         /* Alias for the columns set by an offset of ix */
-100         _W = W + ix;
-101
-102         /* inner loop */
-103         for (iy = 0; iy < n->used; iy++) \{
-104             *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
-105         \}
-106       \}
-107
-108       /* now fix carry for next digit, W[ix+1] */
-109       W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
-110     \}
-111
-112     /* now we have to propagate the carries and
-113      * shift the words downward [all those least
-114      * significant digits we zeroed].
-115      */
-116     \{
-117       mp_digit *tmpx;
-118       mp_word *_W, *_W1;
-119
-120       /* nox fix rest of carries */
-121
-122       /* alias for current word */
-123       _W1 = W + ix;
-124
-125       /* alias for next word, where the carry goes */
-126       _W = W + ++ix;
-127
-128       for (; ix <= ((n->used * 2) + 1); ix++) \{
-129         *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
-130       \}
-131
-132       /* copy out, A = A/b**n
-133        *
-134        * The result is A/b**n but instead of converting from an
-135        * array of mp_word to mp_digit than calling mp_rshd
-136        * we just copy them in the right order
-137        */
-138
-139       /* alias for destination word */
-140       tmpx = x->dp;
-141
-142       /* alias for shifted double precision result */
-143       _W = W + n->used;
-144
-145       for (ix = 0; ix < (n->used + 1); ix++) \{
-146         *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
-147       \}
-148
-149       /* zero oldused digits, if the input a was larger than
-150        * m->used+1 we'll have to clear the digits
-151        */
-152       for (; ix < olduse; ix++) \{
-153         *tmpx++ = 0;
-154       \}
-155     \}
-156
-157     /* set the max used and clamp */
-158     x->used = n->used + 1;
-159     mp_clamp (x);
-160
-161     /* if A >= m then A = A - m */
-162     if (mp_cmp_mag (x, n) != MP_LT) \{
-163       return s_mp_sub (x, n, x);
-164     \}
-165     return MP_OKAY;
-166   \}
-167   #endif
-168
-\end{alltt}
-\end{small}
-
-The $\hat W$ array is first filled with digits of $x$ on line 54 then the rest of the digits are zeroed on line 59.  Both loops share
-the same alias variables to make the code easier to read.
-
-The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
-forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line 109 fixes the carry
-for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
-
-The for loop on line 128 propagates the rest of the carries upwards through the columns.  The for loop on line 145 reduces the columns
-modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
-digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.
-
-\subsection{Montgomery Setup}
-To calculate the variable $\rho$ a relatively simple algorithm will be required.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
-\textbf{Input}.   mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
-\textbf{Output}.  $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
-\hline \\
-1.  $b \leftarrow n_0$ \\
-2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
-4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
-\hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
-5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
-6.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_montgomery\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_montgomery\_setup.}
-This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms.  It uses a very interesting trick
-to calculate $1/n_0$ when $\beta$ is a power of two.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_setup.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* setups the montgomery reduction stuff */
-018   int
-019   mp_montgomery_setup (mp_int * n, mp_digit * rho)
-020   \{
-021     mp_digit x, b;
-022
-023   /* fast inversion mod 2**k
-024    *
-025    * Based on the fact that
-026    *
-027    * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
-028    *                    =>  2*X*A - X*X*A*A = 1
-029    *                    =>  2*(1) - (1)     = 1
-030    */
-031     b = n->dp[0];
-032
-033     if ((b & 1) == 0) \{
-034       return MP_VAL;
-035     \}
-036
-037     x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-038     x *= 2 - (b * x);             /* here x*a==1 mod 2**8 */
-039   #if !defined(MP_8BIT)
-040     x *= 2 - (b * x);             /* here x*a==1 mod 2**16 */
-041   #endif
-042   #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
-043     x *= 2 - (b * x);             /* here x*a==1 mod 2**32 */
-044   #endif
-045   #ifdef MP_64BIT
-046     x *= 2 - (b * x);             /* here x*a==1 mod 2**64 */
-047   #endif
-048
-049     /* rho = -1/m mod b */
-050     *rho = (mp_digit)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
-051
-052     return MP_OKAY;
-053   \}
-054   #endif
-055
-\end{alltt}
-\end{small}
-
-This source code computes the value of $\rho$ required to perform Montgomery reduction.  It has been modified to avoid performing excess
-multiplications when $\beta$ is not the default 28-bits.
-
-\section{The Diminished Radix Algorithm}
-The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
-or Montgomery methods for certain forms of moduli.  The technique is based on the following simple congruence.
-
-\begin{equation}
-(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
-\end{equation}
-
-This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive.  It used the fact that if $n = 2^{31}$ and $k=1$
-then an x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift.  The proof
-of the above equation is very simple.  First write $x$ in the product form.
-
-\begin{equation}
-x = qn + r
-\end{equation}
-
-Now reduce both sides modulo $(n - k)$.
-
-\begin{equation}
-x \equiv qk + r  \mbox{ (mod }(n-k)\mbox{)}
-\end{equation}
-
-The variable $n$ reduces modulo $n - k$ to $k$.  By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$
-into the equation the original congruence is reproduced, thus concluding the proof.  The following algorithm is based on this observation.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Diminished Radix Reduction}. \\
-\textbf{Input}.   Integer $x$, $n$, $k$ \\
-\textbf{Output}.  $x \mbox{ mod } (n - k)$ \\
-\hline \\
-1.  $q \leftarrow \lfloor x / n \rfloor$ \\
-2.  $q \leftarrow k \cdot q$ \\
-3.  $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
-4.  $x \leftarrow x + q$ \\
-5.  If $x \ge (n - k)$ then \\
-\hspace{3mm}5.1  $x \leftarrow x - (n - k)$ \\
-\hspace{3mm}5.2  Goto step 1. \\
-6.  Return $x$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Diminished Radix Reduction}
-\label{fig:DR}
-\end{figure}
-
-This algorithm will reduce $x$ modulo $n - k$ and return the residue.  If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
-once or twice and occasionally three times.  For simplicity sake the value of $x$ is bounded by the following simple polynomial.
-
-\begin{equation}
-0 \le x < n^2 + k^2 - 2nk
-\end{equation}
-
-The true bound is  $0 \le x < (n - k - 1)^2$ but this has quite a few more terms.  The value of $q$ after step 1 is bounded by the following.
-
-\begin{equation}
-q < n - 2k - k^2/n
-\end{equation}
-
-Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero.  The value of $x$ after step 3 is bounded trivially as
-$0 \le x < n$.  By step four the sum $x + q$ is bounded by
-
-\begin{equation}
-0 \le q + x < (k + 1)n - 2k^2 - 1
-\end{equation}
-
-With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3.  After the second pass it is highly unlikely that the
-sum in step 4 will exceed $n - k$.  In practice fewer than three passes of the algorithm are required to reduce virtually every input in the
-range $0 \le x < (n - k - 1)^2$.
-
-\begin{figure}
-\begin{small}
-\begin{center}
-\begin{tabular}{|l|}
-\hline
-$x = 123456789, n = 256, k = 3$ \\
-\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
-$q \leftarrow q*k = 1446759$ \\
-$x \leftarrow x \mbox{ mod } n = 21$ \\
-$x \leftarrow x + q = 1446780$ \\
-$x \leftarrow x - (n - k) = 1446527$ \\
-\hline
-$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
-$q \leftarrow q*k = 16950$ \\
-$x \leftarrow x \mbox{ mod } n = 127$ \\
-$x \leftarrow x + q = 17077$ \\
-$x \leftarrow x - (n - k) = 16824$ \\
-\hline
-$q \leftarrow \lfloor x/n \rfloor = 65$ \\
-$q \leftarrow q*k = 195$ \\
-$x \leftarrow x \mbox{ mod } n = 184$ \\
-$x \leftarrow x + q = 379$ \\
-$x \leftarrow x - (n - k) = 126$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example Diminished Radix Reduction}
-\label{fig:EXDR}
-\end{figure}
-
-Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$.  Note that even while $x$
-is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast.  In this case only
-three passes were required to find the residue $x \equiv 126$.
-
-
-\subsection{Choice of Moduli}
-On the surface this algorithm looks like a very expensive algorithm.  It requires a couple of subtractions followed by multiplication and other
-modular reductions.  The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
-
-Division in general is a very expensive operation to perform.  The one exception is when the division is by a power of the radix of representation used.
-Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right.  Similarly division
-by two (\textit{or powers of two}) is very simple for binary computers to perform.  It would therefore seem logical to choose $n$ of the form $2^p$
-which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.
-
-However, there is one operation related to division of power of twos that is even faster than this.  If $n = \beta^p$ then the division may be
-performed by moving whole digits to the right $p$ places.  In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.
-Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ merely requires zeroing the digits above the $p-1$'th digit of $x$.
-
-Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted
-modulus'' will refer to a modulus of the form $2^p - k$.  The word ``restricted'' in this case refers to the fact that it is based on the
-$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.
-
-\subsection{Choice of $k$}
-Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
-in step 2 is the most expensive operation.  Fortunately the choice of $k$ is not terribly limited.  For all intents and purposes it might
-as well be a single digit.  The smaller the value of $k$ is the faster the algorithm will be.
-
-\subsection{Restricted Diminished Radix Reduction}
-The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$.  This algorithm can reduce
-an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}.  The implementation
-of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition
-of $x$ and $q$.  The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular
-exponentiations are performed.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
-\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
-\textbf{Output}.  $x \mbox{ mod } n$ \\
-\hline \\
-1.  $m \leftarrow n.used$ \\
-2.  If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
-3.  $\mu \leftarrow 0$ \\
-4.  for $i$ from $0$ to $m - 1$ do \\
-\hspace{3mm}4.1  $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
-\hspace{3mm}4.2  $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}4.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-5.  $x_{m} \leftarrow \mu$ \\
-6.  for $i$ from $m + 1$ to $x.used - 1$ do \\
-\hspace{3mm}6.1  $x_{i} \leftarrow 0$ \\
-7.  Clamp excess digits of $x$. \\
-8.  If $x \ge n$ then \\
-\hspace{3mm}8.1  $x \leftarrow x - n$ \\
-\hspace{3mm}8.2  Goto step 3. \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_dr\_reduce.}
-This algorithm will perform the Diminished Radix reduction of $x$ modulo $n$.  It has similar restrictions to that of the Barrett reduction
-with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$.
-
-This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization.  The division by $\beta^m$, multiplication by $k$
-and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.  The division by $\beta^m$ is emulated by accessing
-the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position.  After the loop the $m$'th
-digit is set to the carry and the upper digits are zeroed.  Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happened to
-$x$ before the addition of the multiple of the upper half.
-
-At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required.  First $n$ is subtracted from $x$ and then the algorithm resumes
-at step 3.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_reduce.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
-018    *
-019    * Based on algorithm from the paper
-020    *
-021    * "Generating Efficient Primes for Discrete Log Cryptosystems"
-022    *                 Chae Hoon Lim, Pil Joong Lee,
-023    *          POSTECH Information Research Laboratories
-024    *
-025    * The modulus must be of a special format [see manual]
-026    *
-027    * Has been modified to use algorithm 7.10 from the LTM book instead
-028    *
-029    * Input x must be in the range 0 <= x <= (n-1)**2
-030    */
-031   int
-032   mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
-033   \{
-034     int      err, i, m;
-035     mp_word  r;
-036     mp_digit mu, *tmpx1, *tmpx2;
-037
-038     /* m = digits in modulus */
-039     m = n->used;
-040
-041     /* ensure that "x" has at least 2m digits */
-042     if (x->alloc < (m + m)) \{
-043       if ((err = mp_grow (x, m + m)) != MP_OKAY) \{
-044         return err;
-045       \}
-046     \}
-047
-048   /* top of loop, this is where the code resumes if
-049    * another reduction pass is required.
-050    */
-051   top:
-052     /* aliases for digits */
-053     /* alias for lower half of x */
-054     tmpx1 = x->dp;
-055
-056     /* alias for upper half of x, or x/B**m */
-057     tmpx2 = x->dp + m;
-058
-059     /* set carry to zero */
-060     mu = 0;
-061
-062     /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
-063     for (i = 0; i < m; i++) \{
-064         r         = (((mp_word)*tmpx2++) * (mp_word)k) + *tmpx1 + mu;
-065         *tmpx1++  = (mp_digit)(r & MP_MASK);
-066         mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
-067     \}
-068
-069     /* set final carry */
-070     *tmpx1++ = mu;
-071
-072     /* zero words above m */
-073     for (i = m + 1; i < x->used; i++) \{
-074         *tmpx1++ = 0;
-075     \}
-076
-077     /* clamp, sub and return */
-078     mp_clamp (x);
-079
-080     /* if x >= n then subtract and reduce again
-081      * Each successive "recursion" makes the input smaller and smaller.
-082      */
-083     if (mp_cmp_mag (x, n) != MP_LT) \{
-084       if ((err = s_mp_sub(x, n, x)) != MP_OKAY) \{
-085         return err;
-086       \}
-087       goto top;
-088     \}
-089     return MP_OKAY;
-090   \}
-091   #endif
-092
-\end{alltt}
-\end{small}
-
-The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line 51 is where
-the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function however, the size of
-the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.
-
-The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
-a division by $\beta^m$ can be simulated virtually for free.  The loop on line 63 performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
-in this algorithm.
-
-By line 70 the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line 73 the
-same pointer will point to the $m+1$'th digit where the zeroes will be placed.
-
-Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.
-With the same logic at line 84 the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
-as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
-does not need to be checked.
-
-\subsubsection{Setup}
-To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required.  This algorithm is not really complicated but provided for
-completeness.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_setup}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $k = \beta - n_0$ \\
-\hline \\
-1.  $k \leftarrow \beta - n_0$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_setup}
-\end{figure}
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_setup.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* determines the setup value */
-018   void mp_dr_setup(mp_int *a, mp_digit *d)
-019   \{
-020      /* the casts are required if DIGIT_BIT is one less than
-021       * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
-022       */
-023      *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
-024           ((mp_word)a->dp[0]));
-025   \}
-026
-027   #endif
-028
-\end{alltt}
-\end{small}
-
-\subsubsection{Modulus Detection}
-Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus.  An integer is said to be
-of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $1$ if $n$ is in D.R form, $0$ otherwise \\
-\hline
-1.  If $n.used < 2$ then return($0$). \\
-2.  for $ix$ from $1$ to $n.used - 1$ do \\
-\hspace{3mm}2.1  If $n_{ix} \ne \beta - 1$ return($0$). \\
-3.  Return($1$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_is\_modulus}
-\end{figure}
-
-\textbf{Algorithm mp\_dr\_is\_modulus.}
-This algorithm determines if a value is in Diminished Radix form.  Step 1 rejects obvious cases where fewer than two digits are
-in the mp\_int.  Step 2 tests all but the first digit to see if they are equal to $\beta - 1$.  If the algorithm manages to get to
-step 3 then $n$ must be of Diminished Radix form.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_is\_modulus.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* determines if a number is a valid DR modulus */
-018   int mp_dr_is_modulus(mp_int *a)
-019   \{
-020      int ix;
-021
-022      /* must be at least two digits */
-023      if (a->used < 2) \{
-024         return 0;
-025      \}
-026
-027      /* must be of the form b**k - a [a <= b] so all
-028       * but the first digit must be equal to -1 (mod b).
-029       */
-030      for (ix = 1; ix < a->used; ix++) \{
-031          if (a->dp[ix] != MP_MASK) \{
-032             return 0;
-033          \}
-034      \}
-035      return 1;
-036   \}
-037
-038   #endif
-039
-\end{alltt}
-\end{small}
-
-\subsection{Unrestricted Diminished Radix Reduction}
-The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.  This algorithm
-is a straightforward adaptation of algorithm~\ref{fig:DR}.
-
-In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead.  However, this new
-algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_2k}. \\
-\textbf{Input}.   mp\_int $a$ and $n$.  mp\_digit $k$  \\
-\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
-\textbf{Output}.  $a \mbox{ (mod }n\mbox{)}$ \\
-\hline
-1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-2.  While $a \ge n$ do \\
-\hspace{3mm}2.1  $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
-\hspace{3mm}2.2  $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-\hspace{3mm}2.3  $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
-\hspace{3mm}2.4  $a \leftarrow a + q$ (\textit{s\_mp\_add}) \\
-\hspace{3mm}2.5  If $a \ge n$ then do \\
-\hspace{6mm}2.5.1  $a \leftarrow a - n$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_2k}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_2k.}
-This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$.  Division by $2^p$ is emulated with a right
-shift which makes the algorithm fairly inexpensive to use.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* reduces a modulo n where n is of the form 2**p - d */
-018   int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
-019   \{
-020      mp_int q;
-021      int    p, res;
-022
-023      if ((res = mp_init(&q)) != MP_OKAY) \{
-024         return res;
-025      \}
-026
-027      p = mp_count_bits(n);
-028   top:
-029      /* q = a/2**p, a = a mod 2**p */
-030      if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{
-031         goto ERR;
-032      \}
-033
-034      if (d != 1) \{
-035         /* q = q * d */
-036         if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) \{
-037            goto ERR;
-038         \}
-039      \}
-040
-041      /* a = a + q */
-042      if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{
-043         goto ERR;
-044      \}
-045
-046      if (mp_cmp_mag(a, n) != MP_LT) \{
-047         if ((res = s_mp_sub(a, n, a)) != MP_OKAY) \{
-048            goto ERR;
-049         \}
-050         goto top;
-051      \}
-052
-053   ERR:
-054      mp_clear(&q);
-055      return res;
-056   \}
-057
-058   #endif
-059
-\end{alltt}
-\end{small}
-
-The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
-on line 30 calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
-is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
-any multiplications.
-
-The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are
-positive.  By using the unsigned versions the overhead is kept to a minimum.
-
-\subsubsection{Unrestricted Setup}
-To setup this reduction algorithm the value of $k = 2^p - n$ is required.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
-\textbf{Input}.   mp\_int $n$   \\
-\textbf{Output}.  $k = 2^p - n$ \\
-\hline
-1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-2.  $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
-3.  $x \leftarrow x - n$ (\textit{mp\_sub}) \\
-4.  $k \leftarrow x_0$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_2k\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_2k\_setup.}
-This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k.  By making a temporary variable $x$ equal to $2^p$ a subtraction
-is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k\_setup.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* determines the setup value */
-018   int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
-019   \{
-020      int res, p;
-021      mp_int tmp;
-022
-023      if ((res = mp_init(&tmp)) != MP_OKAY) \{
-024         return res;
-025      \}
-026
-027      p = mp_count_bits(a);
-028      if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{
-029         mp_clear(&tmp);
-030         return res;
-031      \}
-032
-033      if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{
-034         mp_clear(&tmp);
-035         return res;
-036      \}
-037
-038      *d = tmp.dp[0];
-039      mp_clear(&tmp);
-040      return MP_OKAY;
-041   \}
-042   #endif
-043
-\end{alltt}
-\end{small}
-
-\subsubsection{Unrestricted Detection}
-An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true.
-
-\begin{enumerate}
-\item  The number has only one digit.
-\item  The number has more than one digit and every bit from the $lg(\beta)$'th to the most significant is one.
-\end{enumerate}
-
-If either condition is true then there is a power of two $2^p$ such that $0 < 2^p - n < \beta$.   If the input is only
-one digit then it will always be of the correct form.  Otherwise all of the bits above the first digit must be one.  This arises from the fact
-that there will be a value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most
-significant bit.  The resulting sum will be a power of two.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
-\textbf{Input}.   mp\_int $n$   \\
-\textbf{Output}.  $1$ if of proper form, $0$ otherwise \\
-\hline
-1.  If $n.used = 0$ then return($0$). \\
-2.  If $n.used = 1$ then return($1$). \\
-3.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-4.  for $x$ from $lg(\beta)$ to $p$ do \\
-\hspace{3mm}4.1  If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$'th digit of $n$ is zero then return($0$). \\
-5.  Return($1$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_is\_2k}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_is\_2k.}
-This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_is\_2k.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* determines if mp_reduce_2k can be used */
-018   int mp_reduce_is_2k(mp_int *a)
-019   \{
-020      int ix, iy, iw;
-021      mp_digit iz;
-022
-023      if (a->used == 0) \{
-024         return MP_NO;
-025      \} else if (a->used == 1) \{
-026         return MP_YES;
-027      \} else if (a->used > 1) \{
-028         iy = mp_count_bits(a);
-029         iz = 1;
-030         iw = 1;
-031
-032         /* Test every bit from the second digit up, must be 1 */
-033         for (ix = DIGIT_BIT; ix < iy; ix++) \{
-034             if ((a->dp[iw] & iz) == 0) \{
-035                return MP_NO;
-036             \}
-037             iz <<= 1;
-038             if (iz > (mp_digit)MP_MASK) \{
-039                ++iw;
-040                iz = 1;
-041             \}
-042         \}
-043      \}
-044      return MP_YES;
-045   \}
-046
-047   #endif
-048
-\end{alltt}
-\end{small}
-
-
-
-\section{Algorithm Comparison}
-So far three very different algorithms for modular reduction have been discussed.  Each of the algorithms have their own strengths and weaknesses
-that makes having such a selection very useful.  The following table summarizes the three algorithms along with comparisons of work factors.  Since
-all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.
-
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
-\hline Barrett    & $m^2 + 2m - 1$ & None              & $79$ & $1087$ & $4223$ \\
-\hline Montgomery & $m^2 + m$      & $n$ must be odd   & $72$ & $1056$ & $4160$ \\
-\hline D.R.       & $2m$           & $n = \beta^m - k$ & $16$ & $64$   & $128$  \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-
-In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete.  However, in practice since Montgomery
-reduction can be written as a single function with the Comba technique it is much faster.  Barrett reduction suffers from the overhead of
-calling the half precision multipliers, addition and division by $\beta$ algorithms.
-
-For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice.  The one set of algorithms where Diminished Radix reduction truly
-shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}.  In these algorithms
-primes of the form $\beta^m - k$ can be found and shared amongst users.  These primes will allow the Diminished Radix algorithm to be used in
-modular exponentiation to greatly speed up the operation.
-
-
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
-                     & calculates the correct value of $\rho$. \\
-                     & \\
-$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly.  \\
-                     & \\
-$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
-                     & (\textit{figure~\ref{fig:DR}}) terminates.  Also prove the probability that it will \\
-                     & terminate within $1 \le k \le 10$ iterations. \\
-                     & \\
-\end{tabular}
-
-
-\chapter{Exponentiation}
-Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$.  A variant of exponentiation, computed
-in a finite field or ring, is called modular exponentiation.  This latter style of operation is typically used in public key
-cryptosystems such as RSA and Diffie-Hellman.  The ability to quickly compute modular exponentiations is of great benefit to any
-such cryptosystem and many methods have been sought to speed it up.
-
-\section{Exponentiation Basics}
-A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired.  However, as $b$ grows in size
-the number of multiplications becomes prohibitive.  Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
-with a $1024$-bit key.  Such a calculation could never be completed as it would take simply far too long.
-
-Fortunately there is a very simple algorithm based on the laws of exponents.  Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
-are two trivial relationships between the base and the exponent.  Let $b_i$ represent the $i$'th bit of $b$ starting from the least
-significant bit.  If $b$ is a $k$-bit integer then the following equation is true.
-
-\begin{equation}
-a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
-\end{equation}
-
-By taking the base $a$ logarithm of both sides of the equation the following equation is the result.
-
-\begin{equation}
-b = \sum_{i=0}^{k-1}2^i \cdot b_i
-\end{equation}
-
-The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
-$a^{2^{i+1}}$.  This observation forms the basis of essentially all fast exponentiation algorithms.  It requires $k$ squarings and on average
-$k \over 2$ multiplications to compute the result.  This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
-
-While this current method is a considerable speed up there are further improvements to be made.  For example, the $a^{2^i}$ term does not need to
-be computed in an auxiliary variable.  Consider the following equivalent algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Left to Right Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$ and $k$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $k - 1$ to $0$ do \\
-\hspace{3mm}2.1  $c \leftarrow c^2$ \\
-\hspace{3mm}2.2  $c \leftarrow c \cdot a^{b_i}$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Left to Right Exponentiation}
-\label{fig:LTOR}
-\end{figure}
-
-This algorithm starts from the most significant bit and works towards the least significant bit.  When the $i$'th bit of $b$ is set $a$ is
-multiplied against the current product.  In each iteration the product is squared which doubles the exponent of the individual terms of the
-product.
-
-For example, let $b = 101100_2 \equiv 44_{10}$.  The following chart demonstrates the actions of the algorithm.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|}
-\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
-\hline - & $1$ \\
-\hline $5$ & $a$ \\
-\hline $4$ & $a^2$ \\
-\hline $3$ & $a^4 \cdot a$ \\
-\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
-\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
-\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Left to Right Exponentiation}
-\end{figure}
-
-When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal to $a^{44}$ which is the desired exponentiation.  This particular algorithm is
-called ``Left to Right'' because it reads the exponent in that order.  All of the exponentiation algorithms that will be presented are of this nature.
-
-\subsection{Single Digit Exponentiation}
-The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended
-to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of
-$b$ that are greater than three.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_expt\_d}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
-2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
-3.  for $x$ from 1 to $lg(\beta)$ do \\
-\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
-\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
-\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
-\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
-4.  Clear $g$. \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_expt\_d}
-\end{figure}
-
-\textbf{Algorithm mp\_expt\_d.}
-This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
-quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the
-exponent is a fixed width.
-
-A copy of $a$ is made first to allow the destination variable $c$ to be the same as the source variable $a$.  The result is set to the initial value of
-$1$ in the subsequent step.
-
-Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
-on step 3.1.  In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$.  The value
-of $b$ is shifted left one bit to make the next bit down from the most significant bit the new most significant bit.  In effect each
-iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_expt\_d\_ex.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* calculate c = a**b  using a square-multiply algorithm */
-018   int mp_expt_d_ex (mp_int * a, mp_digit b, mp_int * c, int fast)
-019   \{
-020     int     res;
-021     unsigned int x;
-022
-023     mp_int  g;
-024
-025     if ((res = mp_init_copy (&g, a)) != MP_OKAY) \{
-026       return res;
-027     \}
-028
-029     /* set initial result */
-030     mp_set (c, 1);
-031
-032     if (fast != 0) \{
-033       while (b > 0) \{
-034         /* if the bit is set multiply */
-035         if ((b & 1) != 0) \{
-036           if ((res = mp_mul (c, &g, c)) != MP_OKAY) \{
-037             mp_clear (&g);
-038             return res;
-039           \}
-040         \}
-041
-042         /* square */
-043         if (b > 1) \{
-044           if ((res = mp_sqr (&g, &g)) != MP_OKAY) \{
-045             mp_clear (&g);
-046             return res;
-047           \}
-048         \}
-049
-050         /* shift to next bit */
-051         b >>= 1;
-052       \}
-053     \}
-054     else \{
-055       for (x = 0; x < DIGIT_BIT; x++) \{
-056         /* square */
-057         if ((res = mp_sqr (c, c)) != MP_OKAY) \{
-058           mp_clear (&g);
-059           return res;
-060         \}
-061
-062         /* if the bit is set multiply */
-063         if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) \{
-064           if ((res = mp_mul (c, &g, c)) != MP_OKAY) \{
-065              mp_clear (&g);
-066              return res;
-067           \}
-068         \}
-069
-070         /* shift to next bit */
-071         b <<= 1;
-072       \}
-073     \} /* if ... else */
-074
-075     mp_clear (&g);
-076     return MP_OKAY;
-077   \}
-078   #endif
-079
-\end{alltt}
-\end{small}
-
-This describes only the algorithm that is used when the parameter $fast$ is $0$.  Line 30 sets the initial value of the result to $1$.  Next the loop on line 55 steps through each bit of the exponent starting from
-the most significant down towards the least significant. The invariant squaring operation placed on line 57 is performed first.  After
-the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set.  The shift on line
-71 moves all of the bits of the exponent upwards towards the most significant location.
-
-\section{$k$-ary Exponentiation}
-When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
-slower than squaring.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
-the $i$'th $k$-bit digit of the exponent of $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
-computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is it is a small window on only a
-portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $t - 1$ to $0$ do \\
-\hspace{3mm}2.1  $c \leftarrow c^{2^k} $ \\
-\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
-\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{$k$-ary Exponentiation}
-\label{fig:KARY}
-\end{figure}
-
-The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
-precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
-$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.
-However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
-
-Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation.  The
-original algorithm would on average have required $200$ multiplications and $400$ squarings to compute the same value.  The total number of squarings
-has increased slightly but the number of multiplications has nearly halved.
-
-\subsection{Optimal Values of $k$}
-An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
-approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
-for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|}
-\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\
-\hline $16$ & $2$ & $27$ & $24$ \\
-\hline $32$ & $3$ & $49$ & $48$ \\
-\hline $64$ & $3$ & $92$ & $96$ \\
-\hline $128$ & $4$ & $175$ & $192$ \\
-\hline $256$ & $4$ & $335$ & $384$ \\
-\hline $512$ & $5$ & $645$ & $768$ \\
-\hline $1024$ & $6$ & $1257$ & $1536$ \\
-\hline $2048$ & $6$ & $2452$ & $3072$ \\
-\hline $4096$ & $7$ & $4808$ & $6144$ \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
-\label{fig:OPTK}
-\end{figure}
-
-\subsection{Sliding-Window Exponentiation}
-A simple modification to the previous algorithm is to only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$.  Essentially
-this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the
-algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.
-
-Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm {\ref{fig:KARY}}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\
-\hline $16$ & $3$ & $24$ & $27$ \\
-\hline $32$ & $3$ & $45$ & $49$ \\
-\hline $64$ & $4$ & $87$ & $92$ \\
-\hline $128$ & $4$ & $167$ & $175$ \\
-\hline $256$ & $5$ & $322$ & $335$ \\
-\hline $512$ & $6$ & $628$ & $645$ \\
-\hline $1024$ & $6$ & $1225$ & $1257$ \\
-\hline $2048$ & $7$ & $2403$ & $2452$ \\
-\hline $4096$ & $8$ & $4735$ & $4808$ \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
-\label{fig:OPTK2}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $t - 1$ to $0$ do \\
-\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
-\hspace{6mm}2.1.1   $c \leftarrow c^2$ \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
-\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\
-\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
-\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Sliding Window $k$-ary Exponentiation}
-\end{figure}
-
-Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
-algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
-the size of the previous table.
-
-Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as
-the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the
-exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The single $0$ digits in the second representation are where
-a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$
-squarings.  The second method requires $8$ multiplications and $18$ squarings.
-
-In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.
-
-\section{Modular Exponentiation}
-
-Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing
-$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it
-modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.
-
-This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
-one of the algorithms presented in chapter six.
-
-Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first.  This algorithm
-will allow the exponent $b$ to be negative which is computed as $d \equiv \left (1 / a \right )^{\vert b \vert} \mbox{ (mod }c\mbox{)}$. The
-value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see \ref{sec;modinv}}).  If no inverse exists the algorithm
-terminates with an error.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_exptmod}. \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-1.  If $p.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
-2.  If $x.sign = MP\_NEG$ then \\
-\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }p\mbox{)}$ \\
-\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
-\hspace{3mm}2.3  Compute $y \equiv g'^{x'} \mbox{ (mod }p\mbox{)}$ via recursion. \\
-3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
-\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
-4.  else \\
-\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_exptmod}
-\end{figure}
-
-\textbf{Algorithm mp\_exptmod.}
-The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm
-which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation
-except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are clumped in the same exponentiation
-algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_exptmod.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017
-018   /* this is a shell function that calls either the normal or Montgomery
-019    * exptmod functions.  Originally the call to the montgomery code was
-020    * embedded in the normal function but that wasted alot of stack space
-021    * for nothing (since 99% of the time the Montgomery code would be called)
-022    */
-023   int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-024   \{
-025     int dr;
-026
-027     /* modulus P must be positive */
-028     if (P->sign == MP_NEG) \{
-029        return MP_VAL;
-030     \}
-031
-032     /* if exponent X is negative we have to recurse */
-033     if (X->sign == MP_NEG) \{
-034   #ifdef BN_MP_INVMOD_C
-035        mp_int tmpG, tmpX;
-036        int err;
-037
-038        /* first compute 1/G mod P */
-039        if ((err = mp_init(&tmpG)) != MP_OKAY) \{
-040           return err;
-041        \}
-042        if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) \{
-043           mp_clear(&tmpG);
-044           return err;
-045        \}
-046
-047        /* now get |X| */
-048        if ((err = mp_init(&tmpX)) != MP_OKAY) \{
-049           mp_clear(&tmpG);
-050           return err;
-051        \}
-052        if ((err = mp_abs(X, &tmpX)) != MP_OKAY) \{
-053           mp_clear_multi(&tmpG, &tmpX, NULL);
-054           return err;
-055        \}
-056
-057        /* and now compute (1/G)**|X| instead of G**X [X < 0] */
-058        err = mp_exptmod(&tmpG, &tmpX, P, Y);
-059        mp_clear_multi(&tmpG, &tmpX, NULL);
-060        return err;
-061   #else
-062        /* no invmod */
-063        return MP_VAL;
-064   #endif
-065     \}
-066
-067   /* modified diminished radix reduction */
-068   #if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C) && defin
-      ed(BN_S_MP_EXPTMOD_C)
-069     if (mp_reduce_is_2k_l(P) == MP_YES) \{
-070        return s_mp_exptmod(G, X, P, Y, 1);
-071     \}
-072   #endif
-073
-074   #ifdef BN_MP_DR_IS_MODULUS_C
-075     /* is it a DR modulus? */
-076     dr = mp_dr_is_modulus(P);
-077   #else
-078     /* default to no */
-079     dr = 0;
-080   #endif
-081
-082   #ifdef BN_MP_REDUCE_IS_2K_C
-083     /* if not, is it a unrestricted DR modulus? */
-084     if (dr == 0) \{
-085        dr = mp_reduce_is_2k(P) << 1;
-086     \}
-087   #endif
-088
-089     /* if the modulus is odd or dr != 0 use the montgomery method */
-090   #ifdef BN_MP_EXPTMOD_FAST_C
-091     if ((mp_isodd (P) == MP_YES) || (dr !=  0)) \{
-092       return mp_exptmod_fast (G, X, P, Y, dr);
-093     \} else \{
-094   #endif
-095   #ifdef BN_S_MP_EXPTMOD_C
-096       /* otherwise use the generic Barrett reduction technique */
-097       return s_mp_exptmod (G, X, P, Y, 0);
-098   #else
-099       /* no exptmod for evens */
-100       return MP_VAL;
-101   #endif
-102   #ifdef BN_MP_EXPTMOD_FAST_C
-103     \}
-104   #endif
-105   \}
-106
-107   #endif
-108
-\end{alltt}
-\end{small}
-
-In order to keep the algorithms in a known state the first step on line 28 is to reject any negative modulus as input.  If the exponent is
-negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
-the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recurse with these new values with a positive
-exponent.
-
-If the exponent is positive the algorithm resumes the exponentiation.  Line 76 determines if the modulus is of the restricted Diminished Radix
-form.  If it is not line 85 attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
-of three values.
-
-\begin{enumerate}
-\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
-\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
-\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
-\end{enumerate}
-
-Line 91 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
-the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.
-
-\subsection{Barrett Modular Exponentiation}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_exptmod}. \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-1.  $k \leftarrow lg(x)$ \\
-2.  $winsize \leftarrow  \left \lbrace \begin{array}{ll}
-                              2 &  \mbox{if }k \le 7 \\
-                              3 &  \mbox{if }7 < k \le 36 \\
-                              4 &  \mbox{if }36 < k \le 140 \\
-                              5 &  \mbox{if }140 < k \le 450 \\
-                              6 &  \mbox{if }450 < k \le 1303 \\
-                              7 &  \mbox{if }1303 < k \le 3529 \\
-                              8 &  \mbox{if }3529 < k \\
-                              \end{array} \right .$ \\
-3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
-4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
-5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
-\\
-Setup the table of small powers of $g$.  First find $g^{2^{winsize - 1}}$ and then the successive higher powers up to $g^{2^{winsize} - 1}$. \\
-6.  $k \leftarrow 2^{winsize - 1}$ \\
-7.  $M_{k} \leftarrow M_1$ \\
-8.  for $ix$ from 0 to $winsize - 2$ do \\
-\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr})  \\
-\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
-9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
-\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
-\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
-10.  $res \leftarrow 1$ \\
-\\
-Start Sliding Window. \\
-11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
-12.  Loop \\
-\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
-\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
-\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
-\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
-\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
-\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
-Continued on next page. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_exptmod}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
-\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
-\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
-\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
-\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
-\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}12.6.3  Goto step 12. \\
-\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
-\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
-\hspace{3mm}12.9  $mode \leftarrow 2$ \\
-\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
-\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
-\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize -1$ do \\
-\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
-\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
-\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}Reset the window. \\
-\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
-\\
-No more windows left.  Check for residual bits of exponent. \\
-13.  If $mode = 2$ and $bitcpy > 0$ then do \\
-\hspace{3mm}13.1  for $ix$ from $0$ to $bitcpy - 1$ do \\
-\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
-\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
-\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
-\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
-\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-14.  $y \leftarrow res$ \\
-15.  Clear $res$, $mu$ and the $M$ array. \\
-16.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_exptmod (continued)}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_exptmod.}
-This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
-algorithm to keep the product small throughout the algorithm.
-
-The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the
-larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
-table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.
-
-After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
-the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 1$
-times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
-
-Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
-\begin{enumerate}
-\item The variable $mode$ dictates how the bits of the exponent are interpreted.
-\begin{enumerate}
-   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply
-         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.
-   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits
-         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.
-   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
-         downwards.
-\end{enumerate}
-\item The variable $bitcnt$ indicates how many bits are left in the current digit of the exponent left to be read.  When it reaches zero a new digit
-      is fetched from the exponent.
-\item The variable $buf$ holds the currently read digit of the exponent.
-\item The variable $digidx$ is an index into the exponent's digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
-\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
-      the appropriate operations performed.
-\item The variable $bitbuf$ holds the current bits of the window being formed.
-\end{enumerate}
-
-All of step 12 is the window processing loop.  It will iterate while there are digits available from the exponent to read.  The first step
-inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
-read and if there are no digits left then the loop terminates.
-
-After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
-upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to
-trailing edges the entire exponent is read from most significant bit to least significant bit.
-
-At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the
-algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Step 12.6 and 12.7-10 handle
-the two cases of $mode = 1$ and $mode = 2$ respectively.
-
-\begin{center}
-\begin{figure}[here]
-\includegraphics{pics/expt_state.ps}
-\caption{Sliding Window State Diagram}
-\label{pic:expt_state}
-\end{figure}
-\end{center}
-
-By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then
-a Left-to-Right algorithm is used to process the remaining few bits.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c
-\vspace{-3mm}
-\begin{alltt}
-016   #ifdef MP_LOW_MEM
-017      #define TAB_SIZE 32
-018   #else
-019      #define TAB_SIZE 256
-020   #endif
-021
-022   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmod
-      e)
-023   \{
-024     mp_int  M[TAB_SIZE], res, mu;
-025     mp_digit buf;
-026     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-027     int (*redux)(mp_int*,mp_int*,mp_int*);
-028
-029     /* find window size */
-030     x = mp_count_bits (X);
-031     if (x <= 7) \{
-032       winsize = 2;
-033     \} else if (x <= 36) \{
-034       winsize = 3;
-035     \} else if (x <= 140) \{
-036       winsize = 4;
-037     \} else if (x <= 450) \{
-038       winsize = 5;
-039     \} else if (x <= 1303) \{
-040       winsize = 6;
-041     \} else if (x <= 3529) \{
-042       winsize = 7;
-043     \} else \{
-044       winsize = 8;
-045     \}
-046
-047   #ifdef MP_LOW_MEM
-048       if (winsize > 5) \{
-049          winsize = 5;
-050       \}
-051   #endif
-052
-053     /* init M array */
-054     /* init first cell */
-055     if ((err = mp_init(&M[1])) != MP_OKAY) \{
-056        return err;
-057     \}
-058
-059     /* now init the second half of the array */
-060     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-061       if ((err = mp_init(&M[x])) != MP_OKAY) \{
-062         for (y = 1<<(winsize-1); y < x; y++) \{
-063           mp_clear (&M[y]);
-064         \}
-065         mp_clear(&M[1]);
-066         return err;
-067       \}
-068     \}
-069
-070     /* create mu, used for Barrett reduction */
-071     if ((err = mp_init (&mu)) != MP_OKAY) \{
-072       goto LBL_M;
-073     \}
-074
-075     if (redmode == 0) \{
-076        if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
-077           goto LBL_MU;
-078        \}
-079        redux = mp_reduce;
-080     \} else \{
-081        if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) \{
-082           goto LBL_MU;
-083        \}
-084        redux = mp_reduce_2k_l;
-085     \}
-086
-087     /* create M table
-088      *
-089      * The M table contains powers of the base,
-090      * e.g. M[x] = G**x mod P
-091      *
-092      * The first half of the table is not
-093      * computed though accept for M[0] and M[1]
-094      */
-095     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
-096       goto LBL_MU;
-097     \}
-098
-099     /* compute the value at M[1<<(winsize-1)] by squaring
-100      * M[1] (winsize-1) times
-101      */
-102     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
-103       goto LBL_MU;
-104     \}
-105
-106     for (x = 0; x < (winsize - 1); x++) \{
-107       /* square it */
-108       if ((err = mp_sqr (&M[1 << (winsize - 1)],
-109                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
-110         goto LBL_MU;
-111       \}
-112
-113       /* reduce modulo P */
-114       if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
-115         goto LBL_MU;
-116       \}
-117     \}
-118
-119     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-120      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-121      */
-122     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
-123       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
-124         goto LBL_MU;
-125       \}
-126       if ((err = redux (&M[x], P, &mu)) != MP_OKAY) \{
-127         goto LBL_MU;
-128       \}
-129     \}
-130
-131     /* setup result */
-132     if ((err = mp_init (&res)) != MP_OKAY) \{
-133       goto LBL_MU;
-134     \}
-135     mp_set (&res, 1);
-136
-137     /* set initial mode and bit cnt */
-138     mode   = 0;
-139     bitcnt = 1;
-140     buf    = 0;
-141     digidx = X->used - 1;
-142     bitcpy = 0;
-143     bitbuf = 0;
-144
-145     for (;;) \{
-146       /* grab next digit as required */
-147       if (--bitcnt == 0) \{
-148         /* if digidx == -1 we are out of digits */
-149         if (digidx == -1) \{
-150           break;
-151         \}
-152         /* read next digit and reset the bitcnt */
-153         buf    = X->dp[digidx--];
-154         bitcnt = (int) DIGIT_BIT;
-155       \}
-156
-157       /* grab the next msb from the exponent */
-158       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-159       buf <<= (mp_digit)1;
-160
-161       /* if the bit is zero and mode == 0 then we ignore it
-162        * These represent the leading zero bits before the first 1 bit
-163        * in the exponent.  Technically this opt is not required but it
-164        * does lower the # of trivial squaring/reductions used
-165        */
-166       if ((mode == 0) && (y == 0)) \{
-167         continue;
-168       \}
-169
-170       /* if the bit is zero and mode == 1 then we square */
-171       if ((mode == 1) && (y == 0)) \{
-172         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-173           goto LBL_RES;
-174         \}
-175         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-176           goto LBL_RES;
-177         \}
-178         continue;
-179       \}
-180
-181       /* else we add it to the window */
-182       bitbuf |= (y << (winsize - ++bitcpy));
-183       mode    = 2;
-184
-185       if (bitcpy == winsize) \{
-186         /* ok window is filled so square as required and multiply  */
-187         /* square first */
-188         for (x = 0; x < winsize; x++) \{
-189           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-190             goto LBL_RES;
-191           \}
-192           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-193             goto LBL_RES;
-194           \}
-195         \}
-196
-197         /* then multiply */
-198         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
-199           goto LBL_RES;
-200         \}
-201         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-202           goto LBL_RES;
-203         \}
-204
-205         /* empty window and reset */
-206         bitcpy = 0;
-207         bitbuf = 0;
-208         mode   = 1;
-209       \}
-210     \}
-211
-212     /* if bits remain then square/multiply */
-213     if ((mode == 2) && (bitcpy > 0)) \{
-214       /* square then multiply if the bit is set */
-215       for (x = 0; x < bitcpy; x++) \{
-216         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-217           goto LBL_RES;
-218         \}
-219         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-220           goto LBL_RES;
-221         \}
-222
-223         bitbuf <<= 1;
-224         if ((bitbuf & (1 << winsize)) != 0) \{
-225           /* then multiply */
-226           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
-227             goto LBL_RES;
-228           \}
-229           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
-230             goto LBL_RES;
-231           \}
-232         \}
-233       \}
-234     \}
-235
-236     mp_exch (&res, Y);
-237     err = MP_OKAY;
-238   LBL_RES:mp_clear (&res);
-239   LBL_MU:mp_clear (&mu);
-240   LBL_M:
-241     mp_clear(&M[1]);
-242     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
-243       mp_clear (&M[x]);
-244     \}
-245     return err;
-246   \}
-247   #endif
-248
-\end{alltt}
-\end{small}
-
-Lines 31 through 45 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
-from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement
-on line 37 the value of $x$ is already known to be greater than $140$.
-
-The conditional piece of code beginning on line 47 allows the window size to be restricted to five bits.  This logic is used to ensure
-the table of precomputed powers of $G$ remains relatively small.
-
-The for loop on line 60 initializes the $M$ array while lines 71 and 76 through 85 initialize the reduction
-function that will be used for this modulus.
-
--- More later.
-
-\section{Quick Power of Two}
-Calculating $a = 2^b$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
-equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_2expt}. \\
-\textbf{Input}.   integer $b$ \\
-\textbf{Output}.  $a \leftarrow 2^b$ \\
-\hline \\
-1.  $a \leftarrow 0$ \\
-2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
-3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
-4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_2expt}
-\end{figure}
-
-\textbf{Algorithm mp\_2expt.}
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_2expt.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* computes a = 2**b
-018    *
-019    * Simple algorithm which zeroes the int, grows it then just sets one bit
-020    * as required.
-021    */
-022   int
-023   mp_2expt (mp_int * a, int b)
-024   \{
-025     int     res;
-026
-027     /* zero a as per default */
-028     mp_zero (a);
-029
-030     /* grow a to accomodate the single bit */
-031     if ((res = mp_grow (a, (b / DIGIT_BIT) + 1)) != MP_OKAY) \{
-032       return res;
-033     \}
-034
-035     /* set the used count of where the bit will go */
-036     a->used = (b / DIGIT_BIT) + 1;
-037
-038     /* put the single bit in its place */
-039     a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
-040
-041     return MP_OKAY;
-042   \}
-043   #endif
-044
-\end{alltt}
-\end{small}
-
-\chapter{Higher Level Algorithms}
-
-This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
-routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.
-
-The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
-for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations.
-These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate
-various representations of integers.  For example, converting from an mp\_int to a string of characters.
-
-\section{Integer Division with Remainder}
-\label{sec:division}
-
-Integer division aside from modular exponentiation is the most intensive algorithm to compute.  Like addition, subtraction and multiplication
-the basis of this algorithm is the long-hand division algorithm taught to school children.  Throughout this discussion several common variables
-will be used.  Let $x$ represent the divisor and $y$ represent the dividend.  Let $q$ represent the integer quotient $\lfloor y / x \rfloor$ and
-let $r$ represent the remainder $r = y - x \lfloor y / x \rfloor$.  The following simple algorithm will be used to start the discussion.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Radix-$\beta$ Integer Division}. \\
-\textbf{Input}.   integer $x$ and $y$ \\
-\textbf{Output}.  $q = \lfloor y/x\rfloor, r = y - xq$ \\
-\hline \\
-1.  $q \leftarrow 0$ \\
-2.  $n \leftarrow \vert \vert y \vert \vert - \vert \vert x \vert \vert$ \\
-3.  for $t$ from $n$ down to $0$ do \\
-\hspace{3mm}3.1  Maximize $k$ such that $kx\beta^t$ is less than or equal to $y$ and $(k + 1)x\beta^t$ is greater. \\
-\hspace{3mm}3.2  $q \leftarrow q + k\beta^t$ \\
-\hspace{3mm}3.3  $y \leftarrow y - kx\beta^t$ \\
-4.  $r \leftarrow y$ \\
-5.  Return($q, r$) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Radix-$\beta$ Integer Division}
-\label{fig:raddiv}
-\end{figure}
-
-As children we are taught this very simple algorithm for the case of $\beta = 10$.  Almost instinctively several optimizations are taught for which
-the reasons are never explained.  For this example let $y = 5471$ represent the dividend and $x = 23$ represent the divisor.
-
-To find the first digit of the quotient the value of $k$ must be maximized such that $kx\beta^t$ is less than or equal to $y$ and
-simultaneously $(k + 1)x\beta^t$ is greater than $y$.  Implicitly $k$ is the maximum value the $t$'th digit of the quotient may have.  The habitual method
-used to find the maximum is to ``eyeball'' the two numbers, typically only the leading digits and quickly estimate a quotient.  By only using leading
-digits a much simpler division may be used to form an educated guess at what the value must be.  In this case $k = \lfloor 54/23\rfloor = 2$ quickly
-arises as a possible  solution.  Indeed $2x\beta^2 = 4600$ is less than $y = 5471$ and simultaneously $(k + 1)x\beta^2 = 6900$ is larger than $y$.
-As a  result $k\beta^2$ is added to the quotient which now equals $q = 200$ and $4600$ is subtracted from $y$ to give a remainder of $y = 841$.
-
-Again this process is repeated to produce the quotient digit $k = 3$ which makes the quotient $q = 200 + 3\beta = 230$ and the remainder
-$y = 841 - 3x\beta = 181$.  Finally the last iteration of the loop produces $k = 7$ which leads to the quotient $q = 230 + 7 = 237$ and the
-remainder $y = 181 - 7x = 20$.  The final quotient and remainder found are $q = 237$ and $r = y = 20$ which are indeed correct since
-$237 \cdot 23 + 20 = 5471$ is true.
-
-\subsection{Quotient Estimation}
-\label{sec:divest}
-As alluded to earlier the quotient digit $k$ can be estimated from only the leading digits of both the divisor and dividend.  When $p$ leading
-digits are used from both the divisor and dividend to form an estimation the accuracy of the estimation rises as $p$ grows.  Technically
-speaking the estimation is based on assuming the lower $\vert \vert y \vert \vert - p$ and $\vert \vert x \vert \vert - p$ digits of the
-dividend and divisor are zero.
-
-The value of the estimation may be off by a few values in either direction and in general is fairly correct.  A simplification \cite[pp. 271]{TAOCPV2}
-of the estimation technique is to use $t + 1$ digits of the dividend and $t$ digits of the divisor, in particular when $t = 1$.  The estimate
-using this technique is never too small.  For the following proof let $t = \vert \vert y \vert \vert - 1$ and $s = \vert \vert x \vert \vert - 1$
-represent the most significant digits of the dividend and divisor respectively.
-
-\textbf{Proof.}\textit{  The quotient $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ is greater than or equal to
-$k = \lfloor y / (x \cdot \beta^{\vert \vert y \vert \vert - \vert \vert x \vert \vert - 1}) \rfloor$. }
-The first obvious case is when $\hat k = \beta - 1$ in which case the proof is concluded since the real quotient cannot be larger.  For all other
-cases $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ and $\hat k x_s \ge y_t\beta + y_{t-1} - x_s + 1$.  The latter portion of the inequality
-$-x_s + 1$ arises from the fact that a truncated integer division will give the same quotient for at most $x_s - 1$ values.  Next a series of
-inequalities will prove the hypothesis.
-
-\begin{equation}
-y - \hat k x \le y - \hat k x_s\beta^s
-\end{equation}
-
-This is trivially true since $x \ge x_s\beta^s$.  Next we replace $\hat kx_s\beta^s$ by the previous inequality for $\hat kx_s$.
-
-\begin{equation}
-y - \hat k x \le y_t\beta^t + \ldots + y_0 - (y_t\beta^t + y_{t-1}\beta^{t-1} - x_s\beta^s + \beta^s)
-\end{equation}
-
-By simplifying the previous inequality the following inequality is formed.
-
-\begin{equation}
-y - \hat k x \le y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s
-\end{equation}
-
-Subsequently,
-
-\begin{equation}
-y_{t-2}\beta^{t-2} + \ldots +  y_0  + x_s\beta^s - \beta^s < x_s\beta^s \le x
-\end{equation}
-
-Which proves that $y - \hat kx \le x$ and by consequence $\hat k \ge k$ which concludes the proof.  \textbf{QED}
-
-
-\subsection{Normalized Integers}
-For the purposes of division a normalized input is when the divisor's leading digit $x_n$ is greater than or equal to $\beta / 2$.  By multiplying both
-$x$ and $y$ by $j = \lfloor (\beta / 2) / x_n \rfloor$ the quotient remains unchanged and the remainder is simply $j$ times the original
-remainder.  The purpose of normalization is to ensure the leading digit of the divisor is sufficiently large such that the estimated quotient will
-lie in the domain of a single digit.  Consider the maximum dividend $(\beta - 1) \cdot \beta + (\beta - 1)$ and the minimum divisor $\beta / 2$.
-
-\begin{equation}
-{{\beta^2 - 1} \over { \beta / 2}} \le 2\beta - {2 \over \beta}
-\end{equation}
-
-At most the quotient approaches $2\beta$, however, in practice this will not occur since that would imply the previous quotient digit was too small.
-
-\subsection{Radix-$\beta$ Division with Remainder}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div}. \\
-\textbf{Input}.   mp\_int $a, b$ \\
-\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
-\hline \\
-1.  If $b = 0$ return(\textit{MP\_VAL}). \\
-2.  If $\vert a \vert < \vert b \vert$ then do \\
-\hspace{3mm}2.1  $d \leftarrow a$ \\
-\hspace{3mm}2.2  $c \leftarrow 0$ \\
-\hspace{3mm}2.3  Return(\textit{MP\_OKAY}). \\
-\\
-Setup the quotient to receive the digits. \\
-3.  Grow $q$ to $a.used + 2$ digits. \\
-4.  $q \leftarrow 0$ \\
-5.  $x \leftarrow \vert a \vert , y \leftarrow \vert b \vert$ \\
-6.  $sign \leftarrow  \left \lbrace \begin{array}{ll}
-                              MP\_ZPOS &  \mbox{if }a.sign = b.sign \\
-                              MP\_NEG  &  \mbox{otherwise} \\
-                              \end{array} \right .$ \\
-\\
-Normalize the inputs such that the leading digit of $y$ is greater than or equal to $\beta / 2$. \\
-7.  $norm \leftarrow (lg(\beta) - 1) - (\lceil lg(y) \rceil \mbox{ (mod }lg(\beta)\mbox{)})$ \\
-8.  $x \leftarrow x \cdot 2^{norm}, y \leftarrow y \cdot 2^{norm}$ \\
-\\
-Find the leading digit of the quotient. \\
-9.  $n \leftarrow x.used - 1, t \leftarrow y.used - 1$ \\
-10.  $y \leftarrow y \cdot \beta^{n - t}$ \\
-11.  While ($x \ge y$) do \\
-\hspace{3mm}11.1  $q_{n - t} \leftarrow q_{n - t} + 1$ \\
-\hspace{3mm}11.2  $x \leftarrow x - y$ \\
-12.  $y \leftarrow \lfloor y / \beta^{n-t} \rfloor$ \\
-\\
-Continued on the next page. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div} (continued). \\
-\textbf{Input}.   mp\_int $a, b$ \\
-\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
-\hline \\
-Now find the remainder of the digits. \\
-13.  for $i$ from $n$ down to $(t + 1)$ do \\
-\hspace{3mm}13.1  If $i > x.used$ then jump to the next iteration of this loop. \\
-\hspace{3mm}13.2  If $x_{i} = y_{t}$ then \\
-\hspace{6mm}13.2.1  $q_{i - t - 1} \leftarrow \beta - 1$ \\
-\hspace{3mm}13.3  else \\
-\hspace{6mm}13.3.1  $\hat r \leftarrow x_{i} \cdot \beta + x_{i - 1}$ \\
-\hspace{6mm}13.3.2  $\hat r \leftarrow \lfloor \hat r / y_{t} \rfloor$ \\
-\hspace{6mm}13.3.3  $q_{i - t - 1} \leftarrow \hat r$ \\
-\hspace{3mm}13.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} + 1$ \\
-\\
-Fixup quotient estimation. \\
-\hspace{3mm}13.5  Loop \\
-\hspace{6mm}13.5.1  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
-\hspace{6mm}13.5.2  t$1 \leftarrow 0$ \\
-\hspace{6mm}13.5.3  t$1_0 \leftarrow y_{t - 1}, $ t$1_1 \leftarrow y_t,$ t$1.used \leftarrow 2$ \\
-\hspace{6mm}13.5.4  $t1 \leftarrow t1 \cdot q_{i - t - 1}$ \\
-\hspace{6mm}13.5.5  t$2_0 \leftarrow x_{i - 2}, $ t$2_1 \leftarrow x_{i - 1}, $ t$2_2 \leftarrow x_i, $ t$2.used \leftarrow 3$ \\
-\hspace{6mm}13.5.6  If $\vert t1 \vert > \vert t2 \vert$ then goto step 13.5. \\
-\hspace{3mm}13.6  t$1 \leftarrow y \cdot q_{i - t - 1}$ \\
-\hspace{3mm}13.7  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
-\hspace{3mm}13.8  $x \leftarrow x - $ t$1$ \\
-\hspace{3mm}13.9  If $x.sign = MP\_NEG$ then \\
-\hspace{6mm}13.10  t$1 \leftarrow y$ \\
-\hspace{6mm}13.11  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
-\hspace{6mm}13.12  $x \leftarrow x + $ t$1$ \\
-\hspace{6mm}13.13  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
-\\
-Finalize the result. \\
-14.  Clamp excess digits of $q$ \\
-15.  $c \leftarrow q, c.sign \leftarrow sign$ \\
-16.  $x.sign \leftarrow a.sign$ \\
-17.  $d \leftarrow \lfloor x / 2^{norm} \rfloor$ \\
-18.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div (continued)}
-\end{figure}
-\textbf{Algorithm mp\_div.}
-This algorithm will calculate quotient and remainder from an integer division given a dividend and divisor.  The algorithm is a signed
-division and will produce a fully qualified quotient and remainder.
-
-First the divisor $b$ must be non-zero which is enforced in step one.  If the divisor is larger than the dividend then the quotient is implicitly
-zero and the remainder is the dividend.
-
-After the first two trivial cases of inputs are handled the variable $q$ is setup to receive the digits of the quotient.  Two unsigned copies of the
-divisor $y$ and dividend $x$ are made as well.  The core of the division algorithm is an unsigned division and will only work if the values are
-positive.  Now the two values $x$ and $y$ must be normalized such that the leading digit of $y$ is greater than or equal to $\beta / 2$.
-This is performed by shifting both to the left by enough bits to get the desired normalization.
-
-At this point the division algorithm can begin producing digits of the quotient.  Recall that the maximum value of the estimation used is
-$2\beta - {2 \over \beta}$ which means that a digit of the quotient must be first produced by another means.  In this case $y$ is shifted
-to the left (\textit{step ten}) so that it has the same number of digits as $x$.  The loop on step eleven will subtract multiples of the
-shifted copy of $y$ until $x$ is smaller.  Since the leading digit of $y$ is greater than or equal to $\beta/2$ this loop will iterate at most two
-times to produce the desired leading digit of the quotient.
-
-Now the remainder of the digits can be produced.  The equation $\hat q = \lfloor {{x_i \beta + x_{i-1}}\over y_t} \rfloor$ is used to fairly
-accurately approximate the true quotient digit.  The estimation can in theory produce an estimation as high as $2\beta - {2 \over \beta}$ but by
-induction the upper quotient digit is correct (\textit{as established on step eleven}) and the estimate must be less than $\beta$.
-
-Recall from section~\ref{sec:divest} that the estimation is never too low but may be too high.  The next step of the estimation process is
-to refine the estimation.  The loop on step 13.5 uses $x_i\beta^2 + x_{i-1}\beta + x_{i-2}$ and $q_{i - t - 1}(y_t\beta + y_{t-1})$ as a higher
-order approximation to adjust the quotient digit.
-
-After both phases of estimation the quotient digit may still be off by a value of one\footnote{This is similar to the error introduced
-by optimizing Barrett reduction.}.  Steps 13.6 and 13.7 subtract the multiple of the divisor from the dividend (\textit{Similar to step 3.3 of
-algorithm~\ref{fig:raddiv}}) and then subsequently add a multiple of the divisor if the quotient was too large.
-
-Now that the quotient has been determined, finalizing the result is a matter of clamping the quotient, fixing the sizes and de-normalizing the
-remainder.  An important aspect of this algorithm seemingly overlooked in other descriptions such as that of Algorithm 14.20 HAC \cite[pp. 598]{HAC}
-is that when the estimations are being made (\textit{inside the loop on step 13.5}) that the digits $y_{t-1}$, $x_{i-2}$ and $x_{i-1}$ may lie
-outside their respective boundaries.  For example, if $t = 0$ or $i \le 1$ then the digits would be undefined.  In those cases the digits should
-respectively be replaced with a zero.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_div.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   #ifdef BN_MP_DIV_SMALL
-018
-019   /* slower bit-bang division... also smaller */
-020   int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-021   \{
-022      mp_int ta, tb, tq, q;
-023      int    res, n, n2;
-024
-025     /* is divisor zero ? */
-026     if (mp_iszero (b) == MP_YES) \{
-027       return MP_VAL;
-028     \}
-029
-030     /* if a < b then q=0, r = a */
-031     if (mp_cmp_mag (a, b) == MP_LT) \{
-032       if (d != NULL) \{
-033         res = mp_copy (a, d);
-034       \} else \{
-035         res = MP_OKAY;
-036       \}
-037       if (c != NULL) \{
-038         mp_zero (c);
-039       \}
-040       return res;
-041     \}
-042
-043     /* init our temps */
-044     if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL)) != MP_OKAY) \{
-045        return res;
-046     \}
-047
-048
-049     mp_set(&tq, 1);
-050     n = mp_count_bits(a) - mp_count_bits(b);
-051     if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
-052         ((res = mp_abs(b, &tb)) != MP_OKAY) ||
-053         ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
-054         ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) \{
-055         goto LBL_ERR;
-056     \}
-057
-058     while (n-- >= 0) \{
-059        if (mp_cmp(&tb, &ta) != MP_GT) \{
-060           if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
-061               ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) \{
-062              goto LBL_ERR;
-063           \}
-064        \}
-065        if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
-066            ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) \{
-067              goto LBL_ERR;
-068        \}
-069     \}
-070
-071     /* now q == quotient and ta == remainder */
-072     n  = a->sign;
-073     n2 = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-074     if (c != NULL) \{
-075        mp_exch(c, &q);
-076        c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
-077     \}
-078     if (d != NULL) \{
-079        mp_exch(d, &ta);
-080        d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
-081     \}
-082   LBL_ERR:
-083      mp_clear_multi(&ta, &tb, &tq, &q, NULL);
-084      return res;
-085   \}
-086
-087   #else
-088
-089   /* integer signed division.
-090    * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
-091    * HAC pp.598 Algorithm 14.20
-092    *
-093    * Note that the description in HAC is horribly
-094    * incomplete.  For example, it doesn't consider
-095    * the case where digits are removed from 'x' in
-096    * the inner loop.  It also doesn't consider the
-097    * case that y has fewer than three digits, etc..
-098    *
-099    * The overall algorithm is as described as
-100    * 14.20 from HAC but fixed to treat these cases.
-101   */
-102   int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-103   \{
-104     mp_int  q, x, y, t1, t2;
-105     int     res, n, t, i, norm, neg;
-106
-107     /* is divisor zero ? */
-108     if (mp_iszero (b) == MP_YES) \{
-109       return MP_VAL;
-110     \}
-111
-112     /* if a < b then q=0, r = a */
-113     if (mp_cmp_mag (a, b) == MP_LT) \{
-114       if (d != NULL) \{
-115         res = mp_copy (a, d);
-116       \} else \{
-117         res = MP_OKAY;
-118       \}
-119       if (c != NULL) \{
-120         mp_zero (c);
-121       \}
-122       return res;
-123     \}
-124
-125     if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) \{
-126       return res;
-127     \}
-128     q.used = a->used + 2;
-129
-130     if ((res = mp_init (&t1)) != MP_OKAY) \{
-131       goto LBL_Q;
-132     \}
-133
-134     if ((res = mp_init (&t2)) != MP_OKAY) \{
-135       goto LBL_T1;
-136     \}
-137
-138     if ((res = mp_init_copy (&x, a)) != MP_OKAY) \{
-139       goto LBL_T2;
-140     \}
-141
-142     if ((res = mp_init_copy (&y, b)) != MP_OKAY) \{
-143       goto LBL_X;
-144     \}
-145
-146     /* fix the sign */
-147     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-148     x.sign = y.sign = MP_ZPOS;
-149
-150     /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
-151     norm = mp_count_bits(&y) % DIGIT_BIT;
-152     if (norm < (int)(DIGIT_BIT-1)) \{
-153        norm = (DIGIT_BIT-1) - norm;
-154        if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) \{
-155          goto LBL_Y;
-156        \}
-157        if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) \{
-158          goto LBL_Y;
-159        \}
-160     \} else \{
-161        norm = 0;
-162     \}
-163
-164     /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
-165     n = x.used - 1;
-166     t = y.used - 1;
-167
-168     /* while (x >= y*b**n-t) do \{ q[n-t] += 1; x -= y*b**\{n-t\} \} */
-169     if ((res = mp_lshd (&y, n - t)) != MP_OKAY) \{ /* y = y*b**\{n-t\} */
-170       goto LBL_Y;
-171     \}
-172
-173     while (mp_cmp (&x, &y) != MP_LT) \{
-174       ++(q.dp[n - t]);
-175       if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) \{
-176         goto LBL_Y;
-177       \}
-178     \}
-179
-180     /* reset y by shifting it back down */
-181     mp_rshd (&y, n - t);
-182
-183     /* step 3. for i from n down to (t + 1) */
-184     for (i = n; i >= (t + 1); i--) \{
-185       if (i > x.used) \{
-186         continue;
-187       \}
-188
-189       /* step 3.1 if xi == yt then set q\{i-t-1\} to b-1,
-190        * otherwise set q\{i-t-1\} to (xi*b + x\{i-1\})/yt */
-191       if (x.dp[i] == y.dp[t]) \{
-192         q.dp[(i - t) - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
-193       \} else \{
-194         mp_word tmp;
-195         tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
-196         tmp |= ((mp_word) x.dp[i - 1]);
-197         tmp /= ((mp_word) y.dp[t]);
-198         if (tmp > (mp_word) MP_MASK) \{
-199           tmp = MP_MASK;
-200         \}
-201         q.dp[(i - t) - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
-202       \}
-203
-204       /* while (q\{i-t-1\} * (yt * b + y\{t-1\})) >
-205                xi * b**2 + xi-1 * b + xi-2
-206
-207          do q\{i-t-1\} -= 1;
-208       */
-209       q.dp[(i - t) - 1] = (q.dp[(i - t) - 1] + 1) & MP_MASK;
-210       do \{
-211         q.dp[(i - t) - 1] = (q.dp[(i - t) - 1] - 1) & MP_MASK;
-212
-213         /* find left hand */
-214         mp_zero (&t1);
-215         t1.dp[0] = ((t - 1) < 0) ? 0 : y.dp[t - 1];
-216         t1.dp[1] = y.dp[t];
-217         t1.used = 2;
-218         if ((res = mp_mul_d (&t1, q.dp[(i - t) - 1], &t1)) != MP_OKAY) \{
-219           goto LBL_Y;
-220         \}
-221
-222         /* find right hand */
-223         t2.dp[0] = ((i - 2) < 0) ? 0 : x.dp[i - 2];
-224         t2.dp[1] = ((i - 1) < 0) ? 0 : x.dp[i - 1];
-225         t2.dp[2] = x.dp[i];
-226         t2.used = 3;
-227       \} while (mp_cmp_mag(&t1, &t2) == MP_GT);
-228
-229       /* step 3.3 x = x - q\{i-t-1\} * y * b**\{i-t-1\} */
-230       if ((res = mp_mul_d (&y, q.dp[(i - t) - 1], &t1)) != MP_OKAY) \{
-231         goto LBL_Y;
-232       \}
-233
-234       if ((res = mp_lshd (&t1, (i - t) - 1)) != MP_OKAY) \{
-235         goto LBL_Y;
-236       \}
-237
-238       if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) \{
-239         goto LBL_Y;
-240       \}
-241
-242       /* if x < 0 then \{ x = x + y*b**\{i-t-1\}; q\{i-t-1\} -= 1; \} */
-243       if (x.sign == MP_NEG) \{
-244         if ((res = mp_copy (&y, &t1)) != MP_OKAY) \{
-245           goto LBL_Y;
-246         \}
-247         if ((res = mp_lshd (&t1, (i - t) - 1)) != MP_OKAY) \{
-248           goto LBL_Y;
-249         \}
-250         if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) \{
-251           goto LBL_Y;
-252         \}
-253
-254         q.dp[(i - t) - 1] = (q.dp[(i - t) - 1] - 1UL) & MP_MASK;
-255       \}
-256     \}
-257
-258     /* now q is the quotient and x is the remainder
-259      * [which we have to normalize]
-260      */
-261
-262     /* get sign before writing to c */
-263     x.sign = (x.used == 0) ? MP_ZPOS : a->sign;
-264
-265     if (c != NULL) \{
-266       mp_clamp (&q);
-267       mp_exch (&q, c);
-268       c->sign = neg;
-269     \}
-270
-271     if (d != NULL) \{
-272       if ((res = mp_div_2d (&x, norm, &x, NULL)) != MP_OKAY) \{
-273         goto LBL_Y;
-274       \}
-275       mp_exch (&x, d);
-276     \}
-277
-278     res = MP_OKAY;
-279
-280   LBL_Y:mp_clear (&y);
-281   LBL_X:mp_clear (&x);
-282   LBL_T2:mp_clear (&t2);
-283   LBL_T1:mp_clear (&t1);
-284   LBL_Q:mp_clear (&q);
-285     return res;
-286   \}
-287
-288   #endif
-289
-290   #endif
-291
-\end{alltt}
-\end{small}
-
-The implementation of this algorithm differs slightly from the pseudo code presented previously.  In this algorithm either of the quotient $c$ or
-remainder $d$ may be passed as a \textbf{NULL} pointer which indicates their value is not desired.  For example, the C code to call the division
-algorithm with only the quotient is
-
-\begin{verbatim}
-mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
-\end{verbatim}
-
-Lines 108 and 113 handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor
-respectively.  After the two trivial cases all of the temporary variables are initialized.  Line 147 determines the sign of
-the quotient and line 148 ensures that both $x$ and $y$ are positive.
-
-The number of bits in the leading digit is calculated on line 151.  Implicitly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
-of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
-exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
-them to the left by $lg(\beta) - 1 - k$ bits.
-
-Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the
-leading digit of the quotient.  The loop beginning on line 184 will produce the remainder of the quotient digits.
-
-The conditional ``continue'' on line 186 is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
-algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
-above the $i$'th position $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.
-
-Lines 214, 216 and 223 through 225 manually construct the high accuracy estimations by setting the digits of the two mp\_int
-variables directly.
-
-\section{Single Digit Helpers}
-
-This section briefly describes a series of single digit helper algorithms which come in handy when working with small constants.  All of
-the helper functions assume the single digit input is positive and will treat them as such.
-
-\subsection{Single Digit Addition and Subtraction}
-
-Both addition and subtraction are performed by ``cheating'' and using mp\_set followed by the higher level addition or subtraction
-algorithms.   As a result these algorithms are substantially simpler with a slight cost in performance.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_add\_d}. \\
-\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
-\textbf{Output}.  $c = a + b$ \\
-\hline \\
-1.  $t \leftarrow b$ (\textit{mp\_set}) \\
-2.  $c \leftarrow a + t$ \\
-3.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_add\_d}
-\end{figure}
-
-\textbf{Algorithm mp\_add\_d.}
-This algorithm initiates a temporary mp\_int with the value of the single digit and uses algorithm mp\_add to add the two values together.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_add\_d.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* single digit addition */
-018   int
-019   mp_add_d (mp_int * a, mp_digit b, mp_int * c)
-020   \{
-021     int     res, ix, oldused;
-022     mp_digit *tmpa, *tmpc, mu;
-023
-024     /* grow c as required */
-025     if (c->alloc < (a->used + 1)) \{
-026        if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) \{
-027           return res;
-028        \}
-029     \}
-030
-031     /* if a is negative and |a| >= b, call c = |a| - b */
-032     if ((a->sign == MP_NEG) && ((a->used > 1) || (a->dp[0] >= b))) \{
-033        /* temporarily fix sign of a */
-034        a->sign = MP_ZPOS;
-035
-036        /* c = |a| - b */
-037        res = mp_sub_d(a, b, c);
-038
-039        /* fix sign  */
-040        a->sign = c->sign = MP_NEG;
-041
-042        /* clamp */
-043        mp_clamp(c);
-044
-045        return res;
-046     \}
-047
-048     /* old number of used digits in c */
-049     oldused = c->used;
-050
-051     /* sign always positive */
-052     c->sign = MP_ZPOS;
-053
-054     /* source alias */
-055     tmpa    = a->dp;
-056
-057     /* destination alias */
-058     tmpc    = c->dp;
-059
-060     /* if a is positive */
-061     if (a->sign == MP_ZPOS) \{
-062        /* add digit, after this we're propagating
-063         * the carry.
-064         */
-065        *tmpc   = *tmpa++ + b;
-066        mu      = *tmpc >> DIGIT_BIT;
-067        *tmpc++ &= MP_MASK;
-068
-069        /* now handle rest of the digits */
-070        for (ix = 1; ix < a->used; ix++) \{
-071           *tmpc   = *tmpa++ + mu;
-072           mu      = *tmpc >> DIGIT_BIT;
-073           *tmpc++ &= MP_MASK;
-074        \}
-075        /* set final carry */
-076        ix++;
-077        *tmpc++  = mu;
-078
-079        /* setup size */
-080        c->used = a->used + 1;
-081     \} else \{
-082        /* a was negative and |a| < b */
-083        c->used  = 1;
-084
-085        /* the result is a single digit */
-086        if (a->used == 1) \{
-087           *tmpc++  =  b - a->dp[0];
-088        \} else \{
-089           *tmpc++  =  b;
-090        \}
-091
-092        /* setup count so the clearing of oldused
-093         * can fall through correctly
-094         */
-095        ix       = 1;
-096     \}
-097
-098     /* now zero to oldused */
-099     while (ix++ < oldused) \{
-100        *tmpc++ = 0;
-101     \}
-102     mp_clamp(c);
-103
-104     return MP_OKAY;
-105   \}
-106
-107   #endif
-108
-\end{alltt}
-\end{small}
-
-Clever use of the letter 't'.
-
-\subsubsection{Subtraction}
-The single digit subtraction algorithm mp\_sub\_d is essentially the same except it uses mp\_sub to subtract the digit from the mp\_int.
-
-\subsection{Single Digit Multiplication}
-Single digit multiplication arises enough in division and radix conversion that it ought to be implemented as a special case of the baseline
-multiplication algorithm.  Essentially this algorithm is a modified version of algorithm s\_mp\_mul\_digs where one of the multiplicands
-only has one digit.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_d}. \\
-\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
-\textbf{Output}.  $c = ab$ \\
-\hline \\
-1.  $pa \leftarrow a.used$ \\
-2.  Grow $c$ to at least $pa + 1$ digits. \\
-3.  $oldused \leftarrow c.used$ \\
-4.  $c.used \leftarrow pa + 1$ \\
-5.  $c.sign \leftarrow a.sign$ \\
-6.  $\mu \leftarrow 0$ \\
-7.  for $ix$ from $0$ to $pa - 1$ do \\
-\hspace{3mm}7.1  $\hat r \leftarrow \mu + a_{ix}b$ \\
-\hspace{3mm}7.2  $c_{ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}7.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-8.  $c_{pa} \leftarrow \mu$ \\
-9.  for $ix$ from $pa + 1$ to $oldused$ do \\
-\hspace{3mm}9.1  $c_{ix} \leftarrow 0$ \\
-10.  Clamp excess digits of $c$. \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_d}
-\end{figure}
-\textbf{Algorithm mp\_mul\_d.}
-This algorithm quickly multiplies an mp\_int by a small single digit value.  It is specially tailored to the job and has a minimum of overhead.
-Unlike the full multiplication algorithms this algorithm does not require any significant temporary storage or memory allocations.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_d.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* multiply by a digit */
-018   int
-019   mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
-020   \{
-021     mp_digit u, *tmpa, *tmpc;
-022     mp_word  r;
-023     int      ix, res, olduse;
-024
-025     /* make sure c is big enough to hold a*b */
-026     if (c->alloc < (a->used + 1)) \{
-027       if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) \{
-028         return res;
-029       \}
-030     \}
-031
-032     /* get the original destinations used count */
-033     olduse = c->used;
-034
-035     /* set the sign */
-036     c->sign = a->sign;
-037
-038     /* alias for a->dp [source] */
-039     tmpa = a->dp;
-040
-041     /* alias for c->dp [dest] */
-042     tmpc = c->dp;
-043
-044     /* zero carry */
-045     u = 0;
-046
-047     /* compute columns */
-048     for (ix = 0; ix < a->used; ix++) \{
-049       /* compute product and carry sum for this term */
-050       r       = (mp_word)u + ((mp_word)*tmpa++ * (mp_word)b);
-051
-052       /* mask off higher bits to get a single digit */
-053       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
-054
-055       /* send carry into next iteration */
-056       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-057     \}
-058
-059     /* store final carry [if any] and increment ix offset  */
-060     *tmpc++ = u;
-061     ++ix;
-062
-063     /* now zero digits above the top */
-064     while (ix++ < olduse) \{
-065        *tmpc++ = 0;
-066     \}
-067
-068     /* set used count */
-069     c->used = a->used + 1;
-070     mp_clamp(c);
-071
-072     return MP_OKAY;
-073   \}
-074   #endif
-075
-\end{alltt}
-\end{small}
-
-In this implementation the destination $c$ may point to the same mp\_int as the source $a$ since the result is written after the digit is
-read from the source.  This function uses pointer aliases $tmpa$ and $tmpc$ for the digits of $a$ and $c$ respectively.
-
-\subsection{Single Digit Division}
-Like the single digit multiplication algorithm, single digit division is also a fairly common algorithm used in radix conversion.  Since the
-divisor is only a single digit a specialized variant of the division algorithm can be used to compute the quotient.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_d}. \\
-\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
-\textbf{Output}.  $c = \lfloor a / b \rfloor, d = a - cb$ \\
-\hline \\
-1.  If $b = 0$ then return(\textit{MP\_VAL}).\\
-2.  If $b = 3$ then use algorithm mp\_div\_3 instead. \\
-3.  Init $q$ to $a.used$ digits.  \\
-4.  $q.used \leftarrow a.used$ \\
-5.  $q.sign \leftarrow a.sign$ \\
-6.  $\hat w \leftarrow 0$ \\
-7.  for $ix$ from $a.used - 1$ down to $0$ do \\
-\hspace{3mm}7.1  $\hat w \leftarrow \hat w \beta + a_{ix}$ \\
-\hspace{3mm}7.2  If $\hat w \ge b$ then \\
-\hspace{6mm}7.2.1  $t \leftarrow \lfloor \hat w / b \rfloor$ \\
-\hspace{6mm}7.2.2  $\hat w \leftarrow \hat w \mbox{ (mod }b\mbox{)}$ \\
-\hspace{3mm}7.3  else\\
-\hspace{6mm}7.3.1  $t \leftarrow 0$ \\
-\hspace{3mm}7.4  $q_{ix} \leftarrow t$ \\
-8.  $d \leftarrow \hat w$ \\
-9.  Clamp excess digits of $q$. \\
-10.  $c \leftarrow q$ \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_d}
-\end{figure}
-\textbf{Algorithm mp\_div\_d.}
-This algorithm divides the mp\_int $a$ by the single mp\_digit $b$ using an optimized approach.  Essentially in every iteration of the
-algorithm another digit of the dividend is reduced and another digit of quotient produced.  Provided $b < \beta$ the value of $\hat w$
-after step 7.1 will be limited such that $0 \le \lfloor \hat w / b \rfloor < \beta$.
-
-If the divisor $b$ is equal to three a variant of this algorithm is used which is called mp\_div\_3.  It replaces the division by three with
-a multiplication by $\lfloor \beta / 3 \rfloor$ and the appropriate shift and residual fixup.  In essence it is much like the Barrett reduction
-from chapter seven.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_d.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   static int s_is_power_of_two(mp_digit b, int *p)
-018   \{
-019      int x;
-020
-021      /* fast return if no power of two */
-022      if ((b == 0) || ((b & (b-1)) != 0)) \{
-023         return 0;
-024      \}
-025
-026      for (x = 0; x < DIGIT_BIT; x++) \{
-027         if (b == (((mp_digit)1)<<x)) \{
-028            *p = x;
-029            return 1;
-030         \}
-031      \}
-032      return 0;
-033   \}
-034
-035   /* single digit division (based on routine from MPI) */
-036   int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
-037   \{
-038     mp_int  q;
-039     mp_word w;
-040     mp_digit t;
-041     int     res, ix;
-042
-043     /* cannot divide by zero */
-044     if (b == 0) \{
-045        return MP_VAL;
-046     \}
-047
-048     /* quick outs */
-049     if ((b == 1) || (mp_iszero(a) == MP_YES)) \{
-050        if (d != NULL) \{
-051           *d = 0;
-052        \}
-053        if (c != NULL) \{
-054           return mp_copy(a, c);
-055        \}
-056        return MP_OKAY;
-057     \}
-058
-059     /* power of two ? */
-060     if (s_is_power_of_two(b, &ix) == 1) \{
-061        if (d != NULL) \{
-062           *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
-063        \}
-064        if (c != NULL) \{
-065           return mp_div_2d(a, ix, c, NULL);
-066        \}
-067        return MP_OKAY;
-068     \}
-069
-070   #ifdef BN_MP_DIV_3_C
-071     /* three? */
-072     if (b == 3) \{
-073        return mp_div_3(a, c, d);
-074     \}
-075   #endif
-076
-077     /* no easy answer [c'est la vie].  Just division */
-078     if ((res = mp_init_size(&q, a->used)) != MP_OKAY) \{
-079        return res;
-080     \}
-081
-082     q.used = a->used;
-083     q.sign = a->sign;
-084     w = 0;
-085     for (ix = a->used - 1; ix >= 0; ix--) \{
-086        w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
-087
-088        if (w >= b) \{
-089           t = (mp_digit)(w / b);
-090           w -= ((mp_word)t) * ((mp_word)b);
-091         \} else \{
-092           t = 0;
-093         \}
-094         q.dp[ix] = (mp_digit)t;
-095     \}
-096
-097     if (d != NULL) \{
-098        *d = (mp_digit)w;
-099     \}
-100
-101     if (c != NULL) \{
-102        mp_clamp(&q);
-103        mp_exch(&q, c);
-104     \}
-105     mp_clear(&q);
-106
-107     return res;
-108   \}
-109
-110   #endif
-111
-\end{alltt}
-\end{small}
-
-Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to
-indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d to be created.
-
-The division and remainder on lines 89 and 90 can be replaced often by a single division on most processors.  For example, the 32-bit x86 based
-processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC
-compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.
-
-\subsection{Single Digit Root Extraction}
-
-Finding the $n$'th root of an integer is fairly easy as far as numerical analysis is concerned.  Algorithms such as the Newton-Raphson approximation
-(\ref{eqn:newton}) series will converge very quickly to a root for any continuous function $f(x)$.
-
-\begin{equation}
-x_{i+1} = x_i - {f(x_i) \over f'(x_i)}
-\label{eqn:newton}
-\end{equation}
-
-In this case the $n$'th root is desired and $f(x) = x^n - a$ where $a$ is the integer of which the root is desired.  The derivative of $f(x)$ is
-simply $f'(x) = nx^{n - 1}$.  Of particular importance is that this algorithm will be used over the integers not over a more continuous domain
-such as the real numbers.  As a result the root found can be above the true root by a few and must be manually adjusted.  Ideally at the end of the
-algorithm the $n$'th root $b$ of an integer $a$ is desired such that $b^n \le a$.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_n\_root}. \\
-\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
-\textbf{Output}.  $c^b \le a$ \\
-\hline \\
-1.  If $b$ is even and $a.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
-2.  $sign \leftarrow a.sign$ \\
-3.  $a.sign \leftarrow MP\_ZPOS$ \\
-4.  t$2 \leftarrow 2$ \\
-5.  Loop \\
-\hspace{3mm}5.1  t$1 \leftarrow $ t$2$ \\
-\hspace{3mm}5.2  t$3 \leftarrow $ t$1^{b - 1}$ \\
-\hspace{3mm}5.3  t$2 \leftarrow $ t$3 $ $\cdot$ t$1$ \\
-\hspace{3mm}5.4  t$2 \leftarrow $ t$2 - a$ \\
-\hspace{3mm}5.5  t$3 \leftarrow $ t$3 \cdot b$ \\
-\hspace{3mm}5.6  t$3 \leftarrow \lfloor $t$2 / $t$3 \rfloor$ \\
-\hspace{3mm}5.7  t$2 \leftarrow $ t$1 - $ t$3$ \\
-\hspace{3mm}5.8  If t$1 \ne $ t$2$ then goto step 5.  \\
-6.  Loop \\
-\hspace{3mm}6.1  t$2 \leftarrow $ t$1^b$ \\
-\hspace{3mm}6.2  If t$2 > a$ then \\
-\hspace{6mm}6.2.1  t$1 \leftarrow $ t$1 - 1$ \\
-\hspace{6mm}6.2.2  Goto step 6. \\
-7.  $a.sign \leftarrow sign$ \\
-8.  $c \leftarrow $ t$1$ \\
-9.  $c.sign \leftarrow sign$  \\
-10.  Return(\textit{MP\_OKAY}).  \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_n\_root}
-\end{figure}
-\textbf{Algorithm mp\_n\_root.}
-This algorithm finds the integer $n$'th root of an input using the Newton-Raphson approach.  It is partially optimized based on the observation
-that the numerator of ${f(x) \over f'(x)}$ can be derived from a partial denominator.  That is at first the denominator is calculated by finding
-$x^{b - 1}$.  This value can then be multiplied by $x$ and have $a$ subtracted from it to find the numerator.  This saves a total of $b - 1$
-multiplications by t$1$ inside the loop.
-
-The initial value of the approximation is t$2 = 2$ which allows the algorithm to start with very small values and quickly converge on the
-root.  Ideally this algorithm is meant to find the $n$'th root of an input where $n$ is bounded by $2 \le n \le 5$.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_n\_root.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* wrapper function for mp_n_root_ex()
-018    * computes c = (a)**(1/b) such that (c)**b <= a and (c+1)**b > a
-019    */
-020   int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
-021   \{
-022     return mp_n_root_ex(a, b, c, 0);
-023   \}
-024
-025   #endif
-026
-\end{alltt}
-\end{small}
-
-\section{Random Number Generation}
-
-Random numbers come up in a variety of activities from public key cryptography to simple simulations and various randomized algorithms.  Pollard-Rho
-factoring for example, can make use of random values as starting points to find factors of a composite integer.  In this case the algorithm presented
-is solely for simulations and not intended for cryptographic use.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_rand}. \\
-\textbf{Input}.   An integer $b$ \\
-\textbf{Output}.  A pseudo-random number of $b$ digits \\
-\hline \\
-1.  $a \leftarrow 0$ \\
-2.  If $b \le 0$ return(\textit{MP\_OKAY}) \\
-3.  Pick a non-zero random digit $d$. \\
-4.  $a \leftarrow a + d$ \\
-5.  for $ix$ from 1 to $b - 1$ do \\
-\hspace{3mm}5.1  $a \leftarrow a \cdot \beta$ \\
-\hspace{3mm}5.2  Pick a random digit $d$. \\
-\hspace{3mm}5.3  $a \leftarrow a + d$ \\
-6.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_rand}
-\end{figure}
-\textbf{Algorithm mp\_rand.}
-This algorithm produces a pseudo-random integer of $b$ digits.  By ensuring that the first digit is non-zero the algorithm also guarantees that the
-final result has at least $b$ digits.  It relies heavily on a third-party random number generator which should ideally generate uniformly all of
-the integers from $0$ to $\beta - 1$.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_rand.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* makes a pseudo-random int of a given size */
-018   int
-019   mp_rand (mp_int * a, int digits)
-020   \{
-021     int     res;
-022     mp_digit d;
-023
-024     mp_zero (a);
-025     if (digits <= 0) \{
-026       return MP_OKAY;
-027     \}
-028
-029     /* first place a random non-zero digit */
-030     do \{
-031       d = ((mp_digit) abs (MP_GEN_RANDOM())) & MP_MASK;
-032     \} while (d == 0);
-033
-034     if ((res = mp_add_d (a, d, a)) != MP_OKAY) \{
-035       return res;
-036     \}
-037
-038     while (--digits > 0) \{
-039       if ((res = mp_lshd (a, 1)) != MP_OKAY) \{
-040         return res;
-041       \}
-042
-043       if ((res = mp_add_d (a, ((mp_digit) abs (MP_GEN_RANDOM())), a)) != MP_OK
-      AY) \{
-044         return res;
-045       \}
-046     \}
-047
-048     return MP_OKAY;
-049   \}
-050   #endif
-051
-\end{alltt}
-\end{small}
-
-\section{Formatted Representations}
-The ability to emit a radix-$n$ textual representation of an integer is useful for interacting with human parties.  For example, the ability to
-be given a string of characters such as ``114585'' and turn it into the radix-$\beta$ equivalent would make it easier to enter numbers
-into a program.
-
-\subsection{Reading Radix-n Input}
-For the purposes of this text we will assume that a simple lower ASCII map (\ref{fig:ASC}) is used to map the values from $0$ to $63$ to
-printable characters.  For example, when the character ``N'' is read it represents the integer $23$.  The first $16$ characters of the
-map are for the common representations up to hexadecimal.  After that they match the ``base64'' encoding scheme which are suitably chosen
-such that they are printable.  While outputting as base64 may not be too helpful for human operators it does allow communication via non binary
-mediums.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{cc|cc|cc|cc}
-\hline \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} &  \textbf{Value} & \textbf{Char} \\
-\hline
-0 & 0 & 1 & 1 & 2 & 2 & 3 & 3 \\
-4 & 4 & 5 & 5 & 6 & 6 & 7 & 7 \\
-8 & 8 & 9 & 9 & 10 & A & 11 & B \\
-12 & C & 13 & D & 14 & E & 15 & F \\
-16 & G & 17 & H & 18 & I & 19 & J \\
-20 & K & 21 & L & 22 & M & 23 & N \\
-24 & O & 25 & P & 26 & Q & 27 & R \\
-28 & S & 29 & T & 30 & U & 31 & V \\
-32 & W & 33 & X & 34 & Y & 35 & Z \\
-36 & a & 37 & b & 38 & c & 39 & d \\
-40 & e & 41 & f & 42 & g & 43 & h \\
-44 & i & 45 & j & 46 & k & 47 & l \\
-48 & m & 49 & n & 50 & o & 51 & p \\
-52 & q & 53 & r & 54 & s & 55 & t \\
-56 & u & 57 & v & 58 & w & 59 & x \\
-60 & y & 61 & z & 62 & $+$ & 63 & $/$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Lower ASCII Map}
-\label{fig:ASC}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_read\_radix}. \\
-\textbf{Input}.   A string $str$ of length $sn$ and radix $r$. \\
-\textbf{Output}.  The radix-$\beta$ equivalent mp\_int. \\
-\hline \\
-1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
-2.  $ix \leftarrow 0$ \\
-3.  If $str_0 =$ ``-'' then do \\
-\hspace{3mm}3.1  $ix \leftarrow ix + 1$ \\
-\hspace{3mm}3.2  $sign \leftarrow MP\_NEG$ \\
-4.  else \\
-\hspace{3mm}4.1  $sign \leftarrow MP\_ZPOS$ \\
-5.  $a \leftarrow 0$ \\
-6.  for $iy$ from $ix$ to $sn - 1$ do \\
-\hspace{3mm}6.1  Let $y$ denote the position in the map of $str_{iy}$. \\
-\hspace{3mm}6.2  If $str_{iy}$ is not in the map or $y \ge r$ then goto step 7. \\
-\hspace{3mm}6.3  $a \leftarrow a \cdot r$ \\
-\hspace{3mm}6.4  $a \leftarrow a + y$ \\
-7.  If $a \ne 0$ then $a.sign \leftarrow sign$ \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_read\_radix}
-\end{figure}
-\textbf{Algorithm mp\_read\_radix.}
-This algorithm will read an ASCII string and produce the radix-$\beta$ mp\_int representation of the same integer.  A minus symbol ``-'' may precede the
-string  to indicate the value is negative, otherwise it is assumed to be positive.  The algorithm will read up to $sn$ characters from the input
-and will stop as soon as it reads a character that is not in the map or whose value is not below the radix.  This allows numbers to be embedded
-as part of larger input without any significant problem.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_read\_radix.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* read a string [ASCII] in a given radix */
-018   int mp_read_radix (mp_int * a, const char *str, int radix)
-019   \{
-020     int     y, res, neg;
-021     char    ch;
-022
-023     /* zero the digit bignum */
-024     mp_zero(a);
-025
-026     /* make sure the radix is ok */
-027     if ((radix < 2) || (radix > 64)) \{
-028       return MP_VAL;
-029     \}
-030
-031     /* if the leading digit is a
-032      * minus set the sign to negative.
-033      */
-034     if (*str == '-') \{
-035       ++str;
-036       neg = MP_NEG;
-037     \} else \{
-038       neg = MP_ZPOS;
-039     \}
-040
-041     /* set the integer to the default of zero */
-042     mp_zero (a);
-043
-044     /* process each digit of the string */
-045     while (*str != '\symbol{92}0') \{
-046       /* if the radix <= 36 the conversion is case insensitive
-047        * this allows numbers like 1AB and 1ab to represent the same  value
-048        * [e.g. in hex]
-049        */
-050       ch = (radix <= 36) ? (char)toupper((int)*str) : *str;
-051       for (y = 0; y < 64; y++) \{
-052         if (ch == mp_s_rmap[y]) \{
-053            break;
-054         \}
-055       \}
-056
-057       /* if the char was found in the map
-058        * and is less than the given radix add it
-059        * to the number, otherwise exit the loop.
-060        */
-061       if (y < radix) \{
-062         if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) \{
-063            return res;
-064         \}
-065         if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) \{
-066            return res;
-067         \}
-068       \} else \{
-069         break;
-070       \}
-071       ++str;
-072     \}
-073
-074     /* set the sign only if a != 0 */
-075     if (mp_iszero(a) != MP_YES) \{
-076        a->sign = neg;
-077     \}
-078     return MP_OKAY;
-079   \}
-080   #endif
-081
-\end{alltt}
-\end{small}
-
-\subsection{Generating Radix-$n$ Output}
-Generating radix-$n$ output is fairly trivial with a division and remainder algorithm.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toradix}. \\
-\textbf{Input}.   A mp\_int $a$ and an integer $r$\\
-\textbf{Output}.  The radix-$r$ representation of $a$ \\
-\hline \\
-1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
-2.  If $a = 0$ then $str = $ ``$0$'' and return(\textit{MP\_OKAY}).  \\
-3.  $t \leftarrow a$ \\
-4.  $str \leftarrow$ ``'' \\
-5.  if $t.sign = MP\_NEG$ then \\
-\hspace{3mm}5.1  $str \leftarrow str + $ ``-'' \\
-\hspace{3mm}5.2  $t.sign = MP\_ZPOS$ \\
-6.  While ($t \ne 0$) do \\
-\hspace{3mm}6.1  $d \leftarrow t \mbox{ (mod }r\mbox{)}$ \\
-\hspace{3mm}6.2  $t \leftarrow \lfloor t / r \rfloor$ \\
-\hspace{3mm}6.3  Look up $d$ in the map and store the equivalent character in $y$. \\
-\hspace{3mm}6.4  $str \leftarrow str + y$ \\
-7.  If $str_0 = $``$-$'' then \\
-\hspace{3mm}7.1  Reverse the digits $str_1, str_2, \ldots str_n$. \\
-8.  Otherwise \\
-\hspace{3mm}8.1  Reverse the digits $str_0, str_1, \ldots str_n$. \\
-9.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toradix}
-\end{figure}
-\textbf{Algorithm mp\_toradix.}
-This algorithm computes the radix-$r$ representation of an mp\_int $a$.  The ``digits'' of the representation are extracted by reducing
-the successive quotients $\lfloor a / r^k \rfloor$ of the input modulo $r$ until $r^k > a$.  Note that instead of actually dividing by $r^k$ in
-each iteration the quotient $\lfloor a / r \rfloor$ is saved for the next iteration.  As a result a series of trivial $n \times 1$ divisions
-are required instead of a series of $n \times k$ divisions.  One design flaw of this approach is that the digits are produced in the reverse order
-(see~\ref{fig:mpradix}).  To remedy this flaw the digits must be swapped or simply ``reversed''.
-
-\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Value of $a$} & \textbf{Value of $d$} & \textbf{Value of $str$} \\
-\hline $1234$ & -- & -- \\
-\hline $123$  & $4$ & ``4'' \\
-\hline $12$   & $3$ & ``43'' \\
-\hline $1$    & $2$ & ``432'' \\
-\hline $0$    & $1$ & ``4321'' \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Algorithm mp\_toradix.}
-\label{fig:mpradix}
-\end{figure}
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_toradix.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* stores a bignum as a ASCII string in a given radix (2..64) */
-018   int mp_toradix (mp_int * a, char *str, int radix)
-019   \{
-020     int     res, digs;
-021     mp_int  t;
-022     mp_digit d;
-023     char   *_s = str;
-024
-025     /* check range of the radix */
-026     if ((radix < 2) || (radix > 64)) \{
-027       return MP_VAL;
-028     \}
-029
-030     /* quick out if its zero */
-031     if (mp_iszero(a) == MP_YES) \{
-032        *str++ = '0';
-033        *str = '\symbol{92}0';
-034        return MP_OKAY;
-035     \}
-036
-037     if ((res = mp_init_copy (&t, a)) != MP_OKAY) \{
-038       return res;
-039     \}
-040
-041     /* if it is negative output a - */
-042     if (t.sign == MP_NEG) \{
-043       ++_s;
-044       *str++ = '-';
-045       t.sign = MP_ZPOS;
-046     \}
-047
-048     digs = 0;
-049     while (mp_iszero (&t) == MP_NO) \{
-050       if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) \{
-051         mp_clear (&t);
-052         return res;
-053       \}
-054       *str++ = mp_s_rmap[d];
-055       ++digs;
-056     \}
-057
-058     /* reverse the digits of the string.  In this case _s points
-059      * to the first digit [exluding the sign] of the number]
-060      */
-061     bn_reverse ((unsigned char *)_s, digs);
-062
-063     /* append a NULL so the string is properly terminated */
-064     *str = '\symbol{92}0';
-065
-066     mp_clear (&t);
-067     return MP_OKAY;
-068   \}
-069
-070   #endif
-071
-\end{alltt}
-\end{small}
-
-\chapter{Number Theoretic Algorithms}
-This chapter discusses several fundamental number theoretic algorithms such as the greatest common divisor, least common multiple and Jacobi
-symbol computation.  These algorithms arise as essential components in several key cryptographic algorithms such as the RSA public key algorithm and
-various Sieve based factoring algorithms.
-
-\section{Greatest Common Divisor}
-The greatest common divisor of two integers $a$ and $b$, often denoted as $(a, b)$ is the largest integer $k$ that is a divisor of
-both $a$ and $b$.  That is, $k$ is the largest integer such that $0 \equiv a \mbox{ (mod }k\mbox{)}$ and $0 \equiv b \mbox{ (mod }k\mbox{)}$ occur
-simultaneously.
-
-The most common approach (cite) is to reduce one input modulo another.  That is if $a$ and $b$ are divisible by some integer $k$ and if $qa + r = b$ then
-$r$ is also divisible by $k$.  The reduction pattern follows $\left < a , b \right > \rightarrow \left < b, a \mbox{ mod } b \right >$.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Greatest Common Divisor (I)}. \\
-\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
-\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
-\hline \\
-1.  While ($b > 0$) do \\
-\hspace{3mm}1.1  $r \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
-\hspace{3mm}1.2  $a \leftarrow b$ \\
-\hspace{3mm}1.3  $b \leftarrow r$ \\
-2.  Return($a$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Greatest Common Divisor (I)}
-\label{fig:gcd1}
-\end{figure}
-
-This algorithm will quickly converge on the greatest common divisor since the residue $r$ tends to diminish rapidly.  However, divisions are
-relatively expensive operations to perform and should ideally be avoided.  There is another approach based on a similar relationship of
-greatest common divisors.  The faster approach is based on the observation that if $k$ divides both $a$ and $b$ it will also divide $a - b$.
-In particular, we would like $a - b$ to decrease in magnitude which implies that $b \ge a$.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Greatest Common Divisor (II)}. \\
-\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
-\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
-\hline \\
-1.  While ($b > 0$) do \\
-\hspace{3mm}1.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
-\hspace{3mm}1.2  $b \leftarrow b - a$ \\
-2.  Return($a$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Greatest Common Divisor (II)}
-\label{fig:gcd2}
-\end{figure}
-
-\textbf{Proof} \textit{Algorithm~\ref{fig:gcd2} will return the greatest common divisor of $a$ and $b$.}
-The algorithm in figure~\ref{fig:gcd2} will eventually terminate since $b \ge a$ the subtraction in step 1.2 will be a value less than $b$.  In other
-words in every iteration the tuple $\left < a, b \right >$ decreases in magnitude until eventually $a = b$.  Since both $a$ and $b$ are always
-divisible by the greatest common divisor (\textit{until the last iteration}) and in the last iteration of the algorithm $b = 0$, therefore, in the
-second to last iteration of the algorithm $b = a$ and clearly $(a, a) = a$ which concludes the proof.  \textbf{QED}.
-
-As a matter of practicality algorithm \ref{fig:gcd2} decreases far too slowly to be useful.  Especially if $b$ is much larger than $a$ such that
-$b - a$ is still very much larger than $a$.  A simple addition to the algorithm is to divide $b - a$ by a power of some integer $p$ which does
-not divide the greatest common divisor but will divide $b - a$.  In this case ${b - a} \over p$ is also an integer and still divisible by
-the greatest common divisor.
-
-However, instead of factoring $b - a$ to find a suitable value of $p$ the powers of $p$ can be removed from $a$ and $b$ that are in common first.
-Then inside the loop whenever $b - a$ is divisible by some power of $p$ it can be safely removed.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Greatest Common Divisor (III)}. \\
-\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
-\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
-\hline \\
-1.  $k \leftarrow 0$ \\
-2.  While $a$ and $b$ are both divisible by $p$ do \\
-\hspace{3mm}2.1  $a \leftarrow \lfloor a / p \rfloor$ \\
-\hspace{3mm}2.2  $b \leftarrow \lfloor b / p \rfloor$ \\
-\hspace{3mm}2.3  $k \leftarrow k + 1$ \\
-3.  While $a$ is divisible by $p$ do \\
-\hspace{3mm}3.1  $a \leftarrow \lfloor a / p \rfloor$ \\
-4.  While $b$ is divisible by $p$ do \\
-\hspace{3mm}4.1  $b \leftarrow \lfloor b / p \rfloor$ \\
-5.  While ($b > 0$) do \\
-\hspace{3mm}5.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
-\hspace{3mm}5.2  $b \leftarrow b - a$ \\
-\hspace{3mm}5.3  While $b$ is divisible by $p$ do \\
-\hspace{6mm}5.3.1  $b \leftarrow \lfloor b / p \rfloor$ \\
-6.  Return($a \cdot p^k$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Greatest Common Divisor (III)}
-\label{fig:gcd3}
-\end{figure}
-
-This algorithm is based on algorithm~\ref{fig:gcd2} except it removes powers of $p$ first and inside the main loop to ensure the tuple $\left < a, b \right >$
-decreases more rapidly.  The first loop on step two removes powers of $p$ that are in common.  A count, $k$, is kept which will present a common
-divisor of $p^k$.  After step two the remaining common divisor of $a$ and $b$ cannot be divisible by $p$.  This means that $p$ can be safely
-divided out of the difference $b - a$ so long as the division leaves no remainder.
-
-In particular the value of $p$ should be chosen such that the division on step 5.3.1 occurs often.  It also helps that division by $p$ be easy
-to compute.  The ideal choice of $p$ is two since division by two amounts to a right logical shift.  Another important observation is that by
-step five both $a$ and $b$ are odd.  Therefore, the difference $b - a$ must be even which means that each iteration removes one bit from the
-largest of the pair.
-
-\subsection{Complete Greatest Common Divisor}
-The algorithms presented so far cannot handle inputs which are zero or negative.  The following algorithm can handle all input cases properly
-and will produce the greatest common divisor.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_gcd}. \\
-\textbf{Input}.   mp\_int $a$ and $b$ \\
-\textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
-\hline \\
-1.  If $a = 0$ then \\
-\hspace{3mm}1.1  $c \leftarrow \vert b \vert $ \\
-\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $b = 0$ then \\
-\hspace{3mm}2.1  $c \leftarrow \vert a \vert $ \\
-\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
-3.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
-4.  $k \leftarrow 0$ \\
-5.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
-\hspace{3mm}5.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-\hspace{3mm}5.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-8.  While $v.used > 0$ \\
-\hspace{3mm}8.1  If $\vert u \vert > \vert v \vert$ then \\
-\hspace{6mm}8.1.1  Swap $u$ and $v$. \\
-\hspace{3mm}8.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
-\hspace{3mm}8.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{6mm}8.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-9.  $c \leftarrow u \cdot 2^k$ \\
-10.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_gcd}
-\end{figure}
-\textbf{Algorithm mp\_gcd.}
-This algorithm will produce the greatest common divisor of two mp\_ints $a$ and $b$.  The algorithm was originally based on Algorithm B of
-Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
-Algorithm B and in practice this appears to be true.
-
-The first two steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the
-largest input or zero if they are both zero.  If the inputs are not trivial then $u$ and $v$ are assigned the absolute values of
-$a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
-
-Step five will divide out any common factors of two and keep track of the count in the variable $k$.  After this step, two is no longer a
-factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Steps
-six and seven ensure that $u$ and $v$ respectively have no more factors of two.  At most only one of the while--loops will iterate since
-they cannot both be even.
-
-By step eight both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
-or greater than $u$.  This ensures that the subtraction on step 8.2 will always produce a positive and even result.  Step 8.3 removes any
-factors of two from the difference $v$ to ensure that in the next iteration of the loop both are once again odd.
-
-After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
-must be adjusted by multiplying by the common factors of two ($2^k$) removed earlier.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_gcd.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* Greatest Common Divisor using the binary method */
-018   int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     mp_int  u, v;
-021     int     k, u_lsb, v_lsb, res;
-022
-023     /* either zero than gcd is the largest */
-024     if (mp_iszero (a) == MP_YES) \{
-025       return mp_abs (b, c);
-026     \}
-027     if (mp_iszero (b) == MP_YES) \{
-028       return mp_abs (a, c);
-029     \}
-030
-031     /* get copies of a and b we can modify */
-032     if ((res = mp_init_copy (&u, a)) != MP_OKAY) \{
-033       return res;
-034     \}
-035
-036     if ((res = mp_init_copy (&v, b)) != MP_OKAY) \{
-037       goto LBL_U;
-038     \}
-039
-040     /* must be positive for the remainder of the algorithm */
-041     u.sign = v.sign = MP_ZPOS;
-042
-043     /* B1.  Find the common power of two for u and v */
-044     u_lsb = mp_cnt_lsb(&u);
-045     v_lsb = mp_cnt_lsb(&v);
-046     k     = MIN(u_lsb, v_lsb);
-047
-048     if (k > 0) \{
-049        /* divide the power of two out */
-050        if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) \{
-051           goto LBL_V;
-052        \}
-053
-054        if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) \{
-055           goto LBL_V;
-056        \}
-057     \}
-058
-059     /* divide any remaining factors of two out */
-060     if (u_lsb != k) \{
-061        if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) \{
-062           goto LBL_V;
-063        \}
-064     \}
-065
-066     if (v_lsb != k) \{
-067        if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) \{
-068           goto LBL_V;
-069        \}
-070     \}
-071
-072     while (mp_iszero(&v) == MP_NO) \{
-073        /* make sure v is the largest */
-074        if (mp_cmp_mag(&u, &v) == MP_GT) \{
-075           /* swap u and v to make sure v is >= u */
-076           mp_exch(&u, &v);
-077        \}
-078
-079        /* subtract smallest from largest */
-080        if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) \{
-081           goto LBL_V;
-082        \}
-083
-084        /* Divide out all factors of two */
-085        if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) \{
-086           goto LBL_V;
-087        \}
-088     \}
-089
-090     /* multiply by 2**k which we divided out at the beginning */
-091     if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) \{
-092        goto LBL_V;
-093     \}
-094     c->sign = MP_ZPOS;
-095     res = MP_OKAY;
-096   LBL_V:mp_clear (&u);
-097   LBL_U:mp_clear (&v);
-098     return res;
-099   \}
-100   #endif
-101
-\end{alltt}
-\end{small}
-
-This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the
-integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
-it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three
-trivial cases of inputs are handled on lines 23 through 29.  After those lines the inputs are assumed to be non-zero.
-
-Lines 32 and 36 make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two
-must be divided out of the two inputs.  The block starting at line 43 removes common factors of two by first counting the number of trailing
-zero bits in both.  The local integer $k$ is used to keep track of how many factors of $2$ are pulled out of both values.  It is assumed that
-the number of factors will not exceed the maximum value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more
-entries than are accessible by an ``int'' so this is not a limitation.}.
-
-At this point there are no more common factors of two in the two values.  The divisions by a power of two on lines 61 and 67 remove
-any independent factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
-on line 72 performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
-place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
-
-\section{Least Common Multiple}
-The least common multiple of a pair of integers is their product divided by their greatest common divisor.  For two integers $a$ and $b$ the
-least common multiple is normally denoted as $[ a, b ]$ and is numerically equivalent to ${ab} \over {(a, b)}$.  For example, if $a = 2 \cdot 2 \cdot 3 = 12$
-and $b = 2 \cdot 3 \cdot 3 \cdot 7 = 126$ the least common multiple is ${{12 \cdot 126} \over {(12, 126)}} = {1512 \over 6} = 252$.
-
-The least common multiple arises often in coding theory as well as number theory.  If two functions have periods of $a$ and $b$ respectively they will
-collide, that is be in synchronous states, after only $[ a, b ]$ iterations.  This is why, for example, random number generators based on
-Linear Feedback Shift Registers (LFSR) tend to use registers with periods which are co-prime (\textit{i.e. the greatest common divisor is one.}).
-Similarly in number theory if a composite $n$ has two prime factors $p$ and $q$ then the maximal order of any unit of $\Z/n\Z$ will be $[ p - 1, q - 1] $.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_lcm}. \\
-\textbf{Input}.   mp\_int $a$ and $b$ \\
-\textbf{Output}.  The least common multiple $c = [a, b]$.  \\
-\hline \\
-1.  $c \leftarrow (a, b)$ \\
-2.  $t \leftarrow a \cdot b$ \\
-3.  $c \leftarrow \lfloor t / c \rfloor$ \\
-4.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_lcm}
-\end{figure}
-\textbf{Algorithm mp\_lcm.}
-This algorithm computes the least common multiple of two mp\_int inputs $a$ and $b$.  It computes the least common multiple directly by
-dividing the product of the two inputs by their greatest common divisor.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_lcm.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* computes least common multiple as |a*b|/(a, b) */
-018   int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     int     res;
-021     mp_int  t1, t2;
-022
-023
-024     if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) \{
-025       return res;
-026     \}
-027
-028     /* t1 = get the GCD of the two inputs */
-029     if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) \{
-030       goto LBL_T;
-031     \}
-032
-033     /* divide the smallest by the GCD */
-034     if (mp_cmp_mag(a, b) == MP_LT) \{
-035        /* store quotient in t2 such that t2 * b is the LCM */
-036        if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) \{
-037           goto LBL_T;
-038        \}
-039        res = mp_mul(b, &t2, c);
-040     \} else \{
-041        /* store quotient in t2 such that t2 * a is the LCM */
-042        if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) \{
-043           goto LBL_T;
-044        \}
-045        res = mp_mul(a, &t2, c);
-046     \}
-047
-048     /* fix the sign to positive */
-049     c->sign = MP_ZPOS;
-050
-051   LBL_T:
-052     mp_clear_multi (&t1, &t2, NULL);
-053     return res;
-054   \}
-055   #endif
-056
-\end{alltt}
-\end{small}
-
-\section{Jacobi Symbol Computation}
-To explain the Jacobi Symbol we shall first discuss the Legendre function\footnote{Arrg.  What is the name of this?} off which the Jacobi symbol is
-defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
-equivalent to equation \ref{eqn:legendre}.
-
-\textit{-- Tom, don't be an ass, cite your source here...!}
-
-\begin{equation}
-a^{(p-1)/2} \equiv \begin{array}{rl}
-                              -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
-                              0  &  \mbox{if }p\mbox{ divides }a\mbox{.} \\
-                              1  &  \mbox{if }a\mbox{ is a quadratic residue}.
-                              \end{array} \mbox{ (mod }p\mbox{)}
-\label{eqn:legendre}
-\end{equation}
-
-\textbf{Proof.} \textit{Equation \ref{eqn:legendre} correctly identifies the residue status of an integer $a$ modulo a prime $p$.}
-An integer $a$ is a quadratic residue if the following equation has a solution.
-
-\begin{equation}
-x^2 \equiv a \mbox{ (mod }p\mbox{)}
-\label{eqn:root}
-\end{equation}
-
-Consider the following equation.
-
-\begin{equation}
-0 \equiv x^{p-1} - 1 \equiv \left \lbrace \left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \right \rbrace + \left ( a^{(p-1)/2} - 1 \right ) \mbox{ (mod }p\mbox{)}
-\label{eqn:rooti}
-\end{equation}
-
-Whether equation \ref{eqn:root} has a solution or not equation \ref{eqn:rooti} is always true.  If $a^{(p-1)/2} - 1 \equiv 0 \mbox{ (mod }p\mbox{)}$
-then the quantity in the braces must be zero.  By reduction,
-
-\begin{eqnarray}
-\left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \equiv 0  \nonumber \\
-\left (x^2 \right )^{(p-1)/2} \equiv a^{(p-1)/2} \nonumber \\
-x^2 \equiv a \mbox{ (mod }p\mbox{)}
-\end{eqnarray}
-
-As a result there must be a solution to the quadratic equation and in turn $a$ must be a quadratic residue.  If $p$ does not divide $a$ and $a$
-is not a quadratic residue then the only other value $a^{(p-1)/2}$ may be congruent to is $-1$ since
-\begin{equation}
-0 \equiv a^{p - 1} - 1 \equiv (a^{(p-1)/2} + 1)(a^{(p-1)/2} - 1) \mbox{ (mod }p\mbox{)}
-\end{equation}
-One of the terms on the right hand side must be zero.  \textbf{QED}
-
-\subsection{Jacobi Symbol}
-The Jacobi symbol is a generalization of the Legendre function for any odd non prime moduli $p$ greater than 2.  If $p = \prod_{i=0}^n p_i$ then
-the Jacobi symbol $\left ( { a \over p } \right )$ is equal to the following equation.
-
-\begin{equation}
-\left ( { a \over p } \right ) = \left ( { a \over p_0} \right ) \left ( { a \over p_1} \right ) \ldots \left ( { a \over p_n} \right )
-\end{equation}
-
-By inspection if $p$ is prime the Jacobi symbol is equivalent to the Legendre function.  The following facts\footnote{See HAC \cite[pp. 72-74]{HAC} for
-further details.} will be used to derive an efficient Jacobi symbol algorithm.  Where $p$ is an odd integer greater than two and $a, b \in \Z$ the
-following are true.
-
-\begin{enumerate}
-\item $\left ( { a \over p} \right )$ equals $-1$, $0$ or $1$.
-\item $\left ( { ab \over p} \right ) = \left ( { a \over p} \right )\left ( { b \over p} \right )$.
-\item If $a \equiv b \mbox{ (mod }p\mbox{)}$ then $\left ( { a \over p} \right ) = \left ( { b \over p} \right )$.
-\item $\left ( { 2 \over p} \right )$ equals $1$ if $p \equiv 1$ or $7 \mbox{ (mod }8\mbox{)}$.  Otherwise, it equals $-1$.
-\item $\left ( { a \over p} \right ) \equiv \left ( { p \over a} \right ) \cdot (-1)^{(p-1)(a-1)/4}$.  More specifically
-$\left ( { a \over p} \right ) = \left ( { p \over a} \right )$ if $p \equiv a \equiv 1 \mbox{ (mod }4\mbox{)}$.
-\end{enumerate}
-
-Using these facts if $a = 2^k \cdot a'$ then
-
-\begin{eqnarray}
-\left ( { a \over p } \right ) = \left ( {{2^k} \over p } \right ) \left ( {a' \over p} \right ) \nonumber \\
-                               = \left ( {2 \over p } \right )^k \left ( {a' \over p} \right )
-\label{eqn:jacobi}
-\end{eqnarray}
-
-By fact five,
-
-\begin{equation}
-\left ( { a \over p } \right ) = \left ( { p \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4}
-\end{equation}
-
-Subsequently by fact three since $p \equiv (p \mbox{ mod }a) \mbox{ (mod }a\mbox{)}$ then
-
-\begin{equation}
-\left ( { a \over p } \right ) = \left ( { {p \mbox{ mod } a} \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4}
-\end{equation}
-
-By putting both observations into equation \ref{eqn:jacobi} the following simplified equation is formed.
-
-\begin{equation}
-\left ( { a \over p } \right ) = \left ( {2 \over p } \right )^k \left ( {{p\mbox{ mod }a'} \over a'} \right )  \cdot (-1)^{(p-1)(a'-1)/4}
-\end{equation}
-
-The value of $\left ( {{p \mbox{ mod }a'} \over a'} \right )$ can be found by using the same equation recursively.  The value of
-$\left ( {2 \over p } \right )^k$ equals $1$ if $k$ is even otherwise it equals $\left ( {2 \over p } \right )$.  Using this approach the
-factors of $p$ do not have to be known.  Furthermore, if $(a, p) = 1$ then the algorithm will terminate when the recursion requests the
-Jacobi symbol computation of $\left ( {1 \over a'} \right )$ which is simply $1$.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_jacobi}. \\
-\textbf{Input}.   mp\_int $a$ and $p$, $a \ge 0$, $p \ge 3$, $p \equiv 1 \mbox{ (mod }2\mbox{)}$ \\
-\textbf{Output}.  The Jacobi symbol $c = \left ( {a \over p } \right )$. \\
-\hline \\
-1.  If $a = 0$ then \\
-\hspace{3mm}1.1  $c \leftarrow 0$ \\
-\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $a = 1$ then \\
-\hspace{3mm}2.1  $c \leftarrow 1$ \\
-\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
-3.  $a' \leftarrow a$ \\
-4.  $k \leftarrow 0$ \\
-5.  While $a'.used > 0$ and $a'_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
-\hspace{3mm}5.2  $a' \leftarrow \lfloor a' / 2 \rfloor$ \\
-6.  If $k \equiv 0 \mbox{ (mod }2\mbox{)}$ then \\
-\hspace{3mm}6.1  $s \leftarrow 1$ \\
-7.  else \\
-\hspace{3mm}7.1  $r \leftarrow p_0 \mbox{ (mod }8\mbox{)}$ \\
-\hspace{3mm}7.2  If $r = 1$ or $r = 7$ then \\
-\hspace{6mm}7.2.1  $s \leftarrow 1$ \\
-\hspace{3mm}7.3  else \\
-\hspace{6mm}7.3.1  $s \leftarrow -1$ \\
-8.  If $p_0 \equiv a'_0 \equiv 3 \mbox{ (mod }4\mbox{)}$ then \\
-\hspace{3mm}8.1  $s \leftarrow -s$ \\
-9.  If $a' \ne 1$ then \\
-\hspace{3mm}9.1  $p' \leftarrow p \mbox{ (mod }a'\mbox{)}$ \\
-\hspace{3mm}9.2  $s \leftarrow s \cdot \mbox{mp\_jacobi}(p', a')$ \\
-10.  $c \leftarrow s$ \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_jacobi}
-\end{figure}
-\textbf{Algorithm mp\_jacobi.}
-This algorithm computes the Jacobi symbol for an arbitrary non-negative integer $a$ with respect to an odd integer $p$ greater than or equal to three.  The algorithm
-is based on algorithm 2.149 of HAC \cite[pp. 73]{HAC}.
-
-Step numbers one and two handle the trivial cases of $a = 0$ and $a = 1$ respectively.  Step five determines the number of two factors in the
-input $a$.  If $k$ is even then the term $\left ( { 2 \over p } \right )^k$ must always evaluate to one.  If $k$ is odd then the term evaluates to one
-if $p_0$ is congruent to one or seven modulo eight, otherwise it evaluates to $-1$. After the $\left ( { 2 \over p } \right )^k$ term is handled
-the $(-1)^{(p-1)(a'-1)/4}$ term is computed and multiplied against the current product $s$.  The latter term evaluates to negative one if both $p$ and $a'$
-are congruent to three modulo four, otherwise it evaluates to one.
-
-By step nine if $a'$ does not equal one a recursion is required.  Step 9.1 computes $p' \equiv p \mbox{ (mod }a'\mbox{)}$ and will recurse to compute
-$\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi product.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_jacobi.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* computes the jacobi c = (a | n) (or Legendre if n is prime)
-018    * HAC pp. 73 Algorithm 2.149
-019    * HAC is wrong here, as the special case of (0 | 1) is not
-020    * handled correctly.
-021    */
-022   int mp_jacobi (mp_int * a, mp_int * n, int *c)
-023   \{
-024     mp_int  a1, p1;
-025     int     k, s, r, res;
-026     mp_digit residue;
-027
-028     /* if a < 0 return MP_VAL */
-029     if (mp_isneg(a) == MP_YES) \{
-030        return MP_VAL;
-031     \}
-032
-033     /* if n <= 0 return MP_VAL */
-034     if (mp_cmp_d(n, 0) != MP_GT) \{
-035        return MP_VAL;
-036     \}
-037
-038     /* step 1. handle case of a == 0 */
-039     if (mp_iszero (a) == MP_YES) \{
-040        /* special case of a == 0 and n == 1 */
-041        if (mp_cmp_d (n, 1) == MP_EQ) \{
-042          *c = 1;
-043        \} else \{
-044          *c = 0;
-045        \}
-046        return MP_OKAY;
-047     \}
-048
-049     /* step 2.  if a == 1, return 1 */
-050     if (mp_cmp_d (a, 1) == MP_EQ) \{
-051       *c = 1;
-052       return MP_OKAY;
-053     \}
-054
-055     /* default */
-056     s = 0;
-057
-058     /* step 3.  write a = a1 * 2**k  */
-059     if ((res = mp_init_copy (&a1, a)) != MP_OKAY) \{
-060       return res;
-061     \}
-062
-063     if ((res = mp_init (&p1)) != MP_OKAY) \{
-064       goto LBL_A1;
-065     \}
-066
-067     /* divide out larger power of two */
-068     k = mp_cnt_lsb(&a1);
-069     if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) \{
-070        goto LBL_P1;
-071     \}
-072
-073     /* step 4.  if e is even set s=1 */
-074     if ((k & 1) == 0) \{
-075       s = 1;
-076     \} else \{
-077       /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
-078       residue = n->dp[0] & 7;
-079
-080       if ((residue == 1) || (residue == 7)) \{
-081         s = 1;
-082       \} else if ((residue == 3) || (residue == 5)) \{
-083         s = -1;
-084       \}
-085     \}
-086
-087     /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-088     if ( ((n->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) \{
-089       s = -s;
-090     \}
-091
-092     /* if a1 == 1 we're done */
-093     if (mp_cmp_d (&a1, 1) == MP_EQ) \{
-094       *c = s;
-095     \} else \{
-096       /* n1 = n mod a1 */
-097       if ((res = mp_mod (n, &a1, &p1)) != MP_OKAY) \{
-098         goto LBL_P1;
-099       \}
-100       if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) \{
-101         goto LBL_P1;
-102       \}
-103       *c = s * r;
-104     \}
-105
-106     /* done */
-107     res = MP_OKAY;
-108   LBL_P1:mp_clear (&p1);
-109   LBL_A1:mp_clear (&a1);
-110     return res;
-111   \}
-112   #endif
-113
-\end{alltt}
-\end{small}
-
-As a matter of practicality the variable $a'$ as per the pseudo-code is represented by the variable $a1$ since the $'$ symbol is not valid for a C
-variable name character.
-
-The two simple cases of $a = 0$ and $a = 1$ are handled at the very beginning to simplify the algorithm.  If the input is non-trivial the algorithm
-has to proceed to compute the Jacobi symbol.  The variable $s$ is used to hold the current Jacobi product.  Note that $s$ is merely a C ``int'' data type since
-the values it may obtain are merely $-1$, $0$ and $1$.
-
-After a local copy of $a$ is made all of the factors of two are divided out and the total stored in $k$.  Technically only the least significant
-bit of $k$ is required, however, it makes the algorithm simpler to follow to perform an addition. In practice an exclusive-or and addition have the same
-processor requirements and neither is faster than the other.
-
-Lines 73 through 85 determine the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero then
-$k$ is even and the value is one.  Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight.  The value of
-$(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against $s$ on lines 87 through 90.
-
-Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$.
-
-\textit{-- Comment about default $s$ and such...}
-
-\section{Modular Inverse}
-\label{sec:modinv}
-The modular inverse of a number actually refers to the modular multiplicative inverse.  Essentially for any integer $a$ such that $(a, p) = 1$ there
-exists another integer $b$ such that $ab \equiv 1 \mbox{ (mod }p\mbox{)}$.  The integer $b$ is called the multiplicative inverse of $a$ which is
-denoted as $b = a^{-1}$.  Technically speaking modular inversion is a well defined operation for any finite ring or field not just for rings and
-fields of integers.  However, the former will be the matter of discussion.
-
-The simplest approach is to compute the algebraic inverse of the input.  That is to compute $b \equiv a^{\Phi(p) - 1}$.  If $\Phi(p)$ is the
-order of the multiplicative subgroup modulo $p$ then $b$ must be the multiplicative inverse of $a$.  The proof of which is trivial.
-
-\begin{equation}
-ab \equiv a \left (a^{\Phi(p) - 1} \right ) \equiv a^{\Phi(p)} \equiv a^0 \equiv 1 \mbox{ (mod }p\mbox{)}
-\end{equation}
-
-However, as simple as this approach may be it has two serious flaws.  It requires that the value of $\Phi(p)$ be known which if $p$ is composite
-requires all of the prime factors.  This approach also is very slow as the size of $p$ grows.
-
-A simpler approach is based on the observation that solving for the multiplicative inverse is equivalent to solving the linear
-Diophantine\footnote{See LeVeque \cite[pp. 40-43]{LeVeque} for more information.} equation.
-
-\begin{equation}
-ab + pq = 1
-\end{equation}
-
-Where $a$, $b$, $p$ and $q$ are all integers.  If such a pair of integers $ \left < b, q \right >$ exists then $b$ is the multiplicative inverse of
-$a$ modulo $p$.  The extended Euclidean algorithm (Knuth \cite[pp. 342]{TAOCPV2}) can be used to solve such equations provided $(a, p) = 1$.
-However, instead of using that algorithm directly a variant known as the binary Extended Euclidean algorithm will be used in its place.  The
-binary approach is very similar to the binary greatest common divisor algorithm except it will produce a full solution to the Diophantine
-equation.
-
-\subsection{General Case}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_invmod}. \\
-\textbf{Input}.   mp\_int $a$ and $b$, $(a, b) = 1$, $b \ge 2$, $0 < a < b$.  \\
-\textbf{Output}.  The modular inverse $c \equiv a^{-1} \mbox{ (mod }b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then return(\textit{MP\_VAL}). \\
-2.  If $b_0 \equiv 1 \mbox{ (mod }2\mbox{)}$ then use algorithm fast\_mp\_invmod. \\
-3.  $x \leftarrow \vert a \vert, y \leftarrow b$ \\
-4.  If $x_0 \equiv y_0  \equiv 0 \mbox{ (mod }2\mbox{)}$ then return(\textit{MP\_VAL}). \\
-5.  $u \leftarrow x, v \leftarrow y, A \leftarrow 1, B \leftarrow 0, C \leftarrow 0, D \leftarrow 1$ \\
-6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
-\hspace{3mm}6.2  If ($A.used > 0$ and $A_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($B.used > 0$ and $B_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
-\hspace{6mm}6.2.1  $A \leftarrow A + y$ \\
-\hspace{6mm}6.2.2  $B \leftarrow B - x$ \\
-\hspace{3mm}6.3  $A \leftarrow \lfloor A / 2 \rfloor$ \\
-\hspace{3mm}6.4  $B \leftarrow \lfloor B / 2 \rfloor$ \\
-7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
-\hspace{3mm}7.2  If ($C.used > 0$ and $C_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($D.used > 0$ and $D_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
-\hspace{6mm}7.2.1  $C \leftarrow C + y$ \\
-\hspace{6mm}7.2.2  $D \leftarrow D - x$ \\
-\hspace{3mm}7.3  $C \leftarrow \lfloor C / 2 \rfloor$ \\
-\hspace{3mm}7.4  $D \leftarrow \lfloor D / 2 \rfloor$ \\
-8.  If $u \ge v$ then \\
-\hspace{3mm}8.1  $u \leftarrow u - v$ \\
-\hspace{3mm}8.2  $A \leftarrow A - C$ \\
-\hspace{3mm}8.3  $B \leftarrow B - D$ \\
-9.  else \\
-\hspace{3mm}9.1  $v \leftarrow v - u$ \\
-\hspace{3mm}9.2  $C \leftarrow C - A$ \\
-\hspace{3mm}9.3  $D \leftarrow D - B$ \\
-10.  If $u \ne 0$ goto step 6. \\
-11.  If $v \ne 1$ return(\textit{MP\_VAL}). \\
-12.  While $C \le 0$ do \\
-\hspace{3mm}12.1  $C \leftarrow C + b$ \\
-13.  While $C \ge b$ do \\
-\hspace{3mm}13.1  $C \leftarrow C - b$ \\
-14.  $c \leftarrow C$ \\
-15.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\end{figure}
-\textbf{Algorithm mp\_invmod.}
-This algorithm computes the modular multiplicative inverse of an integer $a$ modulo an integer $b$.  This algorithm is a variation of the
-extended binary Euclidean algorithm from HAC \cite[pp. 608]{HAC}.  It has been modified to only compute the modular inverse and not a complete
-Diophantine solution.
-
-If $b \le 0$ then the modulus is invalid and MP\_VAL is returned.  Similarly if both $a$ and $b$ are even then there cannot be a multiplicative
-inverse for $a$ and the error is reported.
-
-The astute reader will observe that steps seven through nine are very similar to the binary greatest common divisor algorithm mp\_gcd.  In this case
-the other variables to the Diophantine equation are solved.  The algorithm terminates when $u = 0$ in which case the solution is
-
-\begin{equation}
-Ca + Db = v
-\end{equation}
-
-If $v$, the greatest common divisor of $a$ and $b$ is not equal to one then the algorithm will report an error as no inverse exists.  Otherwise, $C$
-is the modular inverse of $a$.  The actual value of $C$ is congruent to, but not necessarily equal to, the ideal modular inverse which should lie
-within $1 \le a^{-1} < b$.  Step numbers twelve and thirteen adjust the inverse until it is in range.  If the original input $a$ is within $0 < a < b$
-then only a couple of additions or subtractions will be required to adjust the inverse.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_invmod.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* hac 14.61, pp608 */
-018   int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-019   \{
-020     /* b cannot be negative */
-021     if ((b->sign == MP_NEG) || (mp_iszero(b) == MP_YES)) \{
-022       return MP_VAL;
-023     \}
-024
-025   #ifdef BN_FAST_MP_INVMOD_C
-026     /* if the modulus is odd we can use a faster routine instead */
-027     if (mp_isodd (b) == MP_YES) \{
-028       return fast_mp_invmod (a, b, c);
-029     \}
-030   #endif
-031
-032   #ifdef BN_MP_INVMOD_SLOW_C
-033     return mp_invmod_slow(a, b, c);
-034   #else
-035     return MP_VAL;
-036   #endif
-037   \}
-038   #endif
-039
-\end{alltt}
-\end{small}
-
-\subsubsection{Odd Moduli}
-
-When the modulus $b$ is odd the variables $A$ and $C$ are fixed and are not required to compute the inverse.  In particular by attempting to solve
-the Diophantine $Cb + Da = 1$ only $B$ and $D$ are required to find the inverse of $a$.
-
-The algorithm fast\_mp\_invmod is a direct adaptation of algorithm mp\_invmod with all steps involving either $A$ or $C$ removed.  This
-optimization will halve the time required to compute the modular inverse.
-
-\section{Primality Tests}
-
-A non-zero integer $a$ is said to be prime if it is not divisible by any other integer excluding one and itself.  For example, $a = 7$ is prime
-since the integers $2 \ldots 6$ do not evenly divide $a$.  By contrast, $a = 6$ is not prime since $a = 6 = 2 \cdot 3$.
-
-Prime numbers arise in cryptography considerably as they allow finite fields to be formed.  The ability to determine whether an integer is prime or
-not quickly has been a viable subject in cryptography and number theory for considerable time.  The algorithms that will be presented are all
-probabilistic algorithms in that when they report an integer is composite it must be composite.  However, when the algorithms report an integer is
-prime the algorithm may be incorrect.
-
-As will be discussed it is possible to limit the probability of error so well that for practical purposes the probability of error might as
-well be zero.  For the purposes of these discussions let $n$ represent the candidate integer of which the primality is in question.
-
-\subsection{Trial Division}
-
-Trial division means to attempt to evenly divide a candidate integer by small prime integers.  If the candidate can be evenly divided it obviously
-cannot be prime.  By dividing by all primes $1 < p \le \sqrt{n}$ this test can actually prove whether an integer is prime.  However, such a test
-would require a prohibitive amount of time as $n$ grows.
-
-Instead of dividing by every prime, a smaller, more manageable set of primes may be used instead.  By performing trial division with only a subset
-of the primes less than $\sqrt{n} + 1$ the algorithm cannot prove if a candidate is prime.  However, often it can prove a candidate is not prime.
-
-The benefit of this test is that trial division by small values is fairly efficient, especially compared to the other algorithms that will be
-discussed shortly.  The probability that this approach correctly identifies a composite candidate when tested with all primes up to $q$ is given by
-$1 - {1.12 \over \ln(q)}$.  The graph (\ref{pic:primality}, will be added later) demonstrates the probability of success for the range
-$3 \le q \le 100$.
-
-At approximately $q = 30$ the gain of performing further tests diminishes fairly quickly.  At $q = 90$ further testing is generally not going to
-be of any practical use.  In the case of LibTomMath the default limit $q = 256$ was chosen since it is not too high and will eliminate
-approximately $80\%$ of all candidate integers.  The constant \textbf{PRIME\_SIZE} is equal to the number of primes in the test base.  The
-array \_\_prime\_tab is an array of the first \textbf{PRIME\_SIZE} prime numbers.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_prime\_is\_divisible}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $c = 1$ if $n$ is divisible by a small prime, otherwise $c = 0$.  \\
-\hline \\
-1.  for $ix$ from $0$ to $PRIME\_SIZE - 1$ do \\
-\hspace{3mm}1.1  $d \leftarrow n \mbox{ (mod }\_\_prime\_tab_{ix}\mbox{)}$ \\
-\hspace{3mm}1.2  If $d = 0$ then \\
-\hspace{6mm}1.2.1  $c \leftarrow 1$ \\
-\hspace{6mm}1.2.2  Return(\textit{MP\_OKAY}). \\
-2.  $c \leftarrow 0$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_prime\_is\_divisible}
-\end{figure}
-\textbf{Algorithm mp\_prime\_is\_divisible.}
-This algorithm attempts to determine if a candidate integer $n$ is composite by performing trial divisions.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_is\_divisible.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* determines if an integers is divisible by one
-018    * of the first PRIME_SIZE primes or not
-019    *
-020    * sets result to 0 if not, 1 if yes
-021    */
-022   int mp_prime_is_divisible (mp_int * a, int *result)
-023   \{
-024     int     err, ix;
-025     mp_digit res;
-026
-027     /* default to not */
-028     *result = MP_NO;
-029
-030     for (ix = 0; ix < PRIME_SIZE; ix++) \{
-031       /* what is a mod LBL_prime_tab[ix] */
-032       if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) \{
-033         return err;
-034       \}
-035
-036       /* is the residue zero? */
-037       if (res == 0) \{
-038         *result = MP_YES;
-039         return MP_OKAY;
-040       \}
-041     \}
-042
-043     return MP_OKAY;
-044   \}
-045   #endif
-046
-\end{alltt}
-\end{small}
-
-The algorithm defaults to a return of $0$ in case an error occurs.  The values in the prime table are all specified to be in the range of a
-mp\_digit.  The table \_\_prime\_tab is defined in the following file.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_prime\_tab.c
-\vspace{-3mm}
-\begin{alltt}
-016   const mp_digit ltm_prime_tab[] = \{
-017     0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
-018     0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
-019     0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-020     0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
-021   #ifndef MP_8BIT
-022     0x0083,
-023     0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
-024     0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
-025     0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
-026     0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
-027
-028     0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
-029     0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
-030     0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
-031     0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
-032     0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
-033     0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
-034     0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
-035     0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
-036
-037     0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
-038     0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
-039     0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
-040     0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
-041     0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
-042     0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
-043     0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
-044     0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
-045
-046     0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
-047     0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
-048     0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
-049     0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
-050     0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
-051     0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
-052     0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
-053     0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
-054   #endif
-055   \};
-056   #endif
-057
-\end{alltt}
-\end{small}
-
-Note that there are two possible tables.  When an mp\_digit is 7-bits long only the primes up to $127$ may be included, otherwise the primes
-up to $1619$ are used.  Note that the value of \textbf{PRIME\_SIZE} is a constant dependent on the size of a mp\_digit.
-
-\subsection{The Fermat Test}
-The Fermat test is probably one of the oldest tests to have a non-trivial probability of success.  It is based on the fact that if $n$ is in
-fact prime then $a^{n} \equiv a \mbox{ (mod }n\mbox{)}$ for all $0 < a < n$.  The reason being that if $n$ is prime then the order of
-the multiplicative sub group is $n - 1$.  Any base $a$ must have an order which divides $n - 1$ and as such $a^n$ is equivalent to
-$a^1 = a$.
-
-If $n$ is composite then any given base $a$ does not have to have a period which divides $n - 1$.  In which case
-it is possible that $a^n \nequiv a \mbox{ (mod }n\mbox{)}$.  However, this test is not absolute as it is possible that the order
-of a base will divide $n - 1$ which would then be reported as prime.  Such a base yields what is known as a Fermat pseudo-prime.  Several
-integers known as Carmichael numbers will be a pseudo-prime to all valid bases.  Fortunately such numbers are extremely rare as $n$ grows
-in size.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_prime\_fermat}. \\
-\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
-\textbf{Output}.  $c = 1$ if $b^a \equiv b \mbox{ (mod }a\mbox{)}$, otherwise $c = 0$.  \\
-\hline \\
-1.  $t \leftarrow b^a \mbox{ (mod }a\mbox{)}$ \\
-2.  If $t = b$ then \\
-\hspace{3mm}2.1  $c = 1$ \\
-3.  else \\
-\hspace{3mm}3.1  $c = 0$ \\
-4.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_prime\_fermat}
-\end{figure}
-\textbf{Algorithm mp\_prime\_fermat.}
-This algorithm determines whether an mp\_int $a$ is a Fermat prime to the base $b$ or not.  It uses a single modular exponentiation to
-determine the result.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_fermat.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* performs one Fermat test.
-018    *
-019    * If "a" were prime then b**a == b (mod a) since the order of
-020    * the multiplicative sub-group would be phi(a) = a-1.  That means
-021    * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
-022    *
-023    * Sets result to 1 if the congruence holds, or zero otherwise.
-024    */
-025   int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
-026   \{
-027     mp_int  t;
-028     int     err;
-029
-030     /* default to composite  */
-031     *result = MP_NO;
-032
-033     /* ensure b > 1 */
-034     if (mp_cmp_d(b, 1) != MP_GT) \{
-035        return MP_VAL;
-036     \}
-037
-038     /* init t */
-039     if ((err = mp_init (&t)) != MP_OKAY) \{
-040       return err;
-041     \}
-042
-043     /* compute t = b**a mod a */
-044     if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) \{
-045       goto LBL_T;
-046     \}
-047
-048     /* is it equal to b? */
-049     if (mp_cmp (&t, b) == MP_EQ) \{
-050       *result = MP_YES;
-051     \}
-052
-053     err = MP_OKAY;
-054   LBL_T:mp_clear (&t);
-055     return err;
-056   \}
-057   #endif
-058
-\end{alltt}
-\end{small}
-
-\subsection{The Miller-Rabin Test}
-The Miller-Rabin (citation) test is another primality test which has tighter error bounds than the Fermat test specifically with sequentially chosen
-candidate  integers.  The algorithm is based on the observation that if $n - 1 = 2^kr$ and if $b^r \nequiv \pm 1$ then after up to $k - 1$ squarings the
-value must be equal to $-1$.  The squarings are stopped as soon as $-1$ is observed.  If the value of $1$ is observed first it means that
-some value not congruent to $\pm 1$ when squared equals one which cannot occur if $n$ is prime.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_prime\_miller\_rabin}. \\
-\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
-\textbf{Output}.  $c = 1$ if $a$ is a Miller-Rabin prime to the base $b$, otherwise $c = 0$.  \\
-\hline
-1.  $a' \leftarrow a - 1$ \\
-2.  $r  \leftarrow a'$    \\
-3.  $c \leftarrow 0, s  \leftarrow 0$ \\
-4.  While $r.used > 0$ and $r_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
-\hspace{3mm}4.1  $s \leftarrow s + 1$ \\
-\hspace{3mm}4.2  $r \leftarrow \lfloor r / 2 \rfloor$ \\
-5.  $y \leftarrow b^r \mbox{ (mod }a\mbox{)}$ \\
-6.  If $y \nequiv \pm 1$ then \\
-\hspace{3mm}6.1  $j \leftarrow 1$ \\
-\hspace{3mm}6.2  While $j \le (s - 1)$ and $y \nequiv a'$ \\
-\hspace{6mm}6.2.1  $y \leftarrow y^2 \mbox{ (mod }a\mbox{)}$ \\
-\hspace{6mm}6.2.2  If $y = 1$ then goto step 8. \\
-\hspace{6mm}6.2.3  $j \leftarrow j + 1$ \\
-\hspace{3mm}6.3  If $y \nequiv a'$ goto step 8. \\
-7.  $c \leftarrow 1$\\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_prime\_miller\_rabin}
-\end{figure}
-\textbf{Algorithm mp\_prime\_miller\_rabin.}
-This algorithm performs one trial round of the Miller-Rabin algorithm to the base $b$.  It will set $c = 1$ if the algorithm cannot determine
-if $a$ is composite or $c = 0$ if $a$ is provably composite.  The values of $s$ and $r$ are computed such that $a' = a - 1 = 2^sr$.
-
-If the value $y \equiv b^r$ is congruent to $\pm 1$ then the algorithm cannot prove if $a$ is composite or not.  Otherwise, the algorithm will
-square $y$ up to $s - 1$ times stopping only when $y \equiv -1$.  If $y^2 \equiv 1$ and $y \nequiv \pm 1$ then the algorithm can report that $a$
-is provably composite.  If the algorithm performs $s - 1$ squarings and $y \nequiv -1$ then $a$ is provably composite.  If $a$ is not provably
-composite then it is \textit{probably} prime.
-
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_miller\_rabin.c
-\vspace{-3mm}
-\begin{alltt}
-016
-017   /* Miller-Rabin test of "a" to the base of "b" as described in
-018    * HAC pp. 139 Algorithm 4.24
-019    *
-020    * Sets result to 0 if definitely composite or 1 if probably prime.
-021    * Randomly the chance of error is no more than 1/4 and often
-022    * very much lower.
-023    */
-024   int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
-025   \{
-026     mp_int  n1, y, r;
-027     int     s, j, err;
-028
-029     /* default */
-030     *result = MP_NO;
-031
-032     /* ensure b > 1 */
-033     if (mp_cmp_d(b, 1) != MP_GT) \{
-034        return MP_VAL;
-035     \}
-036
-037     /* get n1 = a - 1 */
-038     if ((err = mp_init_copy (&n1, a)) != MP_OKAY) \{
-039       return err;
-040     \}
-041     if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) \{
-042       goto LBL_N1;
-043     \}
-044
-045     /* set 2**s * r = n1 */
-046     if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) \{
-047       goto LBL_N1;
-048     \}
-049
-050     /* count the number of least significant bits
-051      * which are zero
-052      */
-053     s = mp_cnt_lsb(&r);
-054
-055     /* now divide n - 1 by 2**s */
-056     if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) \{
-057       goto LBL_R;
-058     \}
-059
-060     /* compute y = b**r mod a */
-061     if ((err = mp_init (&y)) != MP_OKAY) \{
-062       goto LBL_R;
-063     \}
-064     if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) \{
-065       goto LBL_Y;
-066     \}
-067
-068     /* if y != 1 and y != n1 do */
-069     if ((mp_cmp_d (&y, 1) != MP_EQ) && (mp_cmp (&y, &n1) != MP_EQ)) \{
-070       j = 1;
-071       /* while j <= s-1 and y != n1 */
-072       while ((j <= (s - 1)) && (mp_cmp (&y, &n1) != MP_EQ)) \{
-073         if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) \{
-074            goto LBL_Y;
-075         \}
-076
-077         /* if y == 1 then composite */
-078         if (mp_cmp_d (&y, 1) == MP_EQ) \{
-079            goto LBL_Y;
-080         \}
-081
-082         ++j;
-083       \}
-084
-085       /* if y != n1 then composite */
-086       if (mp_cmp (&y, &n1) != MP_EQ) \{
-087         goto LBL_Y;
-088       \}
-089     \}
-090
-091     /* probably prime now */
-092     *result = MP_YES;
-093   LBL_Y:mp_clear (&y);
-094   LBL_R:mp_clear (&r);
-095   LBL_N1:mp_clear (&n1);
-096     return err;
-097   \}
-098   #endif
-099
-\end{alltt}
-\end{small}
-
-
-
-
-\backmatter
-\appendix
-\begin{thebibliography}{ABCDEF}
-\bibitem[1]{TAOCPV2}
-Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
-
-\bibitem[2]{HAC}
-A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
-
-\bibitem[3]{ROSE}
-Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
-
-\bibitem[4]{COMBA}
-Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
-
-\bibitem[5]{KARA}
-A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294
-
-\bibitem[6]{KARAP}
-Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
-
-\bibitem[7]{BARRETT}
-Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
-
-\bibitem[8]{MONT}
-P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985.
-
-\bibitem[9]{DRMET}
-Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
-
-\bibitem[10]{MMB}
-J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
-
-\bibitem[11]{RSAREF}
-R.L. Rivest, A. Shamir, L. Adleman, \textit{A Method for Obtaining Digital Signatures and Public-Key Cryptosystems}
-
-\bibitem[12]{DHREF}
-Whitfield Diffie, Martin E. Hellman, \textit{New Directions in Cryptography}, IEEE Transactions on Information Theory, 1976
-
-\bibitem[13]{IEEE}
-IEEE Standard for Binary Floating-Point Arithmetic (ANSI/IEEE Std 754-1985)
-
-\bibitem[14]{GMP}
-GNU Multiple Precision (GMP), \url{http://www.swox.com/gmp/}
-
-\bibitem[15]{MPI}
-Multiple Precision Integer Library (MPI), Michael Fromberger, \url{http://thayer.dartmouth.edu/~sting/mpi/}
-
-\bibitem[16]{OPENSSL}
-OpenSSL Cryptographic Toolkit, \url{http://openssl.org}
-
-\bibitem[17]{LIP}
-Large Integer Package, \url{http://home.hetnet.nl/~ecstr/LIP.zip}
-
-\bibitem[18]{ISOC}
-JTC1/SC22/WG14, ISO/IEC 9899:1999, ``A draft rationale for the C99 standard.''
-
-\bibitem[19]{JAVA}
-The Sun Java Website, \url{http://java.sun.com/}
-
-\end{thebibliography}
-
-\input{tommath.ind}
-
-\end{document}
diff --git a/libtommath/tommath_private.h b/libtommath/tommath_private.h
index f054fed..d23c333 100644
--- a/libtommath/tommath_private.h
+++ b/libtommath/tommath_private.h
@@ -18,11 +18,13 @@
 #include <tommath.h>
 #include <ctype.h>
 
-#if 0
-
+#ifndef MIN
 #define MIN(x,y) (((x) < (y)) ? (x) : (y))
+#endif
 
+#ifndef MAX
 #define MAX(x,y) (((x) > (y)) ? (x) : (y))
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -37,8 +39,8 @@ extern "C" {
 
 #endif
 
-
 /* define heap macros */
+#if 0
 #ifndef XMALLOC
    /* default to libc stuff */
    #define XMALLOC  malloc
@@ -52,6 +54,7 @@ extern "C" {
    extern void *XCALLOC(size_t n, size_t s);
    extern void XFREE(void *p);
 #endif
+#endif
 
 /* lowlevel functions, do not call! */
 int s_mp_add(mp_int *a, mp_int *b, mp_int *c);
@@ -109,7 +112,6 @@ int func_name (mp_int * a, type b)                       \
   mp_clamp (a);                                          \
   return MP_OKAY;                                        \
 }
-#endif
 
 #ifdef __cplusplus
    }
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2016-11-17 10:46:09 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2016-11-17 10:46:09 (GMT)
commit	c011864b3411bd607efb52ffd86bb9b91e8e1bf3 (patch)
tree	5ae27db53133eec3cea2523f3df4c28df354b71e /libtommath
parent	68111aa5bf7fc228dcfda8beb9de265734925b56 (diff)
parent	3dd86e6ebc0137c3a2c02d3a046de046571e3789 (diff)
download	tcl-c011864b3411bd607efb52ffd86bb9b91e8e1bf3.zip tcl-c011864b3411bd607efb52ffd86bb9b91e8e1bf3.tar.gz tcl-c011864b3411bd607efb52ffd86bb9b91e8e1bf3.tar.bz2