592 lines
17 KiB
C
592 lines
17 KiB
C
|
||
/***************************************************************************
|
||
*
|
||
Copyright 2013 CertiVox UK Ltd. *
|
||
*
|
||
This file is part of CertiVox MIRACL Crypto SDK. *
|
||
*
|
||
The CertiVox MIRACL Crypto SDK provides developers with an *
|
||
extensive and efficient set of cryptographic functions. *
|
||
For further information about its features and functionalities please *
|
||
refer to http://www.certivox.com *
|
||
*
|
||
* The CertiVox MIRACL Crypto SDK is free software: you can *
|
||
redistribute it and/or modify it under the terms of the *
|
||
GNU Affero General Public License as published by the *
|
||
Free Software Foundation, either version 3 of the License, *
|
||
or (at your option) any later version. *
|
||
*
|
||
* The CertiVox MIRACL Crypto SDK is distributed in the hope *
|
||
that it will be useful, but WITHOUT ANY WARRANTY; without even the *
|
||
implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
|
||
See the GNU Affero General Public License for more details. *
|
||
*
|
||
* You should have received a copy of the GNU Affero General Public *
|
||
License along with CertiVox MIRACL Crypto SDK. *
|
||
If not, see <http://www.gnu.org/licenses/>. *
|
||
*
|
||
You can be released from the requirements of the license by purchasing *
|
||
a commercial license. Buying such a license is mandatory as soon as you *
|
||
develop commercial activities involving the CertiVox MIRACL Crypto SDK *
|
||
without disclosing the source code of your own applications, or shipping *
|
||
the CertiVox MIRACL Crypto SDK with a closed source product. *
|
||
*
|
||
***************************************************************************/
|
||
/*
|
||
* Many processors support a floating-point coprocessor, which may
|
||
* implement a faster multiplication instruction than the corresponding
|
||
* integer instruction. This is the case for the Pentium processor
|
||
* which has a built-in co-processor. This can be exploited to give even
|
||
* faster performance.
|
||
*
|
||
* Note that since the partial products are accumulated in a 64-bit register
|
||
* this implies that a full-width number base (2^32) cannot be used.
|
||
* The maximum number base that can be used is 2^x where x is
|
||
* calculated such that 2^(64-2*x) > 2*WORDS_IN_MODULUS. This means that
|
||
* x will usually be 28 or 29
|
||
*
|
||
* To use this code:-
|
||
*
|
||
* (1) Implemented and tested only for the Pentium processor and
|
||
* using the Borland C compiler (BCC and BCC32)
|
||
*
|
||
* (2) Define MR_PENTIUM in mirdef.h. Determine the maximum modulus to be
|
||
* used, and from that determine the value of WORDS_IN_MODULUS.
|
||
*
|
||
* (3) Use as a number base the value of x calculated as shown above.
|
||
* For example, for 512 bit exponentiation, WORDS_IN_MODULUS will be 18
|
||
* so call mirsys(50,536870912L) in your main program.
|
||
* (Observe that 536870912 = 2^29, and that 18*29 = 522, big enough
|
||
* for 512 bit calculations).
|
||
*
|
||
* (4) Use Montgomery representation when implementing your crypto-system
|
||
* e.g. use monty_powmod(). This will automatically call the
|
||
* routines in this module.
|
||
*
|
||
* Note that it is *VITAL* that double arrays be aligned on 8-byte
|
||
* boundaries for the Pentium. The Borland C compiler does *not* do this
|
||
* automatically!!!!
|
||
*
|
||
* Many thanks are due to Paul Rubin, who suggested to me that this approach
|
||
* might be faster than the all-integer methods described elsewhere.
|
||
*
|
||
* Further speed increases can be achieved by loop-unrolling. Completely
|
||
* unrolled code (a la Comba) has been experimented with, and gives a
|
||
* 25% speed-up in some cases. Note that the basic code for a single partial
|
||
* product takes only 3 cycles.
|
||
*
|
||
* fld ... ;1 cycle
|
||
* fmul ... ;1 cycle
|
||
* fxch st(2) ;0 cycle
|
||
* fadd ;1 cycle
|
||
*
|
||
* Compare this with the integer "mul" instruction which takes 10 cycles
|
||
* on a Pentium
|
||
*
|
||
* Interestingly the fmul is faster than the fimul. So paradoxically it is
|
||
* quicker to manipulate 64-bit doubles than it is to manipulate 32-bit
|
||
* integers. Clearly the Pentium FP processor has been optimised for real
|
||
* arithmetic. However this requires us to convert all bigs from integer
|
||
* arrays to double arrays (see mrmonty.c) which is very wasteful of space
|
||
* and rather awkward.
|
||
*
|
||
*
|
||
* The FP stack is primed in prepare_monty() :-
|
||
* magic - (2^63+2^62)*base. By adding and then subtracting this number we
|
||
* get the top half of the sum.
|
||
* 1/base - Inverse of the number base
|
||
* ndash - Montgomery's constant
|
||
*
|
||
*/
|
||
|
||
#include "miracl.h"
|
||
|
||
#ifdef MR_PENTIUM
|
||
/* Byte stride between consecutive words: every word is an 8-byte double. */
#define N 8

/* Operand-size override used on every FP memory operand (fld/fmul/fst). */
#define POINTER QWORD PTR

/*
 * Register aliases used by the inline assembly below, selected by the
 * INLINE_ASM setting: values 1 and 2 map to the 16-bit registers, value 3
 * to the 32-bit registers.  For any other value no alias is defined here.
 */
#if INLINE_ASM == 1 || INLINE_ASM == 2
#define PAX ax
#define PBP bp
#define PBX bx
#define PSI si
#define PDI di
#define PCX cx
#define PDX dx
#elif INLINE_ASM == 3
#define PAX eax
#define PBP ebp
#define PBX ebx
#define PSI esi
#define PDI edi
#define PCX ecx
#define PDX edx
#endif
|
||
|
||
#ifdef INLINE_ASM
|
||
#ifndef MR_LMM
|
||
/* not implemented for large memory model 16 bit */
|
||
|
||
/*
 * fastmodmult - Montgomery modular multiplication using the x87 FPU.
 *
 * Accumulates the product x*y column by column into the work variable
 * mr_mip->w0, folding in multiples of mr_mip->modulus as it goes, and then
 * copies the upper rn words of w0 into z (rn = word length of the modulus).
 *
 *   x, y  operands; rn words are read from each
 *   z     result; receives rn words, its length is set to rn
 *
 * Requirements visible from the code and the file header:
 *   - every word is an 8-byte double (accessed as QWORD PTR, stride N);
 *   - the FP stack must already hold the three constants described in the
 *     file header (primed in prepare_monty()).  As used below, with only the
 *     running sum above them, they sit at st(1)=magic, st(2)=1/base and
 *     st(3)=ndash;
 *   - operand-less fadd/fsub are the popping forms, so the FP stack depth
 *     is the same at the top of every column.
 *
 * NOTE(review): rn is taken from modulus->len without masking, so the
 * modulus is assumed to carry no sign bit; the second phase also assumes
 * rn >= 2 - confirm against callers.
 *
 * Register roles inside the assembly:
 *   PBX -> x      PSI -> y (swapped with PDX while the modulus is in use)
 *   PDX -> modulus   PDI -> w0   PAX = N*rn   PBP = N   PCX = column offset
 */
void fastmodmult(_MIPD_ big x,big y,big z)
{
    int ij,rn,nrn;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    big modulus=mr_mip->modulus;
    big w0=mr_mip->w0;
    mr_small *wg,*mg,*xg,*yg;
    wg=w0->w;
    mg=modulus->w;
    xg=x->w;
    yg=y->w;
    rn=(int)modulus->len;
    /* clear any words of w0 above the 2*rn that are about to be written */
    for (ij=2*rn;ij<(int)(w0->len&MR_OBITS);ij++) w0->w[ij]=0.0;
    w0->len=2*rn;
    nrn=N*rn;                    /* operand length in bytes */

    /* registers overwritten below that must be preserved for the caller */
    ASM push PBP
    ASM push PDI
    ASM push PSI

    ASM mov PBX,xg
    ASM mov PSI,yg
    ASM mov PDX,mg
    ASM mov PDI,wg
    ASM mov PAX,nrn

    ASM mov PBP,N                /* PBP = step between words */

    ASM fldz                     /* running sum = 0 */

    ASM xor PCX,PCX              /* column offset = 0 */

/* ---- Phase 1: columns 0 .. rn-1 (PCX = N*column) ---- */
  m1:
    /* sum += x[0]*y[i] + x[1]*y[i-1] + ... + x[i]*y[0] */
    ASM push PCX
    ASM add PSI,PCX
    ASM fld POINTER [PBX]
    ASM fmul POINTER [PSI]
    ASM add PBX,PBP
    ASM sub PSI,PBP
    ASM test PCX,PCX
    ASM jz m3
  m2:
    ASM fld POINTER [PBX]
    ASM fmul POINTER [PSI]
    ASM fxch st(2)
    ASM add PBX,PBP
    ASM sub PSI,PBP
    ASM fadd
    ASM sub PCX,PBP
    ASM jnz m2
  m3:
    ASM sub PBX,PBP
    ASM add PSI,PBP
    ASM fadd                     /* fold in the last pending product */
    ASM pop PCX
    ASM sub PBX,PCX              /* restore PBX */

    /* sum += w[0]*m[i] + ... + w[i-1]*m[1]  (skipped for column 0) */
    ASM xchg PSI,PDX             /* PSI -> modulus */
    ASM push PCX
    ASM test PCX,PCX
    ASM jz m6
    ASM add PSI,PCX
    ASM fld POINTER [PDI]
    ASM fmul POINTER [PSI]
    ASM add PDI,PBP
    ASM sub PSI,PBP
    ASM sub PCX,PBP
    ASM jz m5
  m4:                            /* this is typical of the critical inner loop */
    ASM fld POINTER [PDI]        /* 1 cycle */
    ASM fmul POINTER [PSI]       /* 1 cycle */
    ASM fxch st(2)               /* 0 cycle */
    ASM add PDI,PBP              /* 1 cycle */
    ASM sub PSI,PBP              /* 0 cycle */
    ASM fadd                     /* 1 cycle */
    ASM sub PCX,PBP              /* 1 cycle */
    ASM jnz m4                   /* 0 cycle */
                                 /* total = 5 cycles */
  m5:
    ASM fadd
  m6:
    /*
     * Here PDI -> w[i] and PSI -> m[0].
     * Adding then subtracting magic leaves the part of the sum above the
     * low digit; subtracting that from the sum leaves the low digit.
     */
    ASM fld st(0)
    ASM fadd st,st(2)
    ASM fsub st,st(2)
    ASM fsubr st,st(1)           /* low digit of sum */
    ASM fmul st,st(4)            /* times ndash */
    ASM fld st(0)
    ASM fadd st,st(3)
    ASM fsub st,st(3)
    ASM fsub                     /* keep only the low digit of that product */
    ASM fst POINTER [PDI]        /* w[i] = that digit */
    ASM fmul POINTER [PSI]       /* times m[0] */
    ASM fadd                     /* added into the sum */
    ASM fmul st,st(2)            /* sum * (1/base): carry into next column */

    ASM xchg PSI,PDX
    ASM pop PCX
    ASM sub PDI,PCX              /* restore PDI */

    ASM add PCX,PBP              /* increment PCX */
    ASM cmp PCX,PAX
    ASM jl m1

/* ---- Phase 2: columns rn .. 2*rn-2, counting PCX back down ---- */
    ASM sub PCX,PBP              /* PCX = N*(rn-1): offset of the last word */
    ASM add PSI,PCX              /* PSI -> y[rn-1] */
    ASM add PBX,PCX              /* PBX -> x[rn-1] */
    ASM add PDX,PCX              /* PDX -> m[rn-1] */
    ASM add PDI,PCX              /* PDI -> w[rn-1] */
    ASM sub PCX,PBP              /* going back down again: PCX = N*(rn-2) */
  m7:
    /* sum += products of x (ascending) with y (descending from y[rn-1]) */
    ASM push PCX
    ASM sub PBX,PCX
    ASM fld POINTER [PBX]
    ASM fmul POINTER [PSI]
    ASM add PBX,PBP
    ASM sub PSI,PBP
    ASM test PCX,PCX
    ASM jz m9
  m8:
    ASM fld POINTER [PBX]
    ASM fmul POINTER [PSI]
    ASM fxch st(2)
    ASM add PBX,PBP
    ASM sub PSI,PBP
    ASM fadd
    ASM sub PCX,PBP
    ASM jnz m8
  m9:
    ASM sub PBX,PBP
    ASM add PSI,PBP
    ASM fadd
    ASM pop PCX
    ASM add PSI,PCX              /* restore PSI */

    /* sum += products of w (ascending) with m (descending from m[rn-1]) */
    ASM sub PDI,PCX
    ASM xchg PSI,PDX             /* PSI -> modulus */
    ASM push PCX
    ASM fld POINTER [PDI]
    ASM fmul POINTER [PSI]
    ASM add PDI,PBP
    ASM sub PSI,PBP
    ASM test PCX,PCX
    ASM jz m11
  m10:
    ASM fld POINTER [PDI]
    ASM fmul POINTER [PSI]
    ASM fxch st(2)
    ASM add PDI,PBP
    ASM sub PSI,PBP
    ASM fadd
    ASM sub PCX,PBP
    ASM jnz m10
  m11:
    ASM sub PDI,PBP
    ASM add PSI,PBP
    ASM fadd
    ASM pop PCX
    ASM add PSI,PCX              /* restore PSI */
    ASM xchg PSI,PDX

    /* PDI -> output word for this column: w[rn-1] + N*rn - PCX - N */
    ASM push PDI
    ASM add PDI,PAX
    ASM sub PDI,PCX
    ASM sub PDI,PBP

    /* split sum into low digit (stored) and carry (becomes the new sum);
       st(5) is used as scratch for the high part */
    ASM fld st(0)
    ASM fadd st,st(2)
    ASM fsub st,st(2)
    ASM fst st(5)
    ASM fmul st,st(3)            /* high part * (1/base) = carry */
    ASM fxch st(5)
    ASM fsub                     /* sum - high part = low digit */
    ASM fstp POINTER [PDI]
    ASM fld st(3)                /* reload carry as the running sum */

    ASM pop PDI

    ASM sub PCX,PBP
    ASM jge m7

/* ---- Final column: store the last digit and the remaining carry ---- */
    ASM add PDI,PAX

    ASM fld st(0)
    ASM fadd st,st(2)
    ASM fsub st,st(2)
    ASM fst st(5)
    ASM fmul st,st(3)
    ASM fxch st(5)
    ASM fsub
    ASM fstp POINTER [PDI]
    ASM fld st(3)

    ASM add PDI,PBP
    ASM fstp POINTER [PDI]       /* carry goes to the next word up */

    ASM pop PSI
    ASM pop PDI
    ASM pop PBP

    /* clear words of z above rn, then copy the upper half of w0 into z */
    for (ij=rn;ij<(int)(z->len&MR_OBITS);ij++) z->w[ij]=0.0;
    z->len=rn;
    for (ij=0;ij<rn;ij++) z->w[ij]=w0->w[ij+rn];
    /* top word zero: let mr_lzero() adjust z (presumably trims its length) */
    if (z->w[rn-1]==0.0) mr_lzero(z);

}
|
||
|
||
void fastmodsquare(_MIPD_ big x,big z)
|
||
{
|
||
int ij,rn,nrn;
|
||
#ifdef MR_OS_THREADS
|
||
miracl *mr_mip=get_mip();
|
||
#endif
|
||
big modulus=mr_mip->modulus;
|
||
big w0=mr_mip->w0;
|
||
mr_small *wg,*mg,*xg;
|
||
wg=w0->w;
|
||
mg=modulus->w;
|
||
xg=x->w;
|
||
|
||
rn=(int)modulus->len;
|
||
for (ij=2*rn;ij<(int)(w0->len&MR_OBITS);ij++) w0->w[ij]=0.0;
|
||
w0->len=2*rn;
|
||
nrn=N*rn;
|
||
|
||
|
||
ASM push PBP
|
||
ASM push PDI
|
||
ASM push PSI
|
||
|
||
ASM mov PBX,xg
|
||
ASM mov PSI,xg
|
||
ASM mov PDX,mg
|
||
ASM mov PDI,wg
|
||
ASM mov PAX,nrn
|
||
|
||
ASM mov PBP,N
|
||
|
||
ASM fldz
|
||
|
||
ASM xor PCX,PCX
|
||
s1:
|
||
ASM push PBX
|
||
ASM push PSI
|
||
ASM test PCX,PCX
|
||
ASM jz s4
|
||
ASM add PSI,PCX
|
||
ASM fstp st(5)
|
||
ASM fldz
|
||
ASM fld POINTER [PBX]
|
||
ASM fmul POINTER [PSI]
|
||
ASM sub PSI,PBP
|
||
ASM add PBX,PBP
|
||
ASM cmp PSI,PBX
|
||
ASM jle s3
|
||
s2:
|
||
ASM fld POINTER [PBX]
|
||
ASM fmul POINTER [PSI]
|
||
ASM fxch st(2)
|
||
ASM sub PSI,PBP
|
||
ASM add PBX,PBP
|
||
ASM fadd
|
||
ASM cmp PSI,PBX
|
||
ASM jg s2
|
||
s3:
|
||
ASM fadd
|
||
ASM fld st(0)
|
||
ASM fadd
|
||
ASM fadd st,st(5)
|
||
s4:
|
||
ASM cmp PSI,PBX
|
||
ASM jne s5
|
||
ASM fld POINTER [PBX]
|
||
ASM fmul st,st(0)
|
||
ASM fadd
|
||
s5:
|
||
ASM pop PSI
|
||
ASM pop PBX /* restore pointers */
|
||
|
||
ASM xchg PSI,PDX /* PSI -> modulus */
|
||
ASM push PCX
|
||
ASM test PCX,PCX
|
||
ASM jz s8
|
||
ASM add PSI,PCX
|
||
ASM fld POINTER [PDI]
|
||
ASM fmul POINTER [PSI]
|
||
ASM add PDI,PBP
|
||
ASM sub PSI,PBP
|
||
ASM sub PCX,PBP
|
||
ASM jz s7
|
||
s6:
|
||
ASM fld POINTER [PDI]
|
||
ASM fmul POINTER [PSI]
|
||
ASM fxch st(2)
|
||
ASM add PDI,PBP
|
||
ASM sub PSI,PBP
|
||
ASM fadd
|
||
ASM sub PCX,PBP
|
||
ASM jnz s6
|
||
s7:
|
||
ASM fadd
|
||
s8:
|
||
|
||
ASM fld st(0)
|
||
ASM fadd st,st(2)
|
||
ASM fsub st,st(2)
|
||
ASM fsubr st,st(1)
|
||
ASM fmul st,st(4)
|
||
ASM fld st(0)
|
||
ASM fadd st,st(3)
|
||
ASM fsub st,st(3)
|
||
ASM fsub
|
||
ASM fst POINTER [PDI]
|
||
ASM fmul POINTER [PSI]
|
||
ASM fadd
|
||
ASM fmul st,st(2)
|
||
|
||
ASM xchg PSI,PDX
|
||
ASM pop PCX
|
||
ASM sub PDI,PCX /* restore PDI */
|
||
|
||
ASM add PCX,PBP /* increment PCX */
|
||
ASM cmp PCX,PAX
|
||
ASM jl s1
|
||
|
||
ASM sub PCX,PBP /* PCX=12 */
|
||
ASM add PSI,PCX
|
||
ASM add PBX,PCX /* PBX -> x[4] */
|
||
ASM add PDX,PCX
|
||
ASM add PDI,PCX
|
||
ASM sub PCX,PBP /* going back down again PCX=8 */
|
||
s9:
|
||
ASM push PBX
|
||
ASM push PSI
|
||
ASM test PCX,PCX
|
||
ASM jz s13
|
||
s10:
|
||
ASM sub PBX,PCX
|
||
ASM fstp st(5)
|
||
ASM fldz
|
||
ASM fld POINTER [PBX]
|
||
ASM fmul POINTER [PSI]
|
||
ASM sub PSI,PBP
|
||
ASM add PBX,PBP
|
||
ASM cmp PSI,PBX
|
||
ASM jle s12
|
||
s11:
|
||
ASM fld POINTER [PBX]
|
||
ASM fmul POINTER [PSI]
|
||
ASM fxch st(2)
|
||
ASM sub PSI,PBP
|
||
ASM add PBX,PBP
|
||
ASM fadd
|
||
ASM cmp PSI,PBX
|
||
ASM jg s11
|
||
s12:
|
||
ASM fadd
|
||
ASM fld st(0)
|
||
ASM fadd
|
||
ASM fadd st,st(5)
|
||
s13:
|
||
ASM cmp PSI,PBX
|
||
ASM jne s14
|
||
ASM fld POINTER [PBX]
|
||
ASM fmul st,st(0)
|
||
ASM fadd
|
||
s14:
|
||
ASM pop PSI
|
||
ASM pop PBX
|
||
|
||
ASM sub PDI,PCX
|
||
ASM xchg PSI,PDX /* PSI -> modulus */
|
||
ASM push PCX
|
||
ASM fld POINTER [PDI]
|
||
ASM fmul POINTER [PSI]
|
||
ASM add PDI,PBP
|
||
ASM sub PSI,PBP
|
||
ASM test PCX,PCX
|
||
ASM jz s16
|
||
s15:
|
||
ASM fld POINTER [PDI]
|
||
ASM fmul POINTER [PSI]
|
||
ASM fxch st(2)
|
||
ASM add PDI,PBP
|
||
ASM sub PSI,PBP
|
||
ASM fadd
|
||
ASM sub PCX,PBP
|
||
ASM jnz s15
|
||
s16:
|
||
ASM sub PDI,PBP
|
||
ASM add PSI,PBP
|
||
ASM fadd
|
||
ASM |