KGC_TEST/KGC/miracl/source/mr87v.c

592 lines
17 KiB
C
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/***************************************************************************
*
Copyright 2013 CertiVox UK Ltd. *
*
This file is part of CertiVox MIRACL Crypto SDK. *
*
The CertiVox MIRACL Crypto SDK provides developers with an *
extensive and efficient set of cryptographic functions. *
For further information about its features and functionalities please *
refer to http://www.certivox.com *
*
* The CertiVox MIRACL Crypto SDK is free software: you can *
redistribute it and/or modify it under the terms of the *
GNU Affero General Public License as published by the *
Free Software Foundation, either version 3 of the License, *
or (at your option) any later version. *
*
* The CertiVox MIRACL Crypto SDK is distributed in the hope *
that it will be useful, but WITHOUT ANY WARRANTY; without even the *
implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
See the GNU Affero General Public License for more details. *
*
* You should have received a copy of the GNU Affero General Public *
License along with CertiVox MIRACL Crypto SDK. *
If not, see <http://www.gnu.org/licenses/>. *
*
You can be released from the requirements of the license by purchasing *
a commercial license. Buying such a license is mandatory as soon as you *
develop commercial activities involving the CertiVox MIRACL Crypto SDK *
without disclosing the source code of your own applications, or shipping *
the CertiVox MIRACL Crypto SDK with a closed source product. *
*
***************************************************************************/
/*
* Many processors support a floating-point coprocessor, which may
* implement a faster multiplication instruction than the corresponding
* integer instruction. This is the case for the Pentium processor
* which has a built-in co-processor. This can be exploited to give even
* faster performance.
*
* Note that since the partial products are accumulated in a 64-bit register
* this implies that a full-width number base (2^32) cannot be used.
* The maximum number base that can be used is 2^x where x is
* calculated such that 2^(64-2*x) > 2*WORDS_IN_MODULUS. This means that
* x will usually be 28 or 29
*
* To use this code:-
*
* (1) Implemented and tested only for the Pentium processor and
* using the Borland C compiler (BCC and BCC32)
*
* (2) Define MR_PENTIUM in mirdef.h. Determine the maximum modulus to be
* used, and from that determine the value of WORDS_IN_MODULUS.
*
* (3) Use as a number base the value of x calculated as shown above.
* For example, for 512 bit exponentiation, WORDS_IN_MODULUS will be 18
* so call mirsys(50,536870912L) in your main program.
* (Observe that 536870912 = 2^29, and that 18*29 = 522, big enough
* for 512 bit calculations).
*
* (4) Use Montgomery representation when implementing your crypto-system
* e.g. use monty_powmod(). This will automatically call the
* routines in this module.
*
* Note that it is *VITAL* that double arrays be aligned on 8-byte
* boundaries for the Pentium. The Borland C compiler does *not* do this
* automatically!!!!
*
* Many thanks are due to Paul Rubin, who suggested to me that this approach
* might be faster than the all-integer methods described elsewhere.
*
* Further speed increases can be achieved by loop-unrolling. Completely
* unrolled code (a la Comba) has been experimented with, and gives a
* 25% speed-up in some cases. Note that the basic code for a single partial
* product takes only 3 cycles.
*
* fld ... ;1 cycle
* fmul ... ;1 cycle
* fxch st(2) ;0 cycle
* fadd ;1 cycle
*
* Compare this with the integer "mul" instruction which takes 10 cycles
* on a Pentium
*
* Interestingly the fmul is faster than the fimul. So paradoxically it is
* quicker to manipulate 64-bit doubles than it is to manipulate 32-bit
* integers. Clearly the Pentium FP processor has been optimised for real
* arithmetic. However this requires us to convert all bigs from integer
* arrays to double arrays (see mrmonty.c) which is very wasteful of space
* and rather awkward.
*
*
* The FP stack is primed in prepare_monty() :-
* magic - (2^63+2^62)*base. By adding and then subtracting this number we
* get the top half of the sum.
* 1/base - Inverse of the number base
* ndash - Montgomery's constant
*
*/
#include "miracl.h"
#ifdef MR_PENTIUM
/* Size in bytes of one word as addressed by the assembly below; every
 * pointer step and the byte count nrn=N*rn are expressed in units of N. */
#define N 8
/* Operand-size qualifier placed on each floating-point memory operand. */
#define POINTER QWORD PTR
/*
 * Register aliases. The assembly is written once in terms of PAX, PBP,
 * PBX, PSI, PDI, PCX and PDX; INLINE_ASM picks the register names they
 * stand for. Values 1 and 2 both map to the 16-bit names, value 3 maps
 * to the 32-bit names. For any other value no aliases are defined.
 */
#if INLINE_ASM == 1 || INLINE_ASM == 2
#define PAX ax
#define PBP bp
#define PBX bx
#define PSI si
#define PDI di
#define PCX cx
#define PDX dx
#endif
#if INLINE_ASM == 3
#define PAX eax
#define PBP ebp
#define PBX ebx
#define PSI esi
#define PDI edi
#define PCX ecx
#define PDX edx
#endif
#ifdef INLINE_ASM
#ifndef MR_LMM
/* not implemented for large memory model 16 bit */
/*
 * fastmodmult - multiply x by y and reduce with respect to mr_mip->modulus,
 * using the x87 floating-point unit. Per the file header this is the
 * routine behind the Montgomery functions (e.g. monty_powmod()).
 *
 * x, y - operands; their word arrays are read as 8-byte floating-point
 *        values (POINTER is QWORD PTR)
 * z    - result; receives rn words copied from w0->w[rn .. 2*rn-1]
 *
 * mr_mip->w0 is used as the double-length work area.
 *
 * FP stack: the file header states that prepare_monty() primes the stack
 * with magic, 1/base and ndash. The stack-relative operands used below are
 * consistent with that order, i.e. once the running sum has been pushed:
 *     st(0)=sum  st(1)=magic  st(2)=1/base  st(3)=ndash
 * Pushes and pops inside this routine balance, so the stack depth on exit
 * equals the depth on entry.
 *
 * Integer registers (after set-up):
 *     PBX -> x    PSI -> y    PDX -> modulus    PDI -> w0 words
 *     PAX = nrn (byte length of rn words)   PBP = N (byte step)
 *     PCX = byte offset of the current column
 * PSI and PDX are swapped with xchg whenever the modulus has to be
 * addressed through PSI.
 *
 * NOTE(review): the second phase is entered unconditionally with
 * PCX = N*(rn-2); this looks like it assumes rn >= 2 - confirm.
 */
void fastmodmult(_MIPD_ big x,big y,big z)
{
int ij,rn,nrn;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
big modulus=mr_mip->modulus;
big w0=mr_mip->w0;
mr_small *wg,*mg,*xg,*yg;
/* word arrays are copied to plain locals so the asm can load them by name */
wg=w0->w;
mg=modulus->w;
xg=x->w;
yg=y->w;
rn=(int)modulus->len;
/* clear any words of w0 above the 2*rn that are about to be used */
for (ij=2*rn;ij<(int)(w0->len&MR_OBITS);ij++) w0->w[ij]=0.0;
w0->len=2*rn;
nrn=N*rn; /* rn words expressed as a byte count */
/* save the registers that are restored by the pops at the end of the asm */
ASM push PBP
ASM push PDI
ASM push PSI
/* all locals are fetched before PBP is overwritten with the step size N */
ASM mov PBX,xg
ASM mov PSI,yg
ASM mov PDX,mg
ASM mov PDI,wg
ASM mov PAX,nrn
ASM mov PBP,N
ASM fldz /* running sum = 0 */
ASM xor PCX,PCX /* start at column 0 */
/*
 * Phase 1: columns i = 0 .. rn-1, with PCX = N*i.
 * Each pass adds x[j]*y[i-j] (j=0..i) and w[j]*modulus[i-j] (j=0..i-1)
 * to the running sum, then derives and stores w[i] (see m6).
 */
m1:
ASM push PCX
ASM add PSI,PCX /* PSI -> y[i] */
ASM fld POINTER [PBX]
ASM fmul POINTER [PSI] /* first product x[0]*y[i] */
ASM add PBX,PBP /* x pointer moves up ... */
ASM sub PSI,PBP /* ... y pointer moves down */
ASM test PCX,PCX
ASM jz m3 /* column 0 has only this one product */
m2:
/* one further product per pass; the previous product is folded into the
   sum while the new one is still pending on the stack */
ASM fld POINTER [PBX]
ASM fmul POINTER [PSI]
ASM fxch st(2) /* bring the running sum to the top */
ASM add PBX,PBP
ASM sub PSI,PBP
ASM fadd /* sum += previous product */
ASM sub PCX,PBP
ASM jnz m2
m3:
/* undo the final pointer step and add the last pending product */
ASM sub PBX,PBP
ASM add PSI,PBP
ASM fadd
ASM pop PCX
ASM sub PBX,PCX /* restore PBX */
ASM xchg PSI,PDX /* PSI -> modulus */
ASM push PCX
ASM test PCX,PCX
ASM jz m6 /* column 0: no w[j]*modulus terms yet */
ASM add PSI,PCX /* PSI -> modulus[i] */
ASM fld POINTER [PDI]
ASM fmul POINTER [PSI] /* first product w[0]*modulus[i] */
ASM add PDI,PBP
ASM sub PSI,PBP
ASM sub PCX,PBP
ASM jz m5
m4: /* this is typical of the critical inner loop */
ASM fld POINTER [PDI] /* 1 cycle */
ASM fmul POINTER [PSI] /* 1 cycle */
ASM fxch st(2) /* 0 cycle */
ASM add PDI,PBP /* 1 cycle */
ASM sub PSI,PBP /* 0 cycle */
ASM fadd /* 1 cycle */
ASM sub PCX,PBP /* 1 cycle */
ASM jnz m4 /* 0 cycle */
/* total = 5 cycles */
m5:
ASM fadd /* add the last pending product */
m6:
/*
 * Here PDI -> w[i] and PSI -> modulus[0].
 * Adding and then subtracting magic leaves the top part of a value (see
 * file header); subtracting that from the value leaves its low part.
 */
ASM fld st(0) /* copy of sum */
ASM fadd st,st(2)
ASM fsub st,st(2) /* top part of sum */
ASM fsubr st,st(1) /* st(0) = sum - top = low part of sum */
ASM fmul st,st(4) /* low part * ndash */
ASM fld st(0)
ASM fadd st,st(3)
ASM fsub st,st(3) /* top part of that product */
ASM fsub /* keep only the low part of the product; pops */
ASM fst POINTER [PDI] /* store it as w[i] */
ASM fmul POINTER [PSI] /* w[i]*modulus[0] */
ASM fadd /* sum += w[i]*modulus[0] */
ASM fmul st,st(2) /* sum *= 1/base, ready for the next column */
ASM xchg PSI,PDX /* PSI -> y again, PDX -> modulus */
ASM pop PCX
ASM sub PDI,PCX /* restore PDI */
ASM add PCX,PBP /* increment PCX */
ASM cmp PCX,PAX
ASM jl m1
/*
 * Phase 2: columns rn .. 2*rn-2. All four pointers are moved to their
 * last word (index rn-1); PCX = N*k counts k = rn-2 down to 0.
 * Each pass adds x[rn-1-k+j]*y[rn-1-j] and w[rn-1-k+j]*modulus[rn-1-j]
 * (j=0..k) to the sum, stores the low part of the sum in w[2*rn-2-k] and
 * carries the rest into the next column.
 */
ASM sub PCX,PBP /* PCX = N*(rn-1) */
ASM add PSI,PCX
ASM add PBX,PCX /* PBX -> x[rn-1] */
ASM add PDX,PCX
ASM add PDI,PCX
ASM sub PCX,PBP /* going back down again: PCX = N*(rn-2) */
m7:
ASM push PCX
ASM sub PBX,PCX /* PBX -> x[rn-1-k] */
ASM fld POINTER [PBX]
ASM fmul POINTER [PSI] /* first product x[rn-1-k]*y[rn-1] */
ASM add PBX,PBP
ASM sub PSI,PBP
ASM test PCX,PCX
ASM jz m9
m8:
ASM fld POINTER [PBX]
ASM fmul POINTER [PSI]
ASM fxch st(2) /* bring the running sum to the top */
ASM add PBX,PBP
ASM sub PSI,PBP
ASM fadd /* sum += previous product */
ASM sub PCX,PBP
ASM jnz m8
m9:
/* undo the final pointer step and add the last pending product */
ASM sub PBX,PBP /* PBX back at x[rn-1] */
ASM add PSI,PBP
ASM fadd
ASM pop PCX
ASM add PSI,PCX /* restore PSI */
ASM sub PDI,PCX /* PDI -> w[rn-1-k] */
ASM xchg PSI,PDX /* PSI -> modulus */
ASM push PCX
ASM fld POINTER [PDI]
ASM fmul POINTER [PSI] /* first product w[rn-1-k]*modulus[rn-1] */
ASM add PDI,PBP
ASM sub PSI,PBP
ASM test PCX,PCX
ASM jz m11
m10:
ASM fld POINTER [PDI]
ASM fmul POINTER [PSI]
ASM fxch st(2)
ASM add PDI,PBP
ASM sub PSI,PBP
ASM fadd
ASM sub PCX,PBP
ASM jnz m10
m11:
/* undo the final pointer step and add the last pending product */
ASM sub PDI,PBP /* PDI back at w[rn-1] */
ASM add PSI,PBP
ASM fadd
ASM pop PCX
ASM add PSI,PCX /* restore PSI */
ASM xchg PSI,PDX
ASM push PDI
/* form the output address w[2*rn-2-k] from w[rn-1] */
ASM add PDI,PAX
ASM sub PDI,PCX
ASM sub PDI,PBP
/* split the sum: low part is stored, top part scaled by 1/base is the carry */
ASM fld st(0)
ASM fadd st,st(2)
ASM fsub st,st(2) /* top part of sum */
ASM fst st(5) /* keep a copy in a spare stack slot */
ASM fmul st,st(3) /* carry = top * 1/base */
ASM fxch st(5) /* park the carry, get the top part back */
ASM fsub /* sum - top = low part; pops */
ASM fstp POINTER [PDI] /* store the low part */
ASM fld st(3) /* carry becomes the new running sum */
ASM pop PDI
ASM sub PCX,PBP
ASM jge m7
/* after the last column: split what is left of the sum once more */
ASM add PDI,PAX /* PDI -> w[2*rn-1] */
ASM fld st(0)
ASM fadd st,st(2)
ASM fsub st,st(2)
ASM fst st(5)
ASM fmul st,st(3)
ASM fxch st(5)
ASM fsub
ASM fstp POINTER [PDI] /* low part -> w[2*rn-1] */
ASM fld st(3)
ASM add PDI,PBP
/* NOTE(review): this writes index 2*rn, one word above w0->len; presumably
   w0 is allocated with room for it - confirm */
ASM fstp POINTER [PDI] /* final carry -> w[2*rn]; also pops the sum slot */
ASM pop PSI
ASM pop PDI
ASM pop PBP
/* clear words of z above rn, then copy the upper half of w0 into z */
for (ij=rn;ij<(int)(z->len&MR_OBITS);ij++) z->w[ij]=0.0;
z->len=rn;
for (ij=0;ij<rn;ij++) z->w[ij]=w0->w[ij+rn];
/* top word is zero: let mr_lzero() adjust z (presumably trims the length) */
if (z->w[rn-1]==0.0) mr_lzero(z);
}
/*
 * fastmodsquare - square x and reduce with respect to mr_mip->modulus,
 * using the x87 floating-point unit. Same layout as fastmodmult() with
 * both operand pointers set to x, except that each column's cross
 * products are summed only until the two pointers meet, doubled, and the
 * square of the middle word is added when the pointers land on the same
 * word.
 *
 * x - operand; its word array is read as 8-byte floating-point values
 *     (POINTER is QWORD PTR)
 * z - result; receives rn words copied from w0->w[rn .. 2*rn-1]
 *
 * mr_mip->w0 is used as the double-length work area.
 *
 * FP stack: the file header states that prepare_monty() primes the stack
 * with magic, 1/base and ndash. The stack-relative operands used below are
 * consistent with that order, i.e. once the running sum has been pushed:
 *     st(0)=sum  st(1)=magic  st(2)=1/base  st(3)=ndash
 * Pushes and pops inside this routine balance, so the stack depth on exit
 * equals the depth on entry.
 *
 * Integer registers (after set-up):
 *     PBX -> x (ascending)   PSI -> x (descending)   PDX -> modulus
 *     PDI -> w0 words        PAX = nrn (byte length of rn words)
 *     PBP = N (byte step)    PCX = byte offset of the current column
 * PSI and PDX are swapped with xchg whenever the modulus has to be
 * addressed through PSI.
 *
 * NOTE(review): the second phase is entered unconditionally with
 * PCX = N*(rn-2); this looks like it assumes rn >= 2 - confirm.
 */
void fastmodsquare(_MIPD_ big x,big z)
{
int ij,rn,nrn;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
big modulus=mr_mip->modulus;
big w0=mr_mip->w0;
mr_small *wg,*mg,*xg;
/* word arrays are copied to plain locals so the asm can load them by name */
wg=w0->w;
mg=modulus->w;
xg=x->w;
rn=(int)modulus->len;
/* clear any words of w0 above the 2*rn that are about to be used */
for (ij=2*rn;ij<(int)(w0->len&MR_OBITS);ij++) w0->w[ij]=0.0;
w0->len=2*rn;
nrn=N*rn; /* rn words expressed as a byte count */
/* save the registers that are restored by the pops at the end of the asm */
ASM push PBP
ASM push PDI
ASM push PSI
/* all locals are fetched before PBP is overwritten with the step size N */
ASM mov PBX,xg
ASM mov PSI,xg /* both operand pointers start at x */
ASM mov PDX,mg
ASM mov PDI,wg
ASM mov PAX,nrn
ASM mov PBP,N
ASM fldz /* running sum = 0 */
ASM xor PCX,PCX /* start at column 0 */
/*
 * Phase 1: columns i = 0 .. rn-1, with PCX = N*i.
 */
s1:
ASM push PBX
ASM push PSI
ASM test PCX,PCX
ASM jz s4 /* column 0 has no cross products, only x[0]*x[0] */
ASM add PSI,PCX /* PSI -> x[i] */
ASM fstp st(5) /* park the running sum deeper in the stack */
ASM fldz /* separate accumulator for the cross products */
ASM fld POINTER [PBX]
ASM fmul POINTER [PSI] /* first cross product x[0]*x[i] */
ASM sub PSI,PBP
ASM add PBX,PBP
ASM cmp PSI,PBX
ASM jle s3 /* pointers have met or crossed: no more cross products */
s2:
ASM fld POINTER [PBX]
ASM fmul POINTER [PSI]
ASM fxch st(2) /* bring the cross-product accumulator to the top */
ASM sub PSI,PBP
ASM add PBX,PBP
ASM fadd /* accumulator += previous product */
ASM cmp PSI,PBX
ASM jg s2
s3:
ASM fadd /* add the last pending product */
ASM fld st(0)
ASM fadd /* double the cross-product total */
ASM fadd st,st(5) /* add the parked running sum back in */
s4:
ASM cmp PSI,PBX
ASM jne s5 /* pointers crossed: this column has no square term */
ASM fld POINTER [PBX]
ASM fmul st,st(0) /* square of the middle word */
ASM fadd
s5:
ASM pop PSI
ASM pop PBX /* restore pointers */
ASM xchg PSI,PDX /* PSI -> modulus */
ASM push PCX
ASM test PCX,PCX
ASM jz s8 /* column 0: no w[j]*modulus terms yet */
ASM add PSI,PCX /* PSI -> modulus[i] */
ASM fld POINTER [PDI]
ASM fmul POINTER [PSI] /* first product w[0]*modulus[i] */
ASM add PDI,PBP
ASM sub PSI,PBP
ASM sub PCX,PBP
ASM jz s7
s6:
ASM fld POINTER [PDI]
ASM fmul POINTER [PSI]
ASM fxch st(2) /* bring the running sum to the top */
ASM add PDI,PBP
ASM sub PSI,PBP
ASM fadd /* sum += previous product */
ASM sub PCX,PBP
ASM jnz s6
s7:
ASM fadd /* add the last pending product */
s8:
/*
 * Here PDI -> w[i] and PSI -> modulus[0].
 * Adding and then subtracting magic leaves the top part of a value (see
 * file header); subtracting that from the value leaves its low part.
 */
ASM fld st(0) /* copy of sum */
ASM fadd st,st(2)
ASM fsub st,st(2) /* top part of sum */
ASM fsubr st,st(1) /* st(0) = sum - top = low part of sum */
ASM fmul st,st(4) /* low part * ndash */
ASM fld st(0)
ASM fadd st,st(3)
ASM fsub st,st(3) /* top part of that product */
ASM fsub /* keep only the low part of the product; pops */
ASM fst POINTER [PDI] /* store it as w[i] */
ASM fmul POINTER [PSI] /* w[i]*modulus[0] */
ASM fadd /* sum += w[i]*modulus[0] */
ASM fmul st,st(2) /* sum *= 1/base, ready for the next column */
ASM xchg PSI,PDX /* PSI -> x again, PDX -> modulus */
ASM pop PCX
ASM sub PDI,PCX /* restore PDI */
ASM add PCX,PBP /* increment PCX */
ASM cmp PCX,PAX
ASM jl s1
/*
 * Phase 2: columns rn .. 2*rn-2. All four pointers are moved to their
 * last word (index rn-1); PCX = N*k counts k = rn-2 down to 0.
 * The low part of each column sum is stored in w[2*rn-2-k] and the rest
 * is carried into the next column.
 */
ASM sub PCX,PBP /* PCX = N*(rn-1) */
ASM add PSI,PCX
ASM add PBX,PCX /* PBX -> x[rn-1] */
ASM add PDX,PCX
ASM add PDI,PCX
ASM sub PCX,PBP /* going back down again: PCX = N*(rn-2) */
s9:
ASM push PBX
ASM push PSI
ASM test PCX,PCX
ASM jz s13 /* k = 0: only the square of x[rn-1] remains */
s10: /* (label is not the target of any jump in this function) */
ASM sub PBX,PCX /* PBX -> x[rn-1-k]; PSI stays at x[rn-1] */
ASM fstp st(5) /* park the running sum deeper in the stack */
ASM fldz /* separate accumulator for the cross products */
ASM fld POINTER [PBX]
ASM fmul POINTER [PSI] /* first cross product x[rn-1-k]*x[rn-1] */
ASM sub PSI,PBP
ASM add PBX,PBP
ASM cmp PSI,PBX
ASM jle s12 /* pointers have met or crossed */
s11:
ASM fld POINTER [PBX]
ASM fmul POINTER [PSI]
ASM fxch st(2) /* bring the cross-product accumulator to the top */
ASM sub PSI,PBP
ASM add PBX,PBP
ASM fadd /* accumulator += previous product */
ASM cmp PSI,PBX
ASM jg s11
s12:
ASM fadd /* add the last pending product */
ASM fld st(0)
ASM fadd /* double the cross-product total */
ASM fadd st,st(5) /* add the parked running sum back in */
s13:
ASM cmp PSI,PBX
ASM jne s14 /* pointers crossed: this column has no square term */
ASM fld POINTER [PBX]
ASM fmul st,st(0) /* square of the middle word */
ASM fadd
s14:
ASM pop PSI
ASM pop PBX
ASM sub PDI,PCX /* PDI -> w[rn-1-k] */
ASM xchg PSI,PDX /* PSI -> modulus */
ASM push PCX
ASM fld POINTER [PDI]
ASM fmul POINTER [PSI] /* first product w[rn-1-k]*modulus[rn-1] */
ASM add PDI,PBP
ASM sub PSI,PBP
ASM test PCX,PCX
ASM jz s16
s15:
ASM fld POINTER [PDI]
ASM fmul POINTER [PSI]
ASM fxch st(2)
ASM add PDI,PBP
ASM sub PSI,PBP
ASM fadd
ASM sub PCX,PBP
ASM jnz s15
s16:
/* undo the final pointer step and add the last pending product */
ASM sub PDI,PBP /* PDI back at w[rn-1] */
ASM add PSI,PBP
ASM fadd
ASM pop PCX
ASM add PSI,PCX /* restore PSI */
ASM xchg PSI,PDX
ASM push PDI
/* form the output address w[2*rn-2-k] from w[rn-1] */
ASM add PDI,PAX
ASM sub PDI,PCX
ASM sub PDI,PBP
/* split the sum: low part is stored, top part scaled by 1/base is the carry */
ASM fld st(0)
ASM fadd st,st(2)
ASM fsub st,st(2) /* top part of sum */
ASM fst st(5) /* keep a copy in a spare stack slot */
ASM fmul st,st(3) /* carry = top * 1/base */
ASM fxch st(5) /* park the carry, get the top part back */
ASM fsub /* sum - top = low part; pops */
ASM fstp POINTER [PDI] /* store the low part */
ASM fld st(3) /* carry becomes the new running sum */
ASM pop PDI
ASM sub PCX,PBP
ASM jge s9
/* after the last column: split what is left of the sum once more */
ASM add PDI,PAX /* PDI -> w[2*rn-1] */
ASM fld st(0)
ASM fadd st,st(2)
ASM fsub st,st(2)
ASM fst st(5)
ASM fmul st,st(3)
ASM fxch st(5)
ASM fsub
ASM fstp POINTER [PDI] /* low part -> w[2*rn-1] */
ASM fld st(3)
ASM add PDI,PBP
/* NOTE(review): this writes index 2*rn, one word above w0->len; presumably
   w0 is allocated with room for it - confirm */
ASM fstp POINTER [PDI] /* final carry -> w[2*rn]; also pops the sum slot */
ASM pop PSI
ASM pop PDI
ASM pop PBP
/* clear words of z above rn, then copy the upper half of w0 into z */
for (ij=rn;ij<(int)(z->len&MR_OBITS);ij++) z->w[ij]=0.0;
z->len=rn;
for (ij=0;ij<rn;ij++) z->w[ij]=w0->w[ij+rn];
/* top word is zero: let mr_lzero() adjust z (presumably trims the length) */
if (z->w[rn-1]==0.0) mr_lzero(z);
}
#endif
#endif
#endif