/***************************************************************************
*
Copyright 2013 CertiVox UK Ltd. *
*
This file is part of CertiVox MIRACL Crypto SDK. *
*
The CertiVox MIRACL Crypto SDK provides developers with an *
extensive and efficient set of cryptographic functions. *
For further information about its features and functionalities please *
refer to http://www.certivox.com *
*
* The CertiVox MIRACL Crypto SDK is free software: you can *
redistribute it and/or modify it under the terms of the *
GNU Affero General Public License as published by the *
Free Software Foundation, either version 3 of the License, *
or (at your option) any later version. *
*
* The CertiVox MIRACL Crypto SDK is distributed in the hope *
that it will be useful, but WITHOUT ANY WARRANTY; without even the *
implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
See the GNU Affero General Public License for more details. *
*
* You should have received a copy of the GNU Affero General Public *
License along with CertiVox MIRACL Crypto SDK. *
If not, see . *
*
You can be released from the requirements of the license by purchasing *
a commercial license. Buying such a license is mandatory as soon as you *
develop commercial activities involving the CertiVox MIRACL Crypto SDK *
without disclosing the source code of your own applications, or shipping *
the CertiVox MIRACL Crypto SDK with a closed source product. *
*
***************************************************************************/
/*
* Many processors support a floating-point coprocessor, which may
* implement a faster multiplication instruction than the corresponding
* integer instruction. This is the case for the 80486/Pentium processor
* which has a built-in co-processor. This can be exploited to give even
* faster performance.
*
* As before the fixed modulus size to be used is pre-defined as
* MR_PENTIUM in mirdef.h
*
* Note that since the partial products are accumulated in a 64-bit register
* this implies that a full-width number base (2^32) cannot be used.
* The maximum number base that can be used is 2^x where x is
* calculated such that 2^(64-2*x) > 2*MR_PENTIUM. This means that
* x will usually be 28 or 29
*
* To use this code:-
*
* (1) Define MR_PENTIUM in mirdef.h to the fixed size of the modulus
*
* (2) Use as a number base the value of x calculated as shown above
* For example, for 512 bit exponentiation, #define MR_PENTIUM 18
* in mirdef.h and call mirsys(50,536870912L) in your main program.
* (Observe that 536870912 = 2^29, and that 18*29 = 522, big enough
* for 512 bit calculations).
*
* (3) Use Montgomery representation when implementing your crypto-system
* i.e. use monty_powmod(). This will automatically call the
* routines in this module.
*
* Note that this module generates a *lot* of code e.g. > 49kbytes for
* MR_PENTIUM = 36. Compile using -B switch - you will need
* the TASM macro-assembler. If out-of-memory, try using the TASMX /ml
* version of the assembler.
*
* Note that it is *VITAL* that double arrays be aligned on 8-byte
* boundaries for maximum speed on a Pentium.
*
* Many thanks are due to Paul Rubin, who suggested to me that this approach
* might be faster than the all-integer method described elsewhere.
*
* The FP stack is primed in prepare_monty() :-
* magic - (2^63+2^62)*base. By adding and then subtracting this number we
* get the top half of the sum.
* 1/base - Inverse of the number base
* ndash - Montgomery's constant
*/
#include "miracl.h"
#ifdef MR_PENTIUM
#if INLINE_ASM == 1
#define N 8
#define POINTER QWORD PTR
#define PBX bx
#define PSI si
#define PDI di
#define PCX cx
#endif
#if INLINE_ASM == 2
#define N 8
#define POINTER QWORD PTR
#define PBX bx
#define PSI si
#define PDI di
#define PCX cx
#endif
#if INLINE_ASM == 3
#define N 8
#define POINTER QWORD PTR
#define PBX ebx
#define PSI esi
#define PDI edi
#define PCX ecx
#endif
#ifdef INLINE_ASM
#ifndef MR_LMM
/* not implemented for large memory model 16 bit */
void fastmodmult(_MIPD_ big x,big y,big z)
{
int ij;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
big w0=mr_mip->w0;
big modulus=mr_mip->modulus;
mr_small *wg,*mg,*xg,*yg;
wg=w0->w;
mg=modulus->w;
xg=x->w;
yg=y->w;
for (ij=2*MR_PENTIUM;ij<(int)(w0->len&MR_OBITS);ij++) w0->w[ij]=0.0;
w0->len=2*MR_PENTIUM;
ASM
{
FSTEP MACRO i,j
/* some fancy Pentium scheduling going on here ... */
fld POINTER [PBX+N*i]
fmul POINTER [PSI+N*j]
fxch st(2)
fadd
ENDM
FRSTEP MACRO i,j
fld POINTER [PDI+N*i]
fmul POINTER [PSI+N*j]
fxch st(2)
fadd
ENDM
FDSTEP MACRO i,j
fld POINTER [PBX+N*i]
fmul POINTER [PBX+N*j]
fxch st(2)
fadd
ENDM
SELF MACRO k
fld POINTER [PBX+N*k]
fmul st,st(0)
fadd
ENDM
RFINU MACRO k
fld st(0)
fadd st,st(2)
fsub st,st(2)
fsubr st,st(1)
fmul st,st(4)
fld st(0)
fadd st,st(3)
fsub st,st(3)
fsub
fst POINTER [PDI+N*k]
fmul POINTER [PSI]
fadd
fmul st,st(2)
ENDM
RFIND MACRO k
fld st(0)
fadd st,st(2)
fsub st,st(2)
fsub st(1),st
fmul st,st(3)
fxch st(1)
fstp POINTER [PDI+N*k]
ENDM
DIAG MACRO ns,ne
CNT1=ns
CNT2=ne
fld POINTER [PBX+N*CNT1]
fmul POINTER [PSI+N*CNT2]
CNT1=CNT1+1
CNT2=CNT2-1
WHILE CNT1 LE ne
FSTEP CNT1,CNT2
CNT1=CNT1+1
CNT2=CNT2-1
ENDM
fadd
ENDM
SDIAG MACRO ns,ne
CNT1=ns
CNT2=ne
IF CNT1 LT CNT2
fstp st(5) /* store carry */
fldz
fld POINTER [PBX+N*CNT1]
fmul POINTER [PBX+N*CNT2]
CNT1=CNT1+1
CNT2=CNT2-1
WHILE CNT1 LT CNT2
FDSTEP CNT1,CNT2
CNT1=CNT1+1
CNT2=CNT2-1
ENDM
fadd
fld st(0) /* now double it ... */
fadd
fadd st,st(5) /* add in carry */
ENDIF
ENDM
RDIAGU MACRO ns,ne
CNT1=ns
CNT2=ne
IF CNT1 LT ne
fld POINTER [PDI+N*CNT1]
fmul POINTER [PSI+N*CNT2]
CNT1=CNT1+1
CNT2=CNT2-1
WHILE CNT1 LT ne
FRSTEP CNT1,CNT2
CNT1=CNT1+1
CNT2=CNT2-1
ENDM
fadd
ENDIF
ENDM
RDIAGD MACRO ns,ne
CNT1=ns
CNT2=ne
fld POINTER [PDI+N*CNT1]
fmul POINTER [PSI+N*CNT2]
CNT1=CNT1+1
CNT2=CNT2-1
WHILE CNT1 LE ne
FRSTEP CNT1,CNT2
CNT1=CNT1+1
CNT2=CNT2-1
ENDM
fadd
ENDM
MODMULT MACRO
CNT=0
WHILE CNT LT MR_PENTIUM
DIAG 0,CNT
xchg PSI,PCX
RDIAGU 0,CNT
RFINU CNT
xchg PSI,PCX
CNT=CNT+1
ENDM
SCNT=0
WHILE SCNT LT (MR_PENTIUM-1)
SCNT=SCNT+1
DIAG SCNT,(MR_PENTIUM-1)
xchg PSI,PCX
RDIAGD SCNT,(MR_PENTIUM-1)
RFIND CNT
xchg PSI,PCX
CNT=CNT+1
ENDM
RFIND CNT
CNT=CNT+1
fstp POINTER [PDI+N*CNT]
ENDM
MODSQUARE MACRO
CNT=0
WHILE CNT LT MR_PENTIUM
SDIAG 0,CNT
IF (CNT MOD 2) EQ 0
SELF (CNT/2)
ENDIF
RDIAGU 0,CNT
RFINU CNT
CNT=CNT+1
ENDM
SCNT=0
WHILE SCNT LT (MR_PENTIUM-1)
SCNT=SCNT+1
SDIAG SCNT,(MR_PENTIUM-1)
IF (CNT MOD 2) EQ 0
SELF (CNT/2)
ENDIF
RDIAGD SCNT,(MR_PENTIUM-1)
RFIND CNT
CNT=CNT+1
ENDM
RFIND CNT
CNT=CNT+1
fstp POINTER [PDI+N*CNT]
ENDM
}
ASM
{
push PDI
push PSI
mov PBX,xg
mov PSI,yg
mov PCX,mg
mov PDI,wg
fldz
MODMULT
pop PSI
pop PDI
}
for (ij=MR_PENTIUM;ij<(int)(z->len&MR_OBITS);ij++) z->w[ij]=0.0;
z->len=MR_PENTIUM;
for (ij=0;ijw[ij]=w0->w[ij+MR_PENTIUM];
if (z->w[MR_PENTIUM-1]==0.0) mr_lzero(z);
}
void fastmodsquare(_MIPD_ x,z)
big x,z;
{
int ij;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
big w0=mr_mip->w0;
big modulus=mr_mip->modulus;
mr_small *wg,*mg,*xg;
wg=w0->w;
mg=modulus->w;
xg=x->w;
for (ij=2*MR_PENTIUM;ij<(int)(w0->len&MR_OBITS);ij++) w0->w[ij]=0.0;
w0->len=2*MR_PENTIUM;
ASM
{
push PDI
push PSI
mov PBX,xg
mov PSI,mg
mov PDI,wg
fldz
MODSQUARE
pop PSI
pop PDI
}
for (ij=MR_PENTIUM;ij<(int)(z->len&MR_OBITS);ij++) z->w[ij]=0.0;
z->len=MR_PENTIUM;
for (ij=0;ijw[ij]=w0->w[ij+MR_PENTIUM];
if (z->w[MR_PENTIUM-1]==0.0) mr_lzero(z);
}
#endif
#endif
#endif