/*************************************************************************** * Copyright 2013 CertiVox UK Ltd. * * This file is part of CertiVox MIRACL Crypto SDK. * * The CertiVox MIRACL Crypto SDK provides developers with an * extensive and efficient set of cryptographic functions. * For further information about its features and functionalities please * refer to http://www.certivox.com * * * The CertiVox MIRACL Crypto SDK is free software: you can * redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the * Free Software Foundation, either version 3 of the License, * or (at your option) any later version. * * * The CertiVox MIRACL Crypto SDK is distributed in the hope * that it will be useful, but WITHOUT ANY WARRANTY; without even the * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Affero General Public License for more details. * * * You should have received a copy of the GNU Affero General Public * License along with CertiVox MIRACL Crypto SDK. * If not, see <http://www.gnu.org/licenses/>. * * You can be released from the requirements of the license by purchasing * a commercial license. Buying such a license is mandatory as soon as you * develop commercial activities involving the CertiVox MIRACL Crypto SDK * without disclosing the source code of your own applications, or shipping * the CertiVox MIRACL Crypto SDK with a closed source product. * * ***************************************************************************/ /* * Many processors support a floating-point coprocessor, which may * implement a faster multiplication instruction than the corresponding * integer instruction. This is the case for the Pentium processor * which has a built-in co-processor. This can be exploited to give even * faster performance. * * Note that since the partial products are accumulated in a 64-bit register * this implies that a full-width number base (2^32) cannot be used. 
* The maximum number base that can be used is 2^x where x is * calculated such that 2^(64-2*x) > 2*WORDS_IN_MODULUS. This means that * x will usually be 28 or 29 * * To use this code:- * * (1) Implemented and tested only for the Pentium processor and * using the Borland C compiler (BCC and BCC32) * * (2) Define MR_PENTIUM in mirdef.h. Determine the maximum modulus to be * used, and from that determine the value of WORDS_IN_MODULUS. * * (3) Use as a number base the value of x calculated as shown above. * For example, for 512 bit exponentiation, WORDS_IN_MODULUS will be 18 * so call mirsys(50,536870912L) in your main program. * (Observe that 536870912 = 2^29, and that 18*29 = 522, big enough * for 512 bit calculations). * * (4) Use Montgomery representation when implementing your crypto-system * e.g. use monty_powmod(). This will automatically call the * routines in this module. * * Note that it is *VITAL* that double arrays be aligned on 8-byte * boundaries for the Pentium. The Borland C compiler does *not* do this * automatically!!!! * * Many thanks are due to Paul Rubin, who suggested to me that this approach * might be faster than the all-integer methods described elsewhere. * * Further speed increases can be achieved by loop-unrolling. Completely * unrolled code (a la Comba) has been experimented with, and gives a * 25% speed-up in some cases. Note that the basic code for a single partial * product takes only 3 cycles. * * fld ... ;1 cycle * fmul ... ;1 cycle * fxch st(2) ;0 cycle * fadd ;1 cycle * * Compare this with the integer "mul" instruction which takes 10 cycles * on a Pentium * * Interestingly the fmul is faster than the fimul. So paradoxically it is * quicker to manipulate 64-bit doubles than it is to manipulate 32-bit * integers. Clearly the Pentium FP processor has been optimised for real * arithmetic. 
However this requires us to convert all bigs from integer * arrays to double arrays (see mrmonty.c) which is very wasteful of space * and rather awkward. * * * The FP stack is primed in prepare_monty() :- * magic - (2^63+2^62)*base. By adding and then subtracting this number we * get the top half of the sum. * 1/base - Inverse of the number base * ndash - Montgomery's constant * */ #include "miracl.h" #ifdef MR_PENTIUM #define N 8 #define POINTER QWORD PTR #if INLINE_ASM == 1 #define PAX ax #define PBP bp #define PBX bx #define PSI si #define PDI di #define PCX cx #define PDX dx #endif #if INLINE_ASM == 2 #define PAX ax #define PBP bp #define PBX bx #define PSI si #define PDI di #define PCX cx #define PDX dx #endif #if INLINE_ASM == 3 #define PAX eax #define PBP ebp #define PBX ebx #define PSI esi #define PDI edi #define PCX ecx #define PDX edx #endif #ifdef INLINE_ASM #ifndef MR_LMM /* not implemented for large memory model 16 bit */ void fastmodmult(_MIPD_ big x,big y,big z) { int ij,rn,nrn; #ifdef MR_OS_THREADS miracl *mr_mip=get_mip(); #endif big modulus=mr_mip->modulus; big w0=mr_mip->w0; mr_small *wg,*mg,*xg,*yg; wg=w0->w; mg=modulus->w; xg=x->w; yg=y->w; rn=(int)modulus->len; for (ij=2*rn;ij<(int)(w0->len&MR_OBITS);ij++) w0->w[ij]=0.0; w0->len=2*rn; nrn=N*rn; ASM push PBP ASM push PDI ASM push PSI ASM mov PBX,xg ASM mov PSI,yg ASM mov PDX,mg ASM mov PDI,wg ASM mov PAX,nrn ASM mov PBP,N ASM fldz ASM xor PCX,PCX m1: ASM push PCX ASM add PSI,PCX ASM fld POINTER [PBX] ASM fmul POINTER [PSI] ASM add PBX,PBP ASM sub PSI,PBP ASM test PCX,PCX ASM jz m3 m2: ASM fld POINTER [PBX] ASM fmul POINTER [PSI] ASM fxch st(2) ASM add PBX,PBP ASM sub PSI,PBP ASM fadd ASM sub PCX,PBP ASM jnz m2 m3: ASM sub PBX,PBP ASM add PSI,PBP ASM fadd ASM pop PCX ASM sub PBX,PCX /* restore PBX */ ASM xchg PSI,PDX /* PSI -> modulus */ ASM push PCX ASM test PCX,PCX ASM jz m6 ASM add PSI,PCX ASM fld POINTER [PDI] ASM fmul POINTER [PSI] ASM add PDI,PBP ASM sub PSI,PBP ASM sub PCX,PBP ASM jz m5 
m4: /* this is typical of the critical inner loop */ ASM fld POINTER [PDI] /* 1 cycle */ ASM fmul POINTER [PSI] /* 1 cycle */ ASM fxch st(2) /* 0 cycle */ ASM add PDI,PBP /* 1 cycle */ ASM sub PSI,PBP /* 0 cycle */ ASM fadd /* 1 cycle */ ASM sub PCX,PBP /* 1 cycle */ ASM jnz m4 /* 0 cycle */ /* total = 5 cycles */ m5: ASM fadd m6: ASM fld st(0) ASM fadd st,st(2) ASM fsub st,st(2) ASM fsubr st,st(1) ASM fmul st,st(4) ASM fld st(0) ASM fadd st,st(3) ASM fsub st,st(3) ASM fsub ASM fst POINTER [PDI] ASM fmul POINTER [PSI] ASM fadd ASM fmul st,st(2) ASM xchg PSI,PDX ASM pop PCX ASM sub PDI,PCX /* restore PDI */ ASM add PCX,PBP /* increment PCX */ ASM cmp PCX,PAX ASM jl m1 ASM sub PCX,PBP /* PCX=12 */ ASM add PSI,PCX ASM add PBX,PCX /* PBX -> x[4] */ ASM add PDX,PCX ASM add PDI,PCX ASM sub PCX,PBP /* going back down again PCX=8 */ m7: ASM push PCX ASM sub PBX,PCX ASM fld POINTER [PBX] ASM fmul POINTER [PSI] ASM add PBX,PBP ASM sub PSI,PBP ASM test PCX,PCX ASM jz m9 m8: ASM fld POINTER [PBX] ASM fmul POINTER [PSI] ASM fxch st(2) ASM add PBX,PBP ASM sub PSI,PBP ASM fadd ASM sub PCX,PBP ASM jnz m8 m9: ASM sub PBX,PBP ASM add PSI,PBP ASM fadd ASM pop PCX ASM add PSI,PCX /* restore PSI */ ASM sub PDI,PCX ASM xchg PSI,PDX /* PSI -> modulus */ ASM push PCX ASM fld POINTER [PDI] ASM fmul POINTER [PSI] ASM add PDI,PBP ASM sub PSI,PBP ASM test PCX,PCX ASM jz m11 m10: ASM fld POINTER [PDI] ASM fmul POINTER [PSI] ASM fxch st(2) ASM add PDI,PBP ASM sub PSI,PBP ASM fadd ASM sub PCX,PBP ASM jnz m10 m11: ASM sub PDI,PBP ASM add PSI,PBP ASM fadd ASM pop PCX ASM add PSI,PCX /* restore PSI */ ASM xchg PSI,PDX ASM push PDI ASM add PDI,PAX ASM sub PDI,PCX ASM sub PDI,PBP ASM fld st(0) ASM fadd st,st(2) ASM fsub st,st(2) ASM fst st(5) ASM fmul st,st(3) ASM fxch st(5) ASM fsub ASM fstp POINTER [PDI] ASM fld st(3) ASM pop PDI ASM sub PCX,PBP ASM jge m7 ASM add PDI,PAX ASM fld st(0) ASM fadd st,st(2) ASM fsub st,st(2) ASM fst st(5) ASM fmul st,st(3) ASM fxch st(5) ASM fsub ASM fstp POINTER [PDI] 
ASM fld st(3) ASM add PDI,PBP ASM fstp POINTER [PDI] ASM pop PSI ASM pop PDI ASM pop PBP for (ij=rn;ij<(int)(z->len&MR_OBITS);ij++) z->w[ij]=0.0; z->len=rn; for (ij=0;ijw[ij]=w0->w[ij+rn]; if (z->w[rn-1]==0.0) mr_lzero(z); } void fastmodsquare(_MIPD_ big x,big z) { int ij,rn,nrn; #ifdef MR_OS_THREADS miracl *mr_mip=get_mip(); #endif big modulus=mr_mip->modulus; big w0=mr_mip->w0; mr_small *wg,*mg,*xg; wg=w0->w; mg=modulus->w; xg=x->w; rn=(int)modulus->len; for (ij=2*rn;ij<(int)(w0->len&MR_OBITS);ij++) w0->w[ij]=0.0; w0->len=2*rn; nrn=N*rn; ASM push PBP ASM push PDI ASM push PSI ASM mov PBX,xg ASM mov PSI,xg ASM mov PDX,mg ASM mov PDI,wg ASM mov PAX,nrn ASM mov PBP,N ASM fldz ASM xor PCX,PCX s1: ASM push PBX ASM push PSI ASM test PCX,PCX ASM jz s4 ASM add PSI,PCX ASM fstp st(5) ASM fldz ASM fld POINTER [PBX] ASM fmul POINTER [PSI] ASM sub PSI,PBP ASM add PBX,PBP ASM cmp PSI,PBX ASM jle s3 s2: ASM fld POINTER [PBX] ASM fmul POINTER [PSI] ASM fxch st(2) ASM sub PSI,PBP ASM add PBX,PBP ASM fadd ASM cmp PSI,PBX ASM jg s2 s3: ASM fadd ASM fld st(0) ASM fadd ASM fadd st,st(5) s4: ASM cmp PSI,PBX ASM jne s5 ASM fld POINTER [PBX] ASM fmul st,st(0) ASM fadd s5: ASM pop PSI ASM pop PBX /* restore pointers */ ASM xchg PSI,PDX /* PSI -> modulus */ ASM push PCX ASM test PCX,PCX ASM jz s8 ASM add PSI,PCX ASM fld POINTER [PDI] ASM fmul POINTER [PSI] ASM add PDI,PBP ASM sub PSI,PBP ASM sub PCX,PBP ASM jz s7 s6: ASM fld POINTER [PDI] ASM fmul POINTER [PSI] ASM fxch st(2) ASM add PDI,PBP ASM sub PSI,PBP ASM fadd ASM sub PCX,PBP ASM jnz s6 s7: ASM fadd s8: ASM fld st(0) ASM fadd st,st(2) ASM fsub st,st(2) ASM fsubr st,st(1) ASM fmul st,st(4) ASM fld st(0) ASM fadd st,st(3) ASM fsub st,st(3) ASM fsub ASM fst POINTER [PDI] ASM fmul POINTER [PSI] ASM fadd ASM fmul st,st(2) ASM xchg PSI,PDX ASM pop PCX ASM sub PDI,PCX /* restore PDI */ ASM add PCX,PBP /* increment PCX */ ASM cmp PCX,PAX ASM jl s1 ASM sub PCX,PBP /* PCX=12 */ ASM add PSI,PCX ASM add PBX,PCX /* PBX -> x[4] */ ASM add 
PDX,PCX ASM add PDI,PCX ASM sub PCX,PBP /* going back down again PCX=8 */ s9: ASM push PBX ASM push PSI ASM test PCX,PCX ASM jz s13 s10: ASM sub PBX,PCX ASM fstp st(5) ASM fldz ASM fld POINTER [PBX] ASM fmul POINTER [PSI] ASM sub PSI,PBP ASM add PBX,PBP ASM cmp PSI,PBX ASM jle s12 s11: ASM fld POINTER [PBX] ASM fmul POINTER [PSI] ASM fxch st(2) ASM sub PSI,PBP ASM add PBX,PBP ASM fadd ASM cmp PSI,PBX ASM jg s11 s12: ASM fadd ASM fld st(0) ASM fadd ASM fadd st,st(5) s13: ASM cmp PSI,PBX ASM jne s14 ASM fld POINTER [PBX] ASM fmul st,st(0) ASM fadd s14: ASM pop PSI ASM pop PBX ASM sub PDI,PCX ASM xchg PSI,PDX /* PSI -> modulus */ ASM push PCX ASM fld POINTER [PDI] ASM fmul POINTER [PSI] ASM add PDI,PBP ASM sub PSI,PBP ASM test PCX,PCX ASM jz s16 s15: ASM fld POINTER [PDI] ASM fmul POINTER [PSI] ASM fxch st(2) ASM add PDI,PBP ASM sub PSI,PBP ASM fadd ASM sub PCX,PBP ASM jnz s15 s16: ASM sub PDI,PBP ASM add PSI,PBP ASM fadd ASM pop PCX ASM add PSI,PCX /* restore PSI */ ASM xchg PSI,PDX ASM push PDI ASM add PDI,PAX ASM sub PDI,PCX ASM sub PDI,PBP ASM fld st(0) ASM fadd st,st(2) ASM fsub st,st(2) ASM fst st(5) ASM fmul st,st(3) ASM fxch st(5) ASM fsub ASM fstp POINTER [PDI] ASM fld st(3) ASM pop PDI ASM sub PCX,PBP ASM jge s9 ASM add PDI,PAX ASM fld st(0) ASM fadd st,st(2) ASM fsub st,st(2) ASM fst st(5) ASM fmul st,st(3) ASM fxch st(5) ASM fsub ASM fstp POINTER [PDI] ASM fld st(3) ASM add PDI,PBP ASM fstp POINTER [PDI] ASM pop PSI ASM pop PDI ASM pop PBP for (ij=rn;ij<(int)(z->len&MR_OBITS);ij++) z->w[ij]=0.0; z->len=rn; for (ij=0;ijw[ij]=w0->w[ij+rn]; if (z->w[rn-1]==0.0) mr_lzero(z); } #endif #endif #endif