/***************************************************************************
Copyright 2013 CertiVox UK Ltd.

This file is part of CertiVox MIRACL Crypto SDK.

The CertiVox MIRACL Crypto SDK provides developers with an
extensive and efficient set of cryptographic functions.
For further information about its features and functionalities please
refer to http://www.certivox.com

The CertiVox MIRACL Crypto SDK is free software: you can
redistribute it and/or modify it under the terms of the
GNU Affero General Public License as published by the
Free Software Foundation, either version 3 of the License,
or (at your option) any later version.

The CertiVox MIRACL Crypto SDK is distributed in the hope
that it will be useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public
License along with CertiVox MIRACL Crypto SDK.
If not, see <http://www.gnu.org/licenses/>.

You can be released from the requirements of the license by purchasing
a commercial license. Buying such a license is mandatory as soon as you
develop commercial activities involving the CertiVox MIRACL Crypto SDK
without disclosing the source code of your own applications, or shipping
the CertiVox MIRACL Crypto SDK with a closed source product.
***************************************************************************/
/*
 * MIRACL routines for arithmetic over GF(2^m),
 * mrgf2m.c
 *
 * For algorithms used, see IEEE P1363 Standard, Appendix A
 * unless otherwise stated.
 *
 * The time-critical routines are the multiplication routine multiply2()
 * and (for AFFINE co-ordinates) the modular inverse routine inverse2()
 * and the routines it calls.
 *
 * READ COMMENTS CAREFULLY FOR VARIOUS OPTIMIZATION SUGGESTIONS
 *
 * No assembly language used.
 *
 * Use the utility irp.cpp to generate optimal code for the function
 * reduce2(.) below.
 *
 * Space can be saved by removing unneeded functions and
 * deleting unrequired functionality.
 * For example in reduce2(.) remove code for those irreducible polynomials
 * which will not be used by your code.
 */

#include <stdio.h>
#include "miracl.h"
#ifdef MR_STATIC
#include <string.h>
#endif

#ifdef MR_COUNT_OPS
extern int fpm2,fpi2;
#endif

/* must use /arch:SSE2 in compilation */
#ifdef _M_IX86_FP
#if _M_IX86_FP >= 2
#define MR_SSE2_INTRINSICS
#endif
#endif

/* must use -msse2 in compilation */
#ifdef __SSE2__
#define MR_SSE2_INTRINSICS
#endif

#ifdef MR_SSE2_INTRINSICS
#ifdef __GNUC__
#include <emmintrin.h>
#else
#include <emmintrin.h>
#endif
#if MIRACL==64
#define MR_SSE2_64 /* Can use SSE2 registers for 64-bit manipulations */
#endif
#endif

#ifndef MR_NOFULLWIDTH

/* This does not make sense using floating-point! */

/* This is extremely time-critical, and expensive */
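/* For reference: every mr_mul2() variant below computes the same thing, the
   "carry-free" (polynomial) product over GF(2) of two full words, returning
   the high word and depositing the low word in *r. A minimal portable sketch,
   one bit at a time (ref_mul2 is an illustrative name, not part of MIRACL):

static mr_small ref_mul2(mr_small a,mr_small b,mr_small *r)
{
    mr_small hi=0,lo=0;
    int k;
    for (k=0;k<MIRACL;k++)
    {
        if ((b>>k)&1)
        {
            lo^=(a<<k);
            if (k>0) hi^=(a>>(MIRACL-k));
        }
    }
    *r=lo;
    return hi;
}

   The optimized variants below get the same answer with look-up tables and
   unrolling. */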
/* Some experimental MMX code for x86-32. Seems to be slower than the
   standard code (on a PIV anyway).. */

#ifdef MR_MMX_x86_32

#ifdef __GNUC__
#include <mmintrin.h>
#else
#include <mmintrin.h>
#endif

static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    __m64 rg,tt[4];
    mr_small q;
    tt[0]=_m_from_int(0);
    tt[1]=_m_from_int(a);
    tt[2]=_m_psllqi(tt[1],1);
    tt[3]=_m_pxor(tt[1],tt[2]);

    rg=tt[b&3];
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>2)&3],2));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>4)&3],4));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>6)&3],6));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>8)&3],8));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>10)&3],10));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>12)&3],12));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>14)&3],14));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>16)&3],16));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>18)&3],18));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>20)&3],20));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>22)&3],22));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>24)&3],24));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>26)&3],26));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>28)&3],28));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>30)],30));

    *r=_m_to_int(rg);
    q=_m_to_int(_m_psrlqi(rg,32));
    return q;
}

#else

/* This might be faster on a 16-bit processor with no variable shift
   instructions. The line

       w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

   is just a 1-bit right shift on the hi|lo value - should be really fast
   in assembly language

unsigned short mr_mul2(unsigned short x,unsigned short y,unsigned short *r)
{
    unsigned short lo,hi,bit,w;
    hi=0; lo=x;
    bit=-(lo&1); lo>>=1;
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    hi^=(y&bit);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
    *r=lo;
    return hi;
}
*/

/* This might be faster on an 8-bit processor with no variable shift
   instructions. The line

       w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

   is just a 1-bit right shift on the hi|lo value - should be really fast
   in assembly language

unsigned char mr_mul2(unsigned char x,unsigned char y,unsigned char *r)
{
    unsigned char lo,hi,bit,w;
    hi=0; lo=x;
    bit=-(lo&1); lo>>=1;
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
    hi^=(y&bit);
    bit=-(lo&1); w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
    hi^=(y&bit);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
    *r=lo;
    return hi;
}
*/

/* wouldn't it be nice if instruction sets supported a one cycle
   "carry-free" multiplication instruction ... The SmartMips does -
   it's called maddp */

#ifndef MR_COMBA2

#if MIRACL==8

/* maybe use a small precomputed look-up table? */
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    static const mr_small look[256]=
    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
     0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,
     0,3,6,5,12,15,10,9,24,27,30,29,20,23,18,17,
     0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,
     0,5,10,15,20,17,30,27,40,45,34,39,60,57,54,51,
     0,6,12,10,24,30,20,18,48,54,60,58,40,46,36,34,
     0,7,14,9,28,27,18,21,56,63,54,49,36,35,42,45,
     0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,
     0,9,18,27,36,45,54,63,72,65,90,83,108,101,126,119,
     0,10,20,30,40,34,60,54,80,90,68,78,120,114,108,102,
     0,11,22,29,44,39,58,49,88,83,78,69,116,127,98,105,
     0,12,24,20,48,60,40,36,96,108,120,116,80,92,72,68,
     0,13,26,23,52,57,46,35,104,101,114,127,92,81,70,75,
     0,14,28,18,56,54,36,42,112,126,108,98,72,70,84,90,
     0,15,30,17,60,51,34,45,120,119,102,105,68,75,90,85};
    mr_small x1,y0,m,p,q;
    x1=a&0xf0;
    y0=b&0x0f;
    a<<=4;
    b>>=4;
    p=look[(a|y0)];
    q=look[(x1|b)];
    m=look[a^b^x1^y0]^p^q; /* Karatsuba! */
    p^=(m<<4);
    q^=(m>>4);
    *r=p;
    return q;
}

#else

#ifdef MR_SSE2_64

static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    int i,j;
    __m128i pp,tt[16],m;
    m=_mm_set_epi32(0,0,0xf0<<24,0);
    tt[0]=_mm_setzero_si128();
    tt[1]=_mm_loadl_epi64((__m128i *)&a);
    tt[2]=_mm_xor_si128(_mm_slli_epi64(tt[1],1),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[1]),1),7));
    tt[4]=_mm_xor_si128(_mm_slli_epi64(tt[1],2),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[1]),1),6));
    tt[8]=_mm_xor_si128(_mm_slli_epi64(tt[1],3),_mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m,tt[1]),1),5));
    tt[3]=_mm_xor_si128(tt[1],tt[2]);
    tt[5]=_mm_xor_si128(tt[1],tt[4]);
    tt[6]=_mm_xor_si128(tt[2],tt[4]);
    tt[7]=_mm_xor_si128(tt[6],tt[1]);
    tt[9]=_mm_xor_si128(tt[8],tt[1]);
    tt[10]=_mm_xor_si128(tt[8],tt[2]);
    tt[11]=_mm_xor_si128(tt[10],tt[1]);
    tt[12]=_mm_xor_si128(tt[8],tt[4]);
    tt[13]=_mm_xor_si128(tt[12],tt[1]);
    tt[14]=_mm_xor_si128(tt[8],tt[6]);
    tt[15]=_mm_xor_si128(tt[14],tt[1]);

/* Thanks to Darrel Hankerson, who pointed out an optimization for
   this code ... */
    i=(int)(b&0xF);
    j=(int)((b>>4)&0xF);
    pp=_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(tt[j],8),60)));
    i=(int)((b>>8)&0xF);
    j=(int)((b>>12)&0xF);
    pp=_mm_xor_si128(pp,_mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(tt[j],8),60))),1));
    i=(int)((b>>16)&0xF);
    j=(int)((b>>20)&0xF);
    pp=_mm_xor_si128(pp,_mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(tt[j],8),60))),2));
    i=(int)((b>>24)&0xF);
    j=(int)((b>>28)&0xF);
    pp=_mm_xor_si128(pp,_mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(tt[j],8),60))),3));
    i=(int)((b>>32)&0xF);
    j=(int)((b>>36)&0xF);
    pp=_mm_xor_si128(pp,_mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(tt[j],8),60))),4));
    i=(int)((b>>40)&0xF);
    j=(int)((b>>44)&0xF);
    pp=_mm_xor_si128(pp,_mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(tt[j],8),60))),5));
    i=(int)((b>>48)&0xF);
    j=(int)((b>>52)&0xF);
    pp=_mm_xor_si128(pp,_mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(tt[j],8),60))),6));
    i=(int)((b>>56)&0xF);
    j=(int)(b>>60);
    pp=_mm_xor_si128(pp,_mm_slli_si128(_mm_xor_si128(tt[i],_mm_or_si128(_mm_slli_epi64(tt[j],4),_mm_srli_epi64(_mm_slli_si128(tt[j],8),60))),7));

    *r=((unsigned long long *)&pp)[0];
    return ((unsigned long long *)&pp)[1];
}

#else

static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    int k;
    mr_small kb,t[16];
    mr_small x,q,p;
    mr_utype tb0;
#if MIRACL > 32
    mr_utype tb1,tb2;
#endif
    kb=b;
#if MIRACL <= 32
    t[0]=0;                   /* small look up table */
    t[3]=t[2]=a<<1;           /* it can overflow.... */
    t[1]=t[2]>>1;
    t[3]^=t[1];
    tb0=(mr_utype)(a&TOPBIT); /* remember top bit */
    tb0>>=M1;                 /* all ones if top bit is one */
#else
    t[0]=0;                   /* larger look-up table */
    t[8]=a<<3;
    t[4]=t[8]>>1;
    t[2]=t[4]>>1;
    t[1]=t[2]>>1;
    t[3]=t[5]=t[7]=t[9]=t[11]=t[13]=t[15]=t[1];
    t[3]^=t[2];
    t[5]^=t[4];
    t[9]^=t[8];
    t[6]=t[3]<<1;
    t[7]^=t[6];
    t[10]=t[5]<<1;
    t[11]^=t[10];
    t[12]=t[6]<<1;
    t[13]^=t[12];
    t[14]=t[7]<<1;
    t[15]^=t[14];
    tb0=(a&TOPBIT);           /* remember top bits */
    tb0>>=M1;                 /* all bits one, if this bit is set in a */
    tb1=(a&SECBIT)<<1;
    tb1>>=M1;
    tb2=(a&THDBIT)<<2;
    tb2>>=M1;
#endif

#if MIRACL == 8
#define UNWOUNDM
    p=q=t[b&3];    q>>=2;
    x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2;
    x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2;
    x=t[(b>>6)];   q^=x; p^=(x<<6); q>>=2;
#endif

#if MIRACL == 16
#define UNWOUNDM
    p=q=t[b&3];     q>>=2;
    x=t[(b>>2)&3];  q^=x; p^=(x<<2);  q>>=2;
    x=t[(b>>4)&3];  q^=x; p^=(x<<4);  q>>=2;
    x=t[(b>>6)&3];  q^=x; p^=(x<<6);  q>>=2;
    x=t[(b>>8)&3];  q^=x; p^=(x<<8);  q>>=2;
    x=t[(b>>10)&3]; q^=x; p^=(x<<10); q>>=2;
    x=t[(b>>12)&3]; q^=x; p^=(x<<12); q>>=2;
    x=t[(b>>14)];   q^=x; p^=(x<<14); q>>=2;
#endif

#if MIRACL == 32
#define UNWOUNDM
    p=q=t[b&3];     q>>=2;
    x=t[(b>>2)&3];  q^=x; p^=(x<<2);  q>>=2; /* 8 ASM 80386 instructions */
    x=t[(b>>4)&3];  q^=x; p^=(x<<4);  q>>=2; /* but only 4 ARM instructions! */
    x=t[(b>>6)&3];  q^=x; p^=(x<<6);  q>>=2;
    x=t[(b>>8)&3];  q^=x; p^=(x<<8);  q>>=2;
    x=t[(b>>10)&3]; q^=x; p^=(x<<10); q>>=2;
    x=t[(b>>12)&3]; q^=x; p^=(x<<12); q>>=2;
    x=t[(b>>14)&3]; q^=x; p^=(x<<14); q>>=2;
    x=t[(b>>16)&3]; q^=x; p^=(x<<16); q>>=2;
    x=t[(b>>18)&3]; q^=x; p^=(x<<18); q>>=2;
    x=t[(b>>20)&3]; q^=x; p^=(x<<20); q>>=2;
    x=t[(b>>22)&3]; q^=x; p^=(x<<22); q>>=2;
    x=t[(b>>24)&3]; q^=x; p^=(x<<24); q>>=2;
    x=t[(b>>26)&3]; q^=x; p^=(x<<26); q>>=2;
    x=t[(b>>28)&3]; q^=x; p^=(x<<28); q>>=2;
    x=t[(b>>30)];   q^=x; p^=(x<<30); q>>=2;
#endif

#if MIRACL == 64
#define UNWOUNDM
    p=q=t[b&0xf];     q>>=4;
    x=t[(b>>4)&0xf];  q^=x; p^=(x<<4);  q>>=4;
    x=t[(b>>8)&0xf];  q^=x; p^=(x<<8);  q>>=4;
    x=t[(b>>12)&0xf]; q^=x; p^=(x<<12); q>>=4;
    x=t[(b>>16)&0xf]; q^=x; p^=(x<<16); q>>=4;
    x=t[(b>>20)&0xf]; q^=x; p^=(x<<20); q>>=4;
    x=t[(b>>24)&0xf]; q^=x; p^=(x<<24); q>>=4;
    x=t[(b>>28)&0xf]; q^=x; p^=(x<<28); q>>=4;
    x=t[(b>>32)&0xf]; q^=x; p^=(x<<32); q>>=4;
    x=t[(b>>36)&0xf]; q^=x; p^=(x<<36); q>>=4;
    x=t[(b>>40)&0xf]; q^=x; p^=(x<<40); q>>=4;
    x=t[(b>>44)&0xf]; q^=x; p^=(x<<44); q>>=4;
    x=t[(b>>48)&0xf]; q^=x; p^=(x<<48); q>>=4;
    x=t[(b>>52)&0xf]; q^=x; p^=(x<<52); q>>=4;
    x=t[(b>>56)&0xf]; q^=x; p^=(x<<56); q>>=4;
    x=t[(b>>60)];     q^=x; p^=(x<<60); q>>=4;
#endif

#ifndef UNWOUNDM
    q=p=(mr_small)0;
    for (k=0;k<MIRACL;k+=8)
    { /* eight bits of b at a time */
        q^=(t[b&3]); b>>=2; p>>=2; p|=q<<M2; q>>=2;
        q^=(t[b&3]); b>>=2; p>>=2; p|=q<<M2; q>>=2;
        q^=(t[b&3]); b>>=2; p>>=2; p|=q<<M2; q>>=2;
        q^=(t[b&3]); b>>=2; p>>=2; p|=q<<M2; q>>=2;
    }
#endif

#if MIRACL <= 32
    p^=(tb0&(kb<<M1)); q^=(tb0&(kb>>1)); /* don't break pipeline.. */
#else
    p^=(tb0&(kb<<M1)); q^=(tb0&(kb>>1));
    p^=(tb1&(kb<<M2)); q^=(tb1&(kb>>2));
    p^=(tb2&(kb<<M3)); q^=(tb2&(kb>>3));
#endif
    *r=p;
    return q;
}
#endif
#endif
#endif
#endif

static int numbits(big x)
{ /* return degree of x */
    mr_small *gx=x->w,bit=TOPBIT;
    int m,k=x->len;
    if (k==0) return 0;
    m=k*MIRACL;
    while (!(gx[k-1]&bit))
    {
        m--;
        bit>>=1;
    }
    return m;
}

int degree2(big x)
{ /* returns -1 for x=0 */
    return (numbits(x)-1);
}

/*
static int zerobits(big x)
{
    int m,n,k;
    mr_small *gx,lsb,bit=1;
    k=x->len;
    if (k==0) return (-1);
    gx=x->w;
    for (m=0;m<k;m++)
        if (gx[m]!=0) break;
    n=m*MIRACL; lsb=gx[m];
    while (!(lsb&bit)) { n++; bit<<=1; }
    return n;
}

static void shiftrightbits(big x,int m)
{
    int i,k=x->len;
    int w=m/MIRACL;
    int b=m%MIRACL;
    mr_small *gx=x->w;
    if (k==0 || m==0) return;
    if (w>0)
    {
        for (i=0;i<k-w;i++) gx[i]=gx[i+w];
        for (i=k-w;i<k;i++) gx[i]=0;
        x->len-=w;
    }
    if (b!=0)
    {
        for (i=0;i<k-w-1;i++) gx[i]=(gx[i]>>b)|(gx[i+1]<<(MIRACL-b));
        gx[k-w-1]>>=b;
        if (gx[k-w-1]==0) x->len--;
    }
}
*/

static void shiftleftbits(big x,int m)
{
    int i,k=x->len;
    mr_small j;
    int w=m/MIRACL; /* words */
    int b=m%MIRACL; /* bits  */
    mr_small *gx=x->w;
    if (k==0 || m==0) return;
    if (w>0)
    {
        for (i=k+w-1;i>=w;i--) gx[i]=gx[i-w];
        for (i=w-1;i>=0;i--) gx[i]=0;
        x->len+=w;
    }
    /* time critical */
    if (b!=0)
    {
        j=gx[k+w-1]>>(MIRACL-b);
        if (j!=0)
        {
            x->len++;
            gx[k+w]=j;
        }
        for (i=k+w-1;i>w;i--)
            gx[i]=(gx[i]<<b)|(gx[i-1]>>(MIRACL-b));
        gx[w]<<=b;
    }
}

static void square2(big x,big w)
{ /* w=x*x where x can be NULL so be careful */
    int i,j,n,m;
    mr_small a,t,r,*gw;
    static const mr_small look[16]=
    {0,(mr_small)1<<M8,(mr_small)4<<M8,(mr_small)5<<M8,
     (mr_small)16<<M8,(mr_small)17<<M8,(mr_small)20<<M8,(mr_small)21<<M8,
     (mr_small)64<<M8,(mr_small)65<<M8,(mr_small)68<<M8,(mr_small)69<<M8,
     (mr_small)80<<M8,(mr_small)81<<M8,(mr_small)84<<M8,(mr_small)85<<M8};
    if (x==NULL)
    {
        zero(w);
        return;
    }
    copy(x,w);
    n=w->len;
    if (n==0) return;
    m=n+n;
    w->len=m;
    gw=w->w;
    for (i=n-1;i>=0;i--)
    {
        a=gw[i];
#if MIRACL == 8
#define UNWOUNDS
        gw[i+i]=look[a&0xF];
        gw[i+i+1]=look[(a>>4)];
#endif
#if MIRACL == 16
#define UNWOUNDS
        gw[i+i]=(look[a&0xF]>>8)|look[(a>>4)&0xF];
        gw[i+i+1]=(look[(a>>8)&0xF]>>8)|look[(a>>12)];
#endif
#if MIRACL == 32
#define UNWOUNDS
        gw[i+i]=(look[a&0xF]>>24)|(look[(a>>4)&0xF]>>16)|(look[(a>>8)&0xF]>>8)|look[(a>>12)&0xF];
        gw[i+i+1]=(look[(a>>16)&0xF]>>24)|(look[(a>>20)&0xF]>>16)|(look[(a>>24)&0xF]>>8)|look[(a>>28)];
#endif
#ifndef UNWOUNDS
        r=0;
        for (j=0;j<MIRACL/2;j+=4)
        {
            t=look[a&0xF]; a>>=4;
            r>>=8; r|=t;
        }
        gw[i+i]=r;
        r=0;
        for (j=0;j<MIRACL/2;j+=4)
        {
            t=look[a&0xF]; a>>=4;
            r>>=8; r|=t;
        }
        gw[i+i+1]=r;
#endif
    }
    if (gw[m-1]==0)
    {
        w->len--;
        if (gw[m-2]==0) mr_lzero(w);
    }
}
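/* Squaring in GF(2)[x] is linear - (SUM a_i x^i)^2 = SUM a_i x^(2i) - so
   square2() above just "spreads out" the bits of each word, a nibble at a
   time, via the look[] table. A minimal stand-alone sketch of the same idea
   for a single byte (spread8 is an illustrative name, not part of MIRACL):

static unsigned short spread8(unsigned char a)
{
    static const unsigned char sq[16]=
    {0,1,4,5,16,17,20,21,64,65,68,69,80,81,84,85};
    return (unsigned short)sq[a&0xF]|((unsigned short)sq[a>>4]<<8);
}
*/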
/* Use karatsuba to multiply two polynomials with coefficients in GF(2^m) */

#ifndef MR_STATIC
void karmul2_poly(_MIPD_ int n,big *t,big *x,big *y,big *z)
{
    int m,nd2,nd,md,md2;
    if (n==1)
    { /* finished */
        modmult2(_MIPP_ *x,*y,*z);
        zero(z[1]);
        return;
    }
    if (n==2)
    { /* in-line 2x2 */
        modmult2(_MIPP_ x[0],y[0],z[0]);
        modmult2(_MIPP_ x[1],y[1],z[2]);
        add2(x[0],x[1],t[0]);
        add2(y[0],y[1],t[1]);
        modmult2(_MIPP_ t[0],t[1],z[1]);
        add2(z[1],z[0],z[1]);
        add2(z[1],z[2],z[1]);
        zero(z[3]);
        return;
    }
    if (n==3)
    {
        modmult2(_MIPP_ x[0],y[0],z[0]);
        modmult2(_MIPP_ x[1],y[1],z[2]);
        modmult2(_MIPP_ x[2],y[2],z[4]);
        add2(x[0],x[1],t[0]);
        add2(y[0],y[1],t[1]);
        modmult2(_MIPP_ t[0],t[1],z[1]);
        add2(z[1],z[0],z[1]);
        add2(z[1],z[2],z[1]);
        add2(x[1],x[2],t[0]);
        add2(y[1],y[2],t[1]);
        modmult2(_MIPP_ t[0],t[1],z[3]);
        add2(z[3],z[2],z[3]);
        add2(z[3],z[4],z[3]);
        add2(x[0],x[2],t[0]);
        add2(y[0],y[2],t[1]);
        modmult2(_MIPP_ t[0],t[1],t[0]);
        add2(z[2],t[0],z[2]);
        add2(z[2],z[0],z[2]);
        add2(z[2],z[4],z[2]);
        zero(z[5]);
        return;
    }
    if (n%2==0) { md=nd=n; md2=nd2=n/2; }
    else { nd=n+1; md=n-1; nd2=nd/2; md2=md/2; }
    for (m=0;m

void multiply2(_MIPD_ big x,big y,big w)
{
    int i,j,d,xl,yl,ml;
    mr_small p,q,lo,hi;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    big w0=mr_mip->w0;
    if (x==NULL || y==NULL)
    {
        zero(w);
        return;
    }
    if (x->len==0 || y->len==0)
    {
        zero(w);
        return;
    }
    xl=x->len;
    yl=y->len;
    zero(w0);
#ifdef CLAIRE
 /* Comba method */
    w0->len=xl+yl;
    d=1+mr_mip->M/MIRACL;
    hi=lo=0;
    for (i=0;i<d;i++)
    {
        for (j=0;j<=i;j++)
        {
            q=mr_mul2(x->w[j],y->w[i-j],&p);
            hi^=q; lo^=p;
        }
        w0->w[i]=lo;
        lo=hi; hi=0;
    }
    for (i=d;i<2*d-1;i++)
    {
        for (j=i-d+1;j<d;j++)
        {
            q=mr_mul2(x->w[j],y->w[i-j],&p);
            hi^=q; lo^=p;
        }
        w0->w[i]=lo;
        lo=hi; hi=0;
    }
    w0->w[2*d-1]=lo;
    mr_lzero(w0);
    copy(w0,w);
#else
 /* recommended method as mr_mul2 is so slow... */
    if (xl>=MR_KARATSUBA && yl>=MR_KARATSUBA)
    {
        if (xl>yl) ml=xl;
        else       ml=yl;
        karmul2(ml,mr_mip->w7->w,x->w,y->w,w0->w);
        mr_mip->w7->len=w0->len=2*ml+1;
        mr_lzero(w0);
        mr_lzero(mr_mip->w7);
        copy(w0,w);
        return;
    }
    w0->len=xl+yl;
    for (i=0;i<xl;i++)
    {
        for (j=0;j<yl;j++)
        {
            q=mr_mul2(x->w[i],y->w[j],&p);
            w0->w[i+j]^=p;
            w0->w[i+j+1]^=q;
        }
    }
    mr_lzero(w0);
    copy(w0,w);
#endif
#endif
}

void add2(big x,big y,big z)
{ /* XOR x and y */
    int i,lx,ly,lz,lm;
    mr_small *gx,*gy,*gz;
    if (x==y)
    {
        zero(z);
        return;
    }
    if (y==NULL)
    {
        copy(x,z);
        return;
    }
    else if (x==NULL)
    {
        copy(y,z);
        return;
    }
    if (x==z)
    {
        gy=y->w; gz=z->w;
        ly=y->len; lz=z->len;
        lm=lz; if (ly>lz) lm=ly;
        for (i=0;i<lm;i++) gz[i]^=gy[i];
        z->len=lm;
        if (gz[lm-1]==0) mr_lzero(z);
    }
    else
    {
        gx=x->w; gy=y->w; gz=z->w;
        lx=x->len; ly=y->len; lz=z->len;
        lm=lx; if (ly>lx) lm=ly;
        for (i=0;i<lm;i++) gz[i]=gx[i]^gy[i];
        z->len=lm;
        if (gz[lm-1]==0) mr_lzero(z);
    }
}
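/* Note that multiply2() above returns the raw, unreduced 2m-bit product -
   modmult2() further down is just multiply2() followed by reduce2(). Since
   reduce2() is linear, a sum of products needs only one reduction at the
   end. A sketch of that pattern, assuming t0 and t1 are double-length
   temporaries with the length check disabled (the library routine
   gf2m_dotprod() at the bottom of this file does exactly this):

    multiply2(_MIPP_ a,b,t0);
    multiply2(_MIPP_ c,d,t1);
    add2(t0,t1,t0);
    reduce2(_MIPP_ t0,t0);
*/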
static void remain2(_MIPD_ big y,big x)
{ /* generic "remainder" program. x%=y */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    int my=numbits(y);
    int mx=numbits(x);
    while (mx>=my)
    {
        copy(y,mr_mip->w7);
        shiftleftbits(mr_mip->w7,mx-my);
        add2(x,mr_mip->w7,x);
        mx=numbits(x);
    }
    return;
}

void gcd2(_MIPD_ big x,big y,big g)
{
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (size(y)==0)
    {
        copy(x,g);
        return;
    }
    copy(x,mr_mip->w1);
    copy(y,mr_mip->w2);
    forever
    {
        remain2(_MIPP_ mr_mip->w2,mr_mip->w1);
        if (size(mr_mip->w1)==0) break;
        copy(mr_mip->w1,mr_mip->w3);
        copy(mr_mip->w2,mr_mip->w1);
        copy(mr_mip->w3,mr_mip->w2);
    }
    copy(mr_mip->w2,g);
}

/* See "Elliptic Curves in Cryptography", Blake, Seroussi & Smart,
   Cambridge University Press, 1999, page 20, for this fast reduction
   routine - algorithm II.9 */

void reduce2(_MIPD_ big y,big x)
{ /* reduction wrt the trinomial or pentanomial modulus        *
   * Note that this is linear O(n), and thus not time critical */
    int k1,k2,k3,k4,ls1,ls2,ls3,ls4,rs1,rs2,rs3,rs4,i;
    int M,A,B,C;
    int xl;
    mr_small top,*gx,w;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (x!=y) copy(y,x);
    xl=x->len;
    gx=x->w;
    M=mr_mip->M;
    A=mr_mip->AA;
    if (A==0)
    {
        mr_berror(_MIPP_ MR_ERR_NO_BASIS);
        return;
    }
    B=mr_mip->BB;
    C=mr_mip->CC;

/* If optimizing aggressively, it makes sense to make this code specific to
   a particular field. For example, code like this can be optimized for the
   case m=163. Note that the general-purpose code involves lots of branches -
   these cause breaks in the pipeline, and they are slow. Further loop
   unrolling would be even faster...

   Version 5.10 - optimal code for 32-bit processors and for some NIST
                  curves added
   Version 5.22 - some code for a 16-bit processor..
   Version 5.23 - Use findbase.cpp to find "best" irreducible polynomial
   Version 5.23 - Use utility irp.cpp to automatically generate optimal code
                  for insertion here
*/

#if MIRACL == 8
    if (M==163 && A==7 && B==6 && C==3)
    {
        for (i=xl-1;i>=21;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-19]^=(w>>4)^(w>>5);
            gx[i-20]^=(w>>3)^(w<<4)^(w<<3)^w;
            gx[i-21]^=(w<<5);
        }
        /* XORs= 7 shifts= 6 */
        top=gx[20]>>3;
        gx[0]^=top;
        top<<=3;
        gx[0]^=(top<<4)^(top<<3)^top;
        gx[1]^=(top>>4)^(top>>5);
        gx[20]^=top;
        x->len=21;
        if (gx[20]==0) mr_lzero(x);
        return;
    }
    if (M==271 && A==201)
    {
        for (i=xl-1;i>=34;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-8]^=(w>>6);
            gx[i-9]^=(w<<2);
            gx[i-33]^=(w>>7);
            gx[i-34]^=(w<<1);
        }
        /* XORs= 4 shifts= 4 */
        top=gx[33]>>7;
        gx[0]^=top;
        top<<=7;
        gx[24]^=(top<<2);
        gx[25]^=(top>>6);
        gx[33]^=top;
        x->len=34;
        if (gx[33]==0) mr_lzero(x);
        return;
    }
    if (M==271 && A==207 && B==175 && C==111)
    {
        for (i=xl-1;i>=34;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-8]^=w;
            gx[i-12]^=w;
            gx[i-20]^=w;
            gx[i-33]^=(w>>7);
            gx[i-34]^=(w<<1);
        }
        /* XORs= 5 shifts= 2 */
        top=gx[33]>>7;
        gx[0]^=top;
        top<<=7;
        gx[13]^=top;
        gx[21]^=top;
        gx[25]^=top;
        gx[33]^=top;
        x->len=34;
        if (gx[33]==0) mr_lzero(x);
        return;
    }
#endif

#if MIRACL == 16
    if (M==163 && A==7 && B==6 && C==3)
    {
        for (i=xl-1;i>=11;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-10]^=(w>>3)^(w<<3)^(w<<4)^w;
            gx[i-11]^=(w<<13);
            gx[i-9]^=(w>>12)^(w>>13);
        }
        top=gx[10]>>3;
        gx[0]^=top;
        top<<=3;
        gx[1]^=(top>>12)^(top>>13);
        gx[0]^=(top<<4)^(top<<3)^top;
        gx[10]^=top;
        x->len=11;
        if (gx[10]==0) mr_lzero(x);
        return;
    }
    if (M==271 && A==201 && B==0)
    {
        for (i=xl-1;i>=17;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-17]^=(w<<1);
            gx[i-16]^=(w>>15);
            gx[i-5]^=(w<<10);
            gx[i-4]^=(w>>6);
        }
        top=gx[16]>>15;
        gx[0]^=top;
        top<<=15;
        gx[12]^=(top>>6);
        gx[11]^=(top<<10);
        gx[16]^=top;
        x->len=17;
        if (gx[16]==0) mr_lzero(x);
        return;
    }
    if (M==271 && A==207 && B==175 && C==111)
    {
        for (i=xl-1;i>=17;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-4]^=w;
            gx[i-6]^=w;
            gx[i-10]^=w;
            gx[i-16]^=(w>>15);
            gx[i-17]^=(w<<1);
        }
        /* XORs= 5 shifts= 2 */
        top=gx[16]>>15;
        gx[0]^=top;
        top<<=15;
        gx[6]^=top;
        gx[10]^=top;
        gx[12]^=top;
        gx[16]^=top;
        x->len=17;
        if (gx[16]==0) mr_lzero(x);
        return;
    }
#endif
#if MIRACL == 32
    if (M==127 && A==63)
    {
        for (i=xl-1;i>=4;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-2]^=w;
            gx[i-3]^=(w>>31);
            gx[i-4]^=(w<<1);
        }
        /* XORs= 3 shifts= 2 */
        top=gx[3]>>31;
        gx[0]^=top;
        top<<=31;
        gx[1]^=top;
        gx[3]^=top;
        x->len=4;
        if (gx[3]==0) mr_lzero(x);
        return;
    }
    if (M==163 && A==7 && B==6 && C==3)
    {
        for (i=xl-1;i>=6;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-5]^=((w>>3)^(w<<4)^(w<<3)^w);
            gx[i-6]^=(w<<29);
            gx[i-4]^=((w>>28)^(w>>29));
        }
        top=gx[5]>>3;
        gx[0]^=top;
        top<<=3;
        gx[1]^=(top>>28)^(top>>29);
        gx[0]^=top^(top<<4)^(top<<3);
        gx[5]^=top;
        x->len=6;
        if (gx[5]==0) mr_lzero(x);
        return;
    }
    if (M==163 && A==99 && B==97 && C==3)
    {
        for (i=xl-1;i>=6;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-2]^=w^(w>>2);
            gx[i-3]^=(w<<30);
            gx[i-5]^=(w>>3)^w;
            gx[i-6]^=(w<<29);
        }
        top=gx[5]>>3;
        gx[0]^=top;
        top<<=3;
        gx[0]^=top;
        gx[2]^=(top<<30);
        gx[3]^=top^(top>>2);
        gx[5]^=top;
        x->len=6;
        if (gx[5]==0) mr_lzero(x);
        return;
    }
    if (M==233 && A==74 && B==0)
    {
        for (i=xl-1;i>=8;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-8]^=(w<<23);
            gx[i-7]^=(w>>9);
            gx[i-5]^=(w<<1);
            gx[i-4]^=(w>>31);
        }
        top=gx[7]>>9;
        gx[0]^=top;
        gx[2]^=(top<<10);
        gx[3]^=(top>>22);
        gx[7]&=0x1FF;
        x->len=8;
        if (gx[7]==0) mr_lzero(x);
        return;
    }
    if (M==233 && A==159 && B==0)
    {
        for (i=xl-1;i>=8;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-2]^=(w>>10);
            gx[i-3]^=(w<<22);
            gx[i-7]^=(w>>9);
            gx[i-8]^=(w<<23);
        }
        top=gx[7]>>9;
        gx[0]^=top;
        top<<=9;
        gx[4]^=(top<<22);
        gx[5]^=(top>>10);
        gx[7]^=top;
        x->len=8;
        if (gx[7]==0) mr_lzero(x);
        return;
    }
    if (M==233 && A==201 && B==105 && C==9)
    {
        for (i=xl-1;i>=8;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-1]^=w;
            gx[i-4]^=w;
            gx[i-7]^=(w>>9)^w;
            gx[i-8]^=(w<<23);
        }
        top=gx[7]>>9;
        gx[0]^=top;
        top<<=9;
        gx[0]^=top;
        gx[3]^=top;
        gx[6]^=top;
        gx[7]^=top;
        x->len=8;
        if (gx[7]==0) mr_lzero(x);
        return;
    }
    if (M==103 && A==9 && B==0)
    {
        for (i=xl-1;i>=4;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-3]^=((w>>7)^(w<<2));
            gx[i-4]^=(w<<25);
            gx[i-2]^=(w>>30);
        }
        top=gx[3]>>7;
        gx[0]^=top;
        top<<=7;
        gx[1]^=(top>>30);
        gx[0]^=(top<<2);
        gx[3]^=top;
        x->len=4;
        if (gx[3]==0) mr_lzero(x);
        return;
    }
    if (M==283 && A==12 && B==7 && C==5)
    {
        for (i=xl-1;i>=9;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-9]^=(w<<5)^(w<<10)^(w<<12)^(w<<17);
            gx[i-8]^=(w>>27)^(w>>22)^(w>>20)^(w>>15);
        }
        top=gx[8]>>27;
        gx[0]^=top^(top<<5)^(top<<7)^(top<<12);
        gx[8]&=0x7FFFFFF;
        x->len=9;
        if (gx[8]==0) mr_lzero(x);
        return;
    }
    if (M==283 && A==249 && B==219 && C==27)
    {
        for (i=xl-1;i>=9;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-1]^=(w>>2);
            gx[i-2]^=(w<<30)^w;
            gx[i-8]^=(w>>27)^w;
            gx[i-9]^=(w<<5);
        }
        /* XORs= 6 shifts= 4 */
        top=gx[8]>>27;
        gx[0]^=top;
        top<<=27;
        gx[0]^=top;
        gx[6]^=(top<<30)^top;
        gx[7]^=(top>>2);
        gx[8]^=top;
        x->len=9;
        if (gx[8]==0) mr_lzero(x);
        return;
    }
    if (M==313 && A==121 && B==0)
    {
        for (i=xl-1;i>=10;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-6]^=w;
            gx[i-9]^=(w>>25);
            gx[i-10]^=(w<<7);
        }
        top=gx[9]>>25;
        gx[0]^=top;
        top<<=25;
        gx[3]^=top;
        gx[9]^=top;
        x->len=10;
        if (gx[9]==0) mr_lzero(x);
        return;
    }
    if (M==379 && A==253 && B==251 && C==59)
    {
        for (i=xl-1;i>=12;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-3]^=(w>>30);
            gx[i-4]^=(w<<2)^w;
            gx[i-10]^=w;
            gx[i-11]^=(w>>27);
            gx[i-12]^=(w<<5);
        }
        /* XORs= 6 shifts= 4 */
        top=gx[11]>>27;
        gx[0]^=top;
        top<<=27;
        gx[1]^=top;
        gx[7]^=(top<<2)^top;
        gx[8]^=(top>>30);
        gx[11]^=top;
        x->len=12;
        if (gx[11]==0) mr_lzero(x);
        return;
    }
    if (M==571 && A==10 && B==5 && C==2)
    {
        for (i=xl-1;i>=18;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-18]^=(w<<5)^(w<<7)^(w<<10)^(w<<15);
            gx[i-17]^=(w>>27)^(w>>25)^(w>>22)^(w>>17);
        }
        top=gx[17]>>27;
        gx[0]^=top^(top<<2)^(top<<5)^(top<<10);
        gx[17]&=0x7FFFFFF;
        x->len=18;
        if (gx[17]==0) mr_lzero(x);
        return;
    }
    if (M==571 && A==507 && B==475 && C==417)
    {
        for (i=xl-1;i>=18;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-2]^=w;
            gx[i-3]^=w;
            gx[i-4]^=(w>>26);
            gx[i-5]^=(w<<6);
            gx[i-17]^=(w>>27);
            gx[i-18]^=(w<<5);
        }
        /* XORs= 6 shifts= 4 */
        top=gx[17]>>27;
        gx[0]^=top;
        top<<=27;
        gx[12]^=(top<<6);
        gx[13]^=(top>>26);
        gx[14]^=top;
        gx[15]^=top;
        gx[17]^=top;
        x->len=18;
        if (gx[17]==0) mr_lzero(x);
        return;
    }
    if (M==1223 && A==255)
    {
        for (i=xl-1;i>=39;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-30]^=(w>>8);
            gx[i-31]^=(w<<24);
            gx[i-38]^=(w>>7);
            gx[i-39]^=(w<<25);
        }
        /* XORs= 4 shifts= 4 */
        top=gx[38]>>7;
        gx[0]^=top;
        top<<=7;
        gx[7]^=(top<<24);
        gx[8]^=(top>>8);
        gx[38]^=top;
        x->len=39;
        if (gx[38]==0) mr_lzero(x);
        return;
    }
#endif

#if MIRACL == 64
    if (M==1223 && A==255)
    {
        for (i=xl-1;i>=20;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-15]^=(w>>8);
            gx[i-16]^=(w<<56);
            gx[i-19]^=(w>>7);
            gx[i-20]^=(w<<57);
        }
        top=gx[19]>>7;
        gx[0]^=top;
        top<<=7;
        gx[3]^=(top<<56);
        gx[4]^=(top>>8);
        gx[19]^=top;
        x->len=20;
        if (gx[19]==0) mr_lzero(x);
        return;
    }
    if (M==379 && A==253 && B==251 && C==59)
    {
        for (i=xl-1;i>=6;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-1]^=(w>>62);
            gx[i-2]^=(w<<2)^w;
            gx[i-5]^=(w>>59)^w;
            gx[i-6]^=(w<<5);
        }
        /* XORs= 6 shifts= 4 */
        top=gx[5]>>59;
        gx[0]^=top;
        top<<=59;
        gx[0]^=top;
        gx[3]^=(top<<2)^top;
        gx[4]^=(top>>62);
        gx[5]^=top;
        x->len=6;
        if (gx[5]==0) mr_lzero(x);
        return;
    }
#endif

    k3=k4=rs3=ls3=rs4=ls4=0;
    k1=1+M/MIRACL; /* words from MSB to LSB */
    if (xl<=k1)
    {
        if (numbits(x)<=M) return;
    }
    rs1=M%MIRACL;
    ls1=MIRACL-rs1;
    if (M-A < MIRACL)
    { /* slow way */
        while (numbits(x)>=M+1)
        {
            copy(mr_mip->modulus,mr_mip->w7);
            shiftleftbits(mr_mip->w7,numbits(x)-M-1);
            add2(x,mr_mip->w7,x);
        }
        return;
    }
    k2=1+(M-A)/MIRACL; /* words from MSB to bit */
    rs2=(M-A)%MIRACL;
    ls2=MIRACL-rs2;
    if (B)
    { /* Pentanomial */
        k3=1+(M-B)/MIRACL;
        rs3=(M-B)%MIRACL;
        ls3=MIRACL-rs3;
        k4=1+(M-C)/MIRACL;
        rs4=(M-C)%MIRACL;
        ls4=MIRACL-rs4;
    }
    for (i=xl-1;i>=k1;i--)
    {
        w=gx[i]; gx[i]=0;
        if (rs1==0) gx[i-k1+1]^=w;
        else
        {
            gx[i-k1+1]^=(w>>rs1);
            gx[i-k1]^=(w<<ls1);
        }
        if (rs2==0) gx[i-k2+1]^=w;
        else
        {
            gx[i-k2+1]^=(w>>rs2);
            gx[i-k2]^=(w<<ls2);
        }
        if (B)
        {
            if (rs3==0) gx[i-k3+1]^=w;
            else
            {
                gx[i-k3+1]^=(w>>rs3);
                gx[i-k3]^=(w<<ls3);
            }
            if (rs4==0) gx[i-k4+1]^=w;
            else
            {
                gx[i-k4+1]^=(w>>rs4);
                gx[i-k4]^=(w<<ls4);
            }
        }
    }
    top=gx[k1-1]>>rs1;
    if (top!=0)
    {
        gx[0]^=top;
        top<<=rs1;
        if (rs2==0) gx[k1-k2]^=top;
        else
        {
            gx[k1-k2]^=(top>>rs2);
            if (k1>k2) gx[k1-k2-1]^=(top<<ls2);
        }
        if (B)
        {
            if (rs3==0) gx[k1-k3]^=top;
            else
            {
                gx[k1-k3]^=(top>>rs3);
                if (k1>k3) gx[k1-k3-1]^=(top<<ls3);
            }
            if (rs4==0) gx[k1-k4]^=top;
            else
            {
                gx[k1-k4]^=(top>>rs4);
                if (k1>k4) gx[k1-k4-1]^=(top<<ls4);
            }
        }
        gx[k1-1]^=top;
    }
    x->len=k1;
    if (gx[k1-1]==0) mr_lzero(x);
}

void incr2(big x,int n,big w)
{ /* increment x by small amount */
    if (x!=w) copy(x,w);
    if (n==0) return;
    if (w->len==0)
    {
        w->len=1;
        w->w[0]=n;
    }
    else
    {
        w->w[0]^=(mr_small)n;
        if (w->len==1 && w->w[0]==0) w->len=0;
    }
}

void modsquare2(_MIPD_ big x,big w)
{ /* w=x*x mod f */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    square2(x,mr_mip->w0);
    reduce2(_MIPP_ mr_mip->w0,mr_mip->w0);
    copy(mr_mip->w0,w);
}

/* Experimental code for GF(2^103) modular multiplication *
 * Inspired by Robert Harley's ECDL code                  */

#ifdef SP103

#ifdef __GNUC__
#include <emmintrin.h>
#else
#include <emmintrin.h>
#endif

void modmult2(_MIPD_ big x,big y,big w)
{
    int i,j;
    mr_small b;
    __m128i t[16];
    __m128i m,r,s,p,q,xe,xo;
    __m64 a3,a2,a1,a0,top;

    if (x==y)
    {
        modsquare2(_MIPP_ x,w);
        return;
    }
    if (x->len==0 || y->len==0)
    {
        zero(w);
        return;
    }
#ifdef MR_COUNT_OPS
    fpm2++;
#endif
    m=_mm_set_epi32(0,0,0xff<<24,0); /* shifting mask */

 /* precompute a small table */

    t[0]=_mm_set1_epi32(0);
    xe=_mm_set_epi32(0,x->w[2],0,x->w[0]);
    xo=_mm_set_epi32(0,x->w[3],0,x->w[1]);
    t[1]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[2]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[3]=_mm_xor_si128(t[2],t[1]);
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[4]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[5]=_mm_xor_si128(t[4],t[1]);
    t[6]=_mm_xor_si128(t[4],t[2]);
    t[7]=_mm_xor_si128(t[4],t[3]);
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[8]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[9]=_mm_xor_si128(t[8],t[1]);
    t[10]=_mm_xor_si128(t[8],t[2]);
    t[11]=_mm_xor_si128(t[8],t[3]);
    t[12]=_mm_xor_si128(t[8],t[4]);
    t[13]=_mm_xor_si128(t[8],t[5]);
    t[14]=_mm_xor_si128(t[8],t[6]);
    t[15]=_mm_xor_si128(t[8],t[7]);

    b=y->w[0];
    i=b&0xf; j=(b>>4)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4); /* net shift left 4 */
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    p=q=r;
    q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,1);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,2);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28);
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,3);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    b=y->w[1];
    i=(b)&0xf; j=(b>>4)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,4);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,5);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,6);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28);
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,7);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    b=y->w[2];
    i=(b)&0xf; j=(b>>4)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,8);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,9);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,10);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);
    i=(b>>24)&0xf; j=(b>>28);
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,11);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    b=y->w[3];
    i=(b)&0xf; j=(b>>4)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,12);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,4); /* only 103 bits, so we are done */

 /* modular reduction - x^103+x^9+1 */

    a0=_mm_movepi64_pi64(p);
    a1=_mm_movepi64_pi64(_mm_srli_si128(p,8));
    a2=_mm_movepi64_pi64(q);
    a3=_mm_movepi64_pi64(_mm_srli_si128(q,8));

    a2=_m_pxor(a2,_m_psrlqi(a3,39));
    a2=_m_pxor(a2,_m_psrlqi(a3,30));
    a1=_m_pxor(a1,_m_psllqi(a3,25));
    a1=_m_pxor(a1,_m_psllqi(a3,34));
    a1=_m_pxor(a1,_m_psrlqi(a2,39));
    a1=_m_pxor(a1,_m_psrlqi(a2,30));
    a0=_m_pxor(a0,_m_psllqi(a2,25));
    a0=_m_pxor(a0,_m_psllqi(a2,34));
    top=_m_psrlqi(a1,39);
    a0=_m_pxor(a0,top);
    top=_m_psllqi(top,39);
    a0=_m_pxor(a0,_m_psrlqi(top,30));
    a1=_m_pxor(a1,top);

    if (w->len>4) zero(w);
    w->w[0]=_m_to_int(a0);
    a0=_m_psrlqi(a0,32);
    w->w[1]=_m_to_int(a0);
    w->w[2]=_m_to_int(a1);
    a1=_m_psrlqi(a1,32);
    w->w[3]=_m_to_int(a1);
    w->len=4;
    if (w->w[3]==0) mr_lzero(w);
    _m_empty();
}
#endif

#ifdef SP79

#ifdef __GNUC__
#include <emmintrin.h>
#else
#include <emmintrin.h>
#endif

void modmult2(_MIPD_ big x,big y,big w)
{
    int i,j;
    mr_small b;
    __m128i t[16];
    __m128i m,r,s,p,q,xe,xo;
    __m64 a2,a1,a0,top;

    if (x==y)
    {
        modsquare2(_MIPP_ x,w);
        return;
    }
#ifdef MR_COUNT_OPS
    fpm2++;
#endif
    if (x->len==0 || y->len==0)
    {
        zero(w);
        return;
    }
    m=_mm_set_epi32(0,0,0xff<<24,0); /* shifting mask */

 /* precompute a small table */

    t[0]=_mm_set1_epi32(0);
    xe=_mm_set_epi32(0,x->w[2],0,x->w[0]);
    xo=_mm_set_epi32(0,0,0,x->w[1]);
    t[1]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[2]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[3]=_mm_xor_si128(t[2],t[1]);
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[4]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[5]=_mm_xor_si128(t[4],t[1]);
    t[6]=_mm_xor_si128(t[4],t[2]);
    t[7]=_mm_xor_si128(t[4],t[3]);
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[8]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[9]=_mm_xor_si128(t[8],t[1]);
    t[10]=_mm_xor_si128(t[8],t[2]);
    t[11]=_mm_xor_si128(t[8],t[3]);
    t[12]=_mm_xor_si128(t[8],t[4]);
    t[13]=_mm_xor_si128(t[8],t[5]);
    t[14]=_mm_xor_si128(t[8],t[6]);
    t[15]=_mm_xor_si128(t[8],t[7]);

    b=y->w[0];
    i=b&0xf; j=(b>>4)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4); /* net shift left 4 */
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    p=q=r;
    q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,1);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,2);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28);
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,3);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);
    b=y->w[1];
    i=(b)&0xf; j=(b>>4)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,4);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,5);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,6);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28);
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,7);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    b=y->w[2];
    i=(b)&0xf; j=(b>>4)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,8);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf;
    r=t[j];
    s=_mm_and_si128(r,m);
    r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1);
    s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s);
    r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r);
    r=_mm_slli_si128(r,9);
    p=_mm_xor_si128(p,r);
    q=_mm_srli_si128(q,7); /* only 79 bits, so we are done */

 /* modular reduction - x^79+x^9+1 */

    a0=_mm_movepi64_pi64(p);
    a1=_mm_movepi64_pi64(_mm_srli_si128(p,8));
    a2=_mm_movepi64_pi64(q);

    a1=_m_pxor(a1,_m_psrlqi(a2,15));
    a1=_m_pxor(a1,_m_psrlqi(a2,6));
    a0=_m_pxor(a0,_m_psllqi(a2,49));
    a0=_m_pxor(a0,_m_psllqi(a2,58));
    top=_m_psrlqi(a1,15);
    a0=_m_pxor(a0,top);
    top=_m_psllqi(top,15);
    a0=_m_pxor(a0,_m_psrlqi(top,6));
    a1=_m_pxor(a1,top);

    w->w[2]=_m_to_int(a1);
    if (w->len>3)
    { /* Yes I know it's crazy, but it's needed to fix the broken /O2 optimizer */
        for (i=3;i<w->len;i++) w->w[i]=0;
    }
    w->w[0]=_m_to_int(a0);
    a0=_m_psrlqi(a0,32);
    w->w[1]=_m_to_int(a0);
    w->len=3;
    if (w->w[2]==0) mr_lzero(w);
    _m_empty();
}
#endif

#ifndef SP103
#ifndef SP79
/*#ifndef SP271 */

void modmult2(_MIPD_ big x,big y,big w)
{ /* w=x*y mod f */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (x==NULL || y==NULL)
    {
        zero(w);
        return;
    }
    if (x==y)
    {
        modsquare2(_MIPP_ x,w);
        return;
    }
    if (y->len==0)
    {
        zero(w);
        return;
    }
    if (y->len==1)
    {
        if (y->w[0]==1)
        {
            copy(x,w);
            return;
        }
    }
#ifdef MR_COUNT_OPS
    fpm2++;
#endif
    multiply2(_MIPP_ x,y,mr_mip->w0);
    reduce2(_MIPP_ mr_mip->w0,mr_mip->w0);
    copy(mr_mip->w0,w);
}
#endif
#endif
/*#endif*/
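/* Since x^(2^m)=x for every x in GF(2^m), the square root of x is
   x^(2^(m-1)), and can always be obtained by m-1 modular squarings - that is
   exactly the "slow" fallback used inside sqroot2() below when M, A (B and C)
   are not all odd. A minimal sketch of just that fallback (slow_sqrt2 is an
   illustrative name, not a MIRACL export):

static void slow_sqrt2(_MIPD_ big x,big y)
{
    int i;
    copy(x,y);
    for (i=1;i<mr_mip->M;i++) modsquare2(_MIPP_ y,y);
}
*/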
/* Will be *much* faster if M,A,(B and C) are all odd */
/* This could/should be optimized for a particular irreducible polynomial,
   and fixed A, B and C */

void sqroot2(_MIPD_ big x,big y)
{
    int i,M,A,B,C;
    int k,n,h,s,a,aw,ab,bw,bb,cw,cb;
#if MIRACL != 32
    int mm,j;
#endif
    mr_small *wk,w,we,wo;
    BOOL slow;
    /* Using Harley's trick */
    static const mr_small evens[16]=
    {0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15};
    static const mr_small odds[16]=
    {0,4,1,5,8,12,9,13,2,6,3,7,10,14,11,15};
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    M=mr_mip->M;
    A=mr_mip->AA;
    if (A==0)
    {
        mr_berror(_MIPP_ MR_ERR_NO_BASIS);
        return;
    }
    B=mr_mip->BB;
    C=mr_mip->CC;
    slow=FALSE;
    if (B)
    {
        if (M%2!=1 || A%2!=1 || B%2!=1 || C%2!=1) slow=TRUE;
    }
    else
    {
        if (M%2!=1 || A%2!=1) slow=TRUE;
    }
    if (slow)
    {
        copy(x,y);
        for (i=1;i<mr_mip->M;i++) modsquare2(_MIPP_ y,y);
        return;
    }
    bb=cb=cw=bw=0;

    /* M, A (B and C) are all odd - so use the fast Fong, Hankerson, Lopez
       and Menezes method */

    if (x==y)
    {
        copy(x,mr_mip->w0);
        wk=mr_mip->w0->w;
    }
    else wk=x->w;
    zero(y);
#if MIRACL==8
    if (M==271 && A==207 && B==175 && C==111)
    {
        y->len=34;
        for (i=0;i<34;i++)
        {
            n=i/2;
            w=wk[i];
            we=evens[((w&0x5)+((w&0x50)>>3))];
            wo=odds[((w&0xA)+((w&0xA0)>>5))];
            i++;
            w=wk[i];
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
            y->w[n]^=we;
            y->w[n+17]=wo;
            y->w[n+13]^=wo;
            y->w[n+11]^=wo;
            y->w[n+7]^=wo;
        }
        if (y->w[33]==0) mr_lzero(y);
        return;
    }
#endif
#if MIRACL==32
    if (M==1223 && A==255)
    {
        y->len=39;
        for (i=0;i<39;i++)
        {
            n=i/2;
            w=wk[i];
            we=evens[((w&0x5)+((w&0x50)>>3))];
            wo=odds[((w&0xA)+((w&0xA0)>>5))];
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
            i++;
            if (i<39)
            {
                w=wk[i];
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
                w>>=8;
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
                w>>=8;
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
                w>>=8;
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
            }
            y->w[n]^=we;
            y->w[20+n-1]^=wo<<4;
            y->w[20+n]^=wo>>28;
            y->w[n+4]^=wo;
        }
        if (y->w[38]==0) mr_lzero(y);
        return;
    }
#endif
#if MIRACL==64
    if (M==1223 && A==255)
    {
        y->len=20;
        for (i=0;i<20;i++)
        {
            n=i/2;
            w=wk[i];
            we=evens[((w&0x5)+((w&0x50)>>3))];
            wo=odds[((w&0xA)+((w&0xA0)>>5))];
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
            i++;
            w=wk[i];
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<32;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<32;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<36;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<36;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<40;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<40;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<44;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<44;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<48;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<48;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<52;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<52;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<56;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<56;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<60;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<60;
            y->w[n]^=we;
            y->w[10+n-1]^=wo<<36;
            y->w[10+n]^=wo>>28;
            y->w[n+2]^=wo;
        }
        if (y->w[19]==0) mr_lzero(y);
        return;
    }
#endif
    k=1+(M/MIRACL);
    h=(k+1)/2;
    a=(A+1)/2; aw=a/MIRACL; ab=a%MIRACL;
    if (B)
    {
        a=(B+1)/2; bw=a/MIRACL; bb=a%MIRACL;
        a=(C+1)/2; cw=a/MIRACL; cb=a%MIRACL;
    }
    s=h*MIRACL-1-(M-1)/2;
    y->len=k;
    for (i=0;i<k;i++)
    {
        n=i/2;
        w=wk[i];
#if MIRACL==32
        we=evens[((w&0x5)+((w&0x50)>>3))];
        wo=odds[((w&0xA)+((w&0xA0)>>5))];
        w>>=8;
        we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
        wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
        w>>=8;
        we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
        wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
        w>>=8;
        we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
        wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
#else
        mm=0; we=wo=0;
        for (j=0;j<MIRACL/8;j++)
        {
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<mm;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<mm;
            mm+=4; w>>=8;
        }
#endif
        i++;
        if (i<k)
        {
            w=wk[i];
#if MIRACL==32
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
#else
            for (j=0;j<MIRACL/8;j++)
            {
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<mm;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<mm;
                mm+=4; w>>=8;
            }
#endif
        }
        y->w[n]^=we;
        if (s==0) y->w[h+n]=wo;
        else
        {
            y->w[h+n-1]^=wo<<(MIRACL-s);
            y->w[h+n]^=wo>>s; /* abut odd bits to even */
        }
        if (ab==0) y->w[n+aw]^=wo;
        else
        {
            y->w[n+aw]^=wo<<ab;
            y->w[n+aw+1]^=wo>>(MIRACL-ab);
        }
        if (B)
        {
            if (bb==0) y->w[n+bw]^=wo;
            else
            {
                y->w[n+bw]^=wo<<bb;
                y->w[n+bw+1]^=wo>>(MIRACL-bb);
            }
            if (cb==0) y->w[n+cw]^=wo;
            else
            {
                y->w[n+cw]^=wo<<cb;
                y->w[n+cw+1]^=wo>>(MIRACL-cb);
            }
        }
    }
    if (y->w[k-1]==0) mr_lzero(y);
}

#ifndef MR_STATIC
void power2(_MIPD_ big x,int m,big w)
{ /* w=x^m mod f. Could be optimised a lot, but not time critical for me */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    copy(x,mr_mip->w1);
    convert(_MIPP_ 1,w);
    forever
    {
        if (m%2!=0) modmult2(_MIPP_ w,mr_mip->w1,w);
        m/=2;
        if (m==0) break;
        modsquare2(_MIPP_ mr_mip->w1,mr_mip->w1);
    }
}
#endif

/* Euclidean Algorithm */

BOOL inverse2(_MIPD_ big x,big w)
{
    mr_small bit;
    int i,j,n,n3,k,n4,mb,mw;
    big t;
    BOOL newword;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (size(x)==0) return FALSE;
    convert(_MIPP_ 1,mr_mip->w1);
    zero(mr_mip->w2);
    copy(x,mr_mip->w3);
    copy(mr_mip->modulus,mr_mip->w4);
    n3=numbits(mr_mip->w3);
    n4=mr_mip->M+1;
#ifdef MR_COUNT_OPS
    fpi2++;
#endif
    while (n3!=1)
    {
        j=n3-n4;
        if (j<0)
        {
            t=mr_mip->w3; mr_mip->w3=mr_mip->w4; mr_mip->w4=t;
            t=mr_mip->w1; mr_mip->w1=mr_mip->w2; mr_mip->w2=t;
            j=-j; n=n3; n3=n4; n4=n;
        }
        mw=j/MIRACL; mb=j%MIRACL;
        if (n3<MIRACL)
        { /* it all fits into one word now */
            mr_mip->w3->w[0]^=mr_mip->w4->w[0]<<mb;
            n3--;
            bit=((mr_small)1<<((n3-1)%MIRACL));
            while (!(mr_mip->w3->w[0]&bit)) { n3--; bit>>=1; }
        }
        else
        {
            k=mr_mip->w3->len;
            if (mb==0)
            {
                for (i=mw;i<k;i++)
                    mr_mip->w3->w[i]^=mr_mip->w4->w[i-mw];
            }
            else
            {
                mr_mip->w3->w[mw]^=mr_mip->w4->w[0]<<mb;
                for (i=mw+1;i<k;i++)
                    mr_mip->w3->w[i]^=((mr_mip->w4->w[i-mw]<<mb)|(mr_mip->w4->w[i-mw-1]>>(MIRACL-mb)));
            }
            newword=FALSE;
            while (mr_mip->w3->w[k-1]==0) {k--; newword=TRUE;}
/*
            bit=mr_mip->w3->w[k-1];
            ASM mov eax,bit
            ASM bsr ecx,eax
            ASM mov shift,ecx
            n3=(k-1)*MIRACL+shift+1;
*/
            if (newword)
            {
                bit=TOPBIT;
                n3=k*MIRACL;
            }
            else
            {
                n3--;
                bit=((mr_small)1<<((n3-1)%MIRACL));
            }
            while (!(mr_mip->w3->w[k-1]&bit)) { n3--; bit>>=1; }
            mr_mip->w3->len=k;
        }
        k=mr_mip->w2->len+mw+1;
        if ((int)mr_mip->w1->len>k) k=mr_mip->w1->len;
        if (mb==0)
        {
            for (i=mw;i<k;i++)
                mr_mip->w1->w[i]^=mr_mip->w2->w[i-mw];
        }
        else
        {
            mr_mip->w1->w[mw]^=mr_mip->w2->w[0]<<mb;
            for (i=mw+1;i<k;i++)
                mr_mip->w1->w[i]^=((mr_mip->w2->w[i-mw]<<mb)|(mr_mip->w2->w[i-mw-1]>>(MIRACL-mb)));
        }
        while (mr_mip->w1->w[k-1]==0) k--;
        mr_mip->w1->len=k;
    }
    copy(mr_mip->w1,w);
    return TRUE;
}
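/* inverse2() is most often consumed as a modular division. A minimal sketch,
   computing w=y/x mod f by inverting and then multiplying (div2 is an
   illustrative name, not a MIRACL export; it assumes w and y are distinct
   variables):

static BOOL div2(_MIPD_ big y,big x,big w)
{
    if (!inverse2(_MIPP_ x,w)) return FALSE;
    modmult2(_MIPP_ w,y,w);
    return TRUE;
}
*/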
/* Schroeppel, Orman, O'Malley, Spatscheck                  *
 * "Almost Inverse" algorithm, Crypto '95                   *
 * More optimization here and in-lining would speed up      *
 * AFFINE mode. I observe that pentanomials would be more   *
 * efficient if C were greater                              */

/*
BOOL inverse2(_MIPD_ big x,big w)
{
    mr_small lsw,*gw;
    int i,n,bits,step,n3,n4,k;
    int k1,k2,k3,k4,ls1,ls2,ls3,ls4,rs1,rs2,rs3,rs4;
    int M,A,B,C;
    big t;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (size(x)==0) return FALSE;
    M=mr_mip->M;
    A=mr_mip->AA;
    if (A==0)
    {
        mr_berror(_MIPP_ MR_ERR_NO_BASIS);
        return FALSE;
    }
    B=mr_mip->BB;
    C=mr_mip->CC;
    convert(_MIPP_ 1,mr_mip->w1);
    zero(mr_mip->w2);
    copy(x,mr_mip->w3);
    copy(mr_mip->modulus,mr_mip->w4);
    bits=zerobits(mr_mip->w3);
    shiftrightbits(mr_mip->w3,bits);
    k=bits;
    n3=numbits(mr_mip->w3);
    n4=M+1;
    if (n3>1) forever
    {
        if (n3<n4)
        {
            t=mr_mip->w3; mr_mip->w3=mr_mip->w4; mr_mip->w4=t;
            t=mr_mip->w1; mr_mip->w1=mr_mip->w2; mr_mip->w2=t;
            n=n3; n3=n4; n4=n;
        }
        add2(mr_mip->w3,mr_mip->w4,mr_mip->w3);
        add2(mr_mip->w1,mr_mip->w2,mr_mip->w1);
        if (n3==n4) n3=numbits(mr_mip->w3);
        bits=zerobits(mr_mip->w3);
        k+=bits;
        n3-=bits;
        if (n3==1) break;
        shiftrightbits(mr_mip->w3,bits);
        shiftleftbits(mr_mip->w2,bits);
    }
    copy(mr_mip->w1,w);
    if (k==0)
    {
        mr_lzero(w);
        return TRUE;
    }
    step=MIRACL;
    if (A<MIRACL) step=A;
    if (B && C<step) step=C;
    gw=w->w;
    while (k>0)
    {
        if (k>step) n=step;
        else n=k;
        if (n==MIRACL) lsw=gw[0];
        else lsw=gw[0]&(((mr_small)1<<n)-1);
        shiftrightbits(w,n);
        k-=n;
        k1=1+(M-n)/MIRACL; rs1=(M-n)%MIRACL; ls1=MIRACL-rs1;
        k2=1+(A-n)/MIRACL; rs2=(A-n)%MIRACL; ls2=MIRACL-rs2;
        if (B)
        {
            k3=1+(B-n)/MIRACL; rs3=(B-n)%MIRACL; ls3=MIRACL-rs3;
            k4=1+(C-n)/MIRACL; rs4=(C-n)%MIRACL; ls4=MIRACL-rs4;
        }
        w->len=k1;
        if (rs1==0) gw[k1-1]^=lsw;
        else
        {
            w->len++;
            gw[k1]^=(lsw>>ls1);
            gw[k1-1]^=(lsw<<rs1);
        }
        if (rs2==0) gw[k2-1]^=lsw;
        else
        {
            gw[k2]^=(lsw>>ls2);
            gw[k2-1]^=(lsw<<rs2);
        }
        if (B)
        {
            if (rs3==0) gw[k3-1]^=lsw;
            else
            {
                gw[k3]^=(lsw>>ls3);
                gw[k3-1]^=(lsw<<rs3);
            }
            if (rs4==0) gw[k4-1]^=lsw;
            else
            {
                gw[k4]^=(lsw>>ls4);
                gw[k4-1]^=(lsw<<rs4);
            }
        }
        mr_lzero(w);
    }
    return TRUE;
}
*/

BOOL multi_inverse2(_MIPD_ int m,big *x,big *w)
{ /* find w[i]=1/x[i] mod f simultaneously, using only one inversion */
    int i;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (m==0) return TRUE;
    if (m<0) return FALSE;
    if (m==1)
    {
        copy(x[0],w[0]);
        return inverse2(_MIPP_ w[0],w[0]);
    }
    convert(_MIPP_ 1,w[0]);
    for (i=1;i<m;i++)
        modmult2(_MIPP_ w[i-1],x[i-1],w[i]);
    modmult2(_MIPP_ w[m-1],x[m-1],mr_mip->w6);
    if (size(mr_mip->w6)==0)
    {
        mr_berror(_MIPP_ MR_ERR_DIV_BY_ZERO);
        return FALSE;
    }
    inverse2(_MIPP_ mr_mip->w6,mr_mip->w6); /* y=1/y */
    copy(x[m-1],mr_mip->w5);
    modmult2(_MIPP_ w[m-1],mr_mip->w6,w[m-1]);
    for (i=m-2;;i--)
    {
        if (i==0)
        {
            modmult2(_MIPP_ mr_mip->w5,mr_mip->w6,w[0]);
            break;
        }
        modmult2(_MIPP_ w[i],mr_mip->w5,w[i]);
        modmult2(_MIPP_ w[i],mr_mip->w6,w[i]);
        modmult2(_MIPP_ mr_mip->w5,x[i],mr_mip->w5);
    }
    return TRUE;
}

#ifndef MR_STATIC
int trace2(_MIPD_ big x)
{
    int i;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    copy(x,mr_mip->w1);
    for (i=1;i<mr_mip->M;i++)
    {
        modsquare2(_MIPP_ mr_mip->w1,mr_mip->w1);
        add2(mr_mip->w1,x,mr_mip->w1);
    }
    return (int)(mr_mip->w1->w[0]&1);
}
#endif

#ifndef MR_NO_RAND
void rand2(_MIPD_ big x)
{ /* random number */
    int i,k;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    zero(x);
    k=1+mr_mip->M/MIRACL;
    x->len=k;
    for (i=0;i<k;i++) x->w[i]=brand(_MIPPO_ );
    mr_lzero(x);
    reduce2(_MIPP_ x,x);
}
#endif

int parity2(big x)
{ /* return LSB */
    if (x->len==0) return 0;
    return (int)(x->w[0]%2);
}

void halftrace2(_MIPD_ big b,big w)
{
    int i,M;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    M=mr_mip->M;
    if (M%2==0) return;
    copy(b,mr_mip->w1);
    copy(b,w);
    for (i=1;i<=(M-1)/2;i++)
    {
        modsquare2(_MIPP_ w,w);
        modsquare2(_MIPP_ w,w);
        add2(w,mr_mip->w1,w);
    }
}

BOOL quad2(_MIPD_ big b,big w)
{ /* Solves x^2 + x = b for a root w       *
   * returns TRUE if a solution exists     *
   * the "other" solution is w+1           */
    int i,M;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    M=mr_mip->M;
    copy(b,mr_mip->w1);
    if (M%2==1) halftrace2(_MIPP_ b,w); /* M is odd, so it's the Half-Trace */
    else
    {
        zero(mr_mip->w2);
        forever
        {
#ifndef MR_NO_RAND
            rand2(_MIPP_ mr_mip->w2);
#else
            incr(_MIPP_ mr_mip->w2,1,mr_mip->w2);
#endif
            zero(w);
            copy(mr_mip->w2,mr_mip->w3);
            for (i=1;i<M;i++)
            {
                modsquare2(_MIPP_ mr_mip->w3,mr_mip->w3);
                modmult2(_MIPP_ mr_mip->w3,mr_mip->w1,mr_mip->w4);
                modsquare2(_MIPP_ w,w);
                add2(w,mr_mip->w4,w);
                add2(mr_mip->w3,mr_mip->w2,mr_mip->w3);
            }
            if (size(mr_mip->w3)!=0) break;
        }
    }
    copy(w,mr_mip->w2);
    modsquare2(_MIPP_ mr_mip->w2,mr_mip->w2);
    add2(mr_mip->w2,w,mr_mip->w2);
    if (mr_compare(mr_mip->w1,mr_mip->w2)==0) return TRUE;
    return FALSE;
}
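/* A quick self-test of quad2(): a solution of w^2+w=b can only exist when
   the trace of b is zero, and substituting a returned root back in must
   reproduce b. A sketch, assuming a basis has already been set up with
   prepare_basis() and that b, w and t are initialised bigs and result an
   int (all illustrative names):

    rand2(_MIPP_ b);
    if (quad2(_MIPP_ b,w))
    {
        copy(w,t);
        modsquare2(_MIPP_ t,t);
        add2(t,w,t);
        result=(mr_compare(t,b)==0 && trace2(_MIPP_ b)==0);
    }
*/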
#ifndef MR_STATIC
void gf2m_dotprod(_MIPD_ int n,big *x,big *y,big w)
{ /* dot product - only one reduction! */
    int i;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    mr_mip->check=OFF;
    zero(mr_mip->w5);
    for (i=0;i<n;i++)
    {
        multiply2(_MIPP_ x[i],y[i],mr_mip->w0);
        add2(mr_mip->w5,mr_mip->w0,mr_mip->w5);
    }
    reduce2(_MIPP_ mr_mip->w5,mr_mip->w5);
    copy(mr_mip->w5,w);
    mr_mip->check=ON;
}
#endif

BOOL prepare_basis(_MIPD_ int m,int a,int b,int c,BOOL check)
{
    int i,k,sh;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (mr_mip->ERNUM) return FALSE;
    if (b==0) c=0;
    if (m==mr_mip->M && a==mr_mip->AA && b==mr_mip->BB && c==mr_mip->CC)
        return TRUE; /* it's already prepared... */

    MR_IN(138)

    if (m<=0 || a<=0 || a>=m || b>=a)
    {
        mr_berror(_MIPP_ MR_ERR_BAD_MODULUS);
        MR_OUT
        return FALSE;
    }
    mr_mip->M=m;
    mr_mip->AA=a;
    mr_mip->BB=0;
    mr_mip->CC=0;
    zero(mr_mip->modulus);
    convert(_MIPP_ 1,mr_mip->one);
    k=1+m/MIRACL;
    if (k>mr_mip->nib)
    {
        mr_berror(_MIPP_ MR_ERR_OVERFLOW);
        MR_OUT
        return FALSE;
    }
    mr_mip->modulus->len=k;
    sh=m%MIRACL;
    mr_mip->modulus->w[k-1]=((mr_small)1<<sh);
    mr_mip->modulus->w[0]^=1;
    mr_mip->modulus->w[a/MIRACL]^=((mr_small)1<<(a%MIRACL));
    if (b!=0)
    {
        mr_mip->BB=b;
        mr_mip->CC=c;
        mr_mip->modulus->w[b/MIRACL]^=((mr_small)1<<(b%MIRACL));
        mr_mip->modulus->w[c/MIRACL]^=((mr_small)1<<(c%MIRACL));
    }
    if (!check)
    {
        MR_OUT
        return TRUE;
    }
    /* check for irreducibility of basis */
    zero(mr_mip->w4);
    mr_mip->w4->len=1;
    mr_mip->w4->w[0]=2; /* f(t) = t */
    for (i=1;i<=m/2;i++)
    {
        modsquare2(_MIPP_ mr_mip->w4,mr_mip->w4);
        incr2(mr_mip->w4,2,mr_mip->w5);
        gcd2(_MIPP_ mr_mip->w5,mr_mip->modulus,mr_mip->w6);
        if (size(mr_mip->w6)!=1)
        {
            mr_berror(_MIPP_ MR_ERR_NOT_IRREDUC);
            MR_OUT
            return FALSE;
        }
    }
    MR_OUT
    return TRUE;
}

#endif
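/* Typical usage of this module (an illustrative sketch only, not part of
   the library): set up the NIST field GF(2^163) with the pentanomial
   x^163+x^7+x^6+x^3+1 using prepare_basis(), then check that a*b multiplied
   by 1/a recovers b. Assumes a single-threaded build.

#include "miracl.h"

int main(void)
{
    big a,b,c;
    mirsys(80,0);
    a=mirvar(0); b=mirvar(0); c=mirvar(0);
    irand(9L);
    if (!prepare_basis(163,7,6,3,TRUE)) return 1;
    rand2(a);
    rand2(b);
    modmult2(a,b,c);
    inverse2(a,a);
    modmult2(c,a,c);
    return (mr_compare(b,c)==0)? 0:1;
}
*/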