/***************************************************************************
*
* Copyright 2013 CertiVox UK Ltd.
*
* This file is part of CertiVox MIRACL Crypto SDK.
*
* The CertiVox MIRACL Crypto SDK provides developers with an
* extensive and efficient set of cryptographic functions.
* For further information about its features and functionalities please
* refer to http://www.certivox.com
*
* The CertiVox MIRACL Crypto SDK is free software: you can
* redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the
* Free Software Foundation, either version 3 of the License,
* or (at your option) any later version.
*
* The CertiVox MIRACL Crypto SDK is distributed in the hope
* that it will be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public
* License along with CertiVox MIRACL Crypto SDK.
* If not, see <http://www.gnu.org/licenses/>.
*
* You can be released from the requirements of the license by purchasing
* a commercial license. Buying such a license is mandatory as soon as you
* develop commercial activities involving the CertiVox MIRACL Crypto SDK
* without disclosing the source code of your own applications, or shipping
* the CertiVox MIRACL Crypto SDK with a closed source product.
*
***************************************************************************/
/*
* MIRACL routines for arithmetic over GF(2^m),
* mrgf2m.c
*
* For algorithms used, see IEEE P1363 Standard, Appendix A
* unless otherwise stated.
*
* The time-critical routines are the multiplication routine multiply2()
* and (for AFFINE co-ordinates) the modular inverse routine inverse2()
* and the routines it calls.
*
* READ COMMENTS CAREFULLY FOR VARIOUS OPTIMIZATION SUGGESTIONS
*
* No assembly language used.
*
* Use utility irp.cpp to generate optimal code for function reduce2(.) below
*
* Space can be saved by removing unneeded functions and
* deleting unrequired functionality.
* For example in reduce2(.) remove code for those irreducible polynomials
* which will not be used by your code.
*/
#include <stdio.h>
#include "miracl.h"
#ifdef MR_STATIC
#include <string.h>
#endif
#ifdef MR_COUNT_OPS
extern int fpm2,fpi2;
#endif
/* must use /arch:SSE2 in compilation */
#ifdef _M_IX86_FP
#if _M_IX86_FP >= 2
#define MR_SSE2_INTRINSICS
#endif
#endif
/* must use -msse2 in compilation */
#ifdef __SSE2__
#define MR_SSE2_INTRINSICS
#endif
#ifdef MR_SSE2_INTRINSICS
#ifdef __GNUC__
#include <emmintrin.h>
#else
#include <emmintrin.h>
#endif
#if MIRACL==64
#define MR_SSE2_64
/* Can use SSE2 registers for 64-bit manipulations */
#endif
#endif
#ifndef MR_NOFULLWIDTH
/* This does not make sense using floating-point! */
/* This is extremely time-critical, and expensive */
/* Some experimental MMX code for x86-32. Seems to be slower than the standard code (on a PIV anyway).. */
#ifdef MR_MMX_x86_32
#ifdef __GNUC__
#include <mmintrin.h>
#else
#include <mmintrin.h>
#endif
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
__m64 rg,tt[4];
mr_small q;
tt[0]=_m_from_int(0);
tt[1]=_m_from_int(a);
tt[2]=_m_psllqi(tt[1],1);
tt[3]=_m_pxor(tt[1],tt[2]);
rg=tt[b&3];
rg=_m_pxor(rg,_m_psllqi(tt[(b>>2)&3],2));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>4)&3],4));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>6)&3],6));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>8)&3],8));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>10)&3],10));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>12)&3],12));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>14)&3],14));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>16)&3],16));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>18)&3],18));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>20)&3],20));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>22)&3],22));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>24)&3],24));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>26)&3],26));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>28)&3],28));
rg=_m_pxor(rg,_m_psllqi(tt[(b>>30)],30));
*r=_m_to_int(rg);
q=_m_to_int(_m_psrlqi(rg,32));
return q;
}
#else
/* This might be faster on a 16-bit processor with no variable shift instructions.
The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15); is just a 1-bit right shift on
the hi|lo value - should be really fast in assembly language
unsigned short mr_mul2(unsigned short x,unsigned short y,unsigned short *r)
{
unsigned short lo,hi,bit,w;
hi=0;
lo=x;
bit=-(lo&1);
lo>>=1;
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
hi^=(y&bit);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
*r=lo;
return hi;
}
*/
/* This might be faster on an 8-bit processor with no variable shift instructions.
The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7); is just a 1-bit right shift on
the hi|lo value - should be really fast in assembly language
unsigned char mr_mul2(unsigned char x,unsigned char y,unsigned char *r)
{
unsigned char lo,hi,bit,w;
hi=0;
lo=x;
bit=-(lo&1);
lo>>=1;
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit); bit=-(lo&1);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
hi^=(y&bit);
w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
*r=lo;
return hi;
}
*/
/* wouldn't it be nice if instruction sets supported a
one-cycle "carry-free" multiplication instruction ...
The SmartMIPS does - it's called maddp */
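/* In fact most modern x86 processors now do, via the PCLMULQDQ
   (carry-less multiply) instruction. A minimal sketch of mr_mul2() built
   on the _mm_clmulepi64_si128() intrinsic from <wmmintrin.h> is shown
   below, assuming MIRACL==64 and a CLMUL-capable target (compile with
   -mpclmul). It is not part of the original build:

static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
    __m128i aa,bb,pp;
    aa=_mm_set_epi64x(0,(long long)a);
    bb=_mm_set_epi64x(0,(long long)b);
    pp=_mm_clmulepi64_si128(aa,bb,0);
    *r=(mr_small)_mm_cvtsi128_si64(pp);
    return (mr_small)_mm_cvtsi128_si64(_mm_srli_si128(pp,8));
}
*/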
#ifndef MR_COMBA2
#if MIRACL==8
/* maybe use a small precomputed look-up table? */
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
static const mr_small look[256]=
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,
0,3,6,5,12,15,10,9,24,27,30,29,20,23,18,17,
0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,
0,5,10,15,20,17,30,27,40,45,34,39,60,57,54,51,
0,6,12,10,24,30,20,18,48,54,60,58,40,46,36,34,
0,7,14,9,28,27,18,21,56,63,54,49,36,35,42,45,
0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,
0,9,18,27,36,45,54,63,72,65,90,83,108,101,126,119,
0,10,20,30,40,34,60,54,80,90,68,78,120,114,108,102,
0,11,22,29,44,39,58,49,88,83,78,69,116,127,98,105,
0,12,24,20,48,60,40,36,96,108,120,116,80,92,72,68,
0,13,26,23,52,57,46,35,104,101,114,127,92,81,70,75,
0,14,28,18,56,54,36,42,112,126,108,98,72,70,84,90,
0,15,30,17,60,51,34,45,120,119,102,105,68,75,90,85
};
mr_small x1,y0,m,p,q;
x1=a&0xf0;
y0=b&0x0f;
a<<=4;
b>>=4;
p=look[(a|y0)];
q=look[(x1|b)];
m=look[a^b^x1^y0]^p^q; /* Karatsuba! */
p^=(m<<4);
q^=(m>>4);
*r=p;
return q;
}
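/* The look[] table above holds the 256 possible 4x4-bit carry-free
   products, look[(x<<4)|y] = x(t)*y(t) over GF(2). A small generator for
   it, useful as a cross-check (a sketch only - gen_look() is not part of
   the library):

static void gen_look(unsigned char *look)
{
    int x,y,i;
    unsigned char p;
    for (x=0;x<16;x++)
        for (y=0;y<16;y++)
        {
            p=0;
            for (i=0;i<4;i++)
                if (y&(1<<i)) p^=(unsigned char)(x<<i);
            look[(x<<4)|y]=p;
        }
}
*/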
#else
#ifdef MR_SSE2_64
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
int i,j;
__m128i pp,tt[16],m;
m=_mm_set_epi32(0,0,0xf0<<24,0);
tt[0]=_mm_setzero_si128();
tt[1]=_mm_loadl_epi64((__m128i *)&a);
tt[2]=_mm_xor_si128(_mm_slli_epi64(tt[1],1),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,7));
tt[4]=_mm_xor_si128(_mm_slli_epi64(tt[1],2),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,6));
tt[8]=_mm_xor_si128(_mm_slli_epi64(tt[1],3),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,5));
tt[3]=_mm_xor_si128(tt[1],tt[2]);
tt[5]=_mm_xor_si128(tt[1],tt[4]);
tt[6]=_mm_xor_si128(tt[2],tt[4]);
tt[7]=_mm_xor_si128(tt[6],tt[1]);
tt[9]=_mm_xor_si128(tt[8],tt[1]);
tt[10]=_mm_xor_si128(tt[8],tt[2]);
tt[11]=_mm_xor_si128(tt[10],tt[1]);
tt[12]=_mm_xor_si128(tt[8],tt[4]);
tt[13]=_mm_xor_si128(tt[12],tt[1]);
tt[14]=_mm_xor_si128(tt[8],tt[6]);
tt[15]=_mm_xor_si128(tt[14],tt[1]);
/* Thanks to Darrel Hankerson, who pointed out an optimization for this code ... */
i=(int)(b&0xF); j=(int)((b>>4)&0xF);
pp=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),
_mm_srli_epi64(_mm_slli_si128(tt[j],8), 60)) );
i=(int)((b>>8)&0xF); j=(int)((b>>12)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128( _mm_xor_si128(tt[i],
_mm_or_si128(_mm_slli_epi64(tt[j],4),
_mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
) ,1) );
i=(int)((b>>16)&0xF); j=(int)((b>>20)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
_mm_or_si128(_mm_slli_epi64(tt[j],4),
_mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
) ,2) );
i=(int)((b>>24)&0xF); j=(int)((b>>28)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
_mm_or_si128(_mm_slli_epi64(tt[j],4),
_mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
) ,3) );
i=(int)((b>>32)&0xF); j=(int)((b>>36)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
_mm_or_si128(_mm_slli_epi64(tt[j],4),
_mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
) ,4) );
i=(int)((b>>40)&0xF); j=(int)((b>>44)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
_mm_or_si128(_mm_slli_epi64(tt[j],4),
_mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
) ,5) );
i=(int)((b>>48)&0xF); j=(int)((b>>52)&0xF);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
_mm_or_si128(_mm_slli_epi64(tt[j],4),
_mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
) ,6) );
i=(int)((b>>56)&0xF); j=(int)(b>>60);
pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
_mm_or_si128(_mm_slli_epi64(tt[j],4),
_mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
) ,7) );
*r=((unsigned long long *)&pp)[0];
return ((unsigned long long *)&pp)[1];
}
#else
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{
int k;
mr_small kb,t[16];
mr_small x,q,p;
mr_utype tb0;
#if MIRACL > 32
mr_utype tb1,tb2;
#endif
kb=b;
#if MIRACL <= 32
t[0]=0; /* small look up table */
t[3]=t[2]=a<<1; /* it can overflow.... */
t[1]=t[2]>>1;
t[3]^=t[1];
tb0=(mr_utype)(a&TOPBIT); /* remember top bit */
tb0>>=M1; /* all ones if top bit is one */
#else
t[0]=0; /* larger look-up table */
t[8]=a<<3;
t[4]=t[8]>>1;
t[2]=t[4]>>1;
t[1]=t[2]>>1;
t[3]=t[5]=t[7]=t[9]=t[11]=t[13]=t[15]=t[1];
t[3]^=t[2];
t[5]^=t[4];
t[9]^=t[8];
t[6]=t[3]<<1;
t[7]^=t[6];
t[10]=t[5]<<1;
t[11]^=t[10];
t[12]=t[6]<<1;
t[13]^=t[12];
t[14]=t[7]<<1;
t[15]^=t[14];
tb0=(a&TOPBIT); /* remember top bits */
tb0>>=M1; /* all bits one, if this bit is set in a */
tb1=(a&SECBIT)<<1;
tb1>>=M1;
tb2=(a&THDBIT)<<2;
tb2>>=M1;
#endif
#if MIRACL == 8
#define UNWOUNDM
p=q=t[b&3]; q>>=2;
x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2;
x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2;
x=t[(b>>6)]; q^=x; p^=(x<<6); q>>=2;
#endif
#if MIRACL == 16
#define UNWOUNDM
p=q=t[b&3]; q>>=2;
x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2;
x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2;
x=t[(b>>6)&3]; q^=x; p^=(x<<6); q>>=2;
x=t[(b>>8)&3]; q^=x; p^=(x<<8); q>>=2;
x=t[(b>>10)&3]; q^=x; p^=(x<<10); q>>=2;
x=t[(b>>12)&3]; q^=x; p^=(x<<12); q>>=2;
x=t[(b>>14)]; q^=x; p^=(x<<14); q>>=2;
#endif
#if MIRACL == 32
#define UNWOUNDM
p=q=t[b&3]; q>>=2;
x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2; /* 8 ASM 80386 instructions */
x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2; /* but only 4 ARM instructions! */
x=t[(b>>6)&3]; q^=x; p^=(x<<6); q>>=2;
x=t[(b>>8)&3]; q^=x; p^=(x<<8); q>>=2;
x=t[(b>>10)&3]; q^=x; p^=(x<<10); q>>=2;
x=t[(b>>12)&3]; q^=x; p^=(x<<12); q>>=2;
x=t[(b>>14)&3]; q^=x; p^=(x<<14); q>>=2;
x=t[(b>>16)&3]; q^=x; p^=(x<<16); q>>=2;
x=t[(b>>18)&3]; q^=x; p^=(x<<18); q>>=2;
x=t[(b>>20)&3]; q^=x; p^=(x<<20); q>>=2;
x=t[(b>>22)&3]; q^=x; p^=(x<<22); q>>=2;
x=t[(b>>24)&3]; q^=x; p^=(x<<24); q>>=2;
x=t[(b>>26)&3]; q^=x; p^=(x<<26); q>>=2;
x=t[(b>>28)&3]; q^=x; p^=(x<<28); q>>=2;
x=t[(b>>30)]; q^=x; p^=(x<<30); q>>=2;
#endif
#if MIRACL == 64
#define UNWOUNDM
p=q=t[b&0xf]; q>>=4;
x=t[(b>>4)&0xf]; q^=x; p^=(x<<4); q>>=4;
x=t[(b>>8)&0xf]; q^=x; p^=(x<<8); q>>=4;
x=t[(b>>12)&0xf]; q^=x; p^=(x<<12); q>>=4;
x=t[(b>>16)&0xf]; q^=x; p^=(x<<16); q>>=4;
x=t[(b>>20)&0xf]; q^=x; p^=(x<<20); q>>=4;
x=t[(b>>24)&0xf]; q^=x; p^=(x<<24); q>>=4;
x=t[(b>>28)&0xf]; q^=x; p^=(x<<28); q>>=4;
x=t[(b>>32)&0xf]; q^=x; p^=(x<<32); q>>=4;
x=t[(b>>36)&0xf]; q^=x; p^=(x<<36); q>>=4;
x=t[(b>>40)&0xf]; q^=x; p^=(x<<40); q>>=4;
x=t[(b>>44)&0xf]; q^=x; p^=(x<<44); q>>=4;
x=t[(b>>48)&0xf]; q^=x; p^=(x<<48); q>>=4;
x=t[(b>>52)&0xf]; q^=x; p^=(x<<52); q>>=4;
x=t[(b>>56)&0xf]; q^=x; p^=(x<<56); q>>=4;
x=t[(b>>60)]; q^=x; p^=(x<<60); q>>=4;
#endif
#ifndef UNWOUNDM
q=p=(mr_small)0;
for (k=0;k<MIRACL;k+=8)
{
q^=(t[b&3]);
b>>=2;
p>>=2;
p|=q<<M2; q>>=2;
q^=(t[b&3]);
b>>=2;
p>>=2;
p|=q<<M2; q>>=2;
q^=(t[b&3]);
b>>=2;
p>>=2;
p|=q<<M2; q>>=2;
q^=(t[b&3]);
b>>=2;
p>>=2;
p|=q<<M2; q>>=2;
}
#endif
#if MIRACL <= 32
p^=(tb0&(kb<<M1)); q^=(tb0&(kb>>1)); /* don't break pipeline.. */
#else
p^=(tb0&(kb<<M1)); q^=(tb0&(kb>>1));
p^=(tb1&(kb<<M2)); q^=(tb1&(kb>>2));
p^=(tb2&(kb<<M3)); q^=(tb2&(kb>>3));
#endif
*r=p;
return q;
}
#endif
#endif
#endif
#endif
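/* All the mr_mul2() variants compute the same thing - the 2*MIRACL-bit
   carry-free product of two words, low word returned via *r, high word
   returned directly. A plain one-bit-at-a-time reference version, useful
   only as a sketch to check the optimized versions against (not part of
   the original file):

static mr_small mr_mul2_ref(mr_small a,mr_small b,mr_small *r)
{
    mr_small p=0,q=0;
    int i;
    for (i=0;i<MIRACL;i++)
    {
        if (b&((mr_small)1<<i))
        {
            p^=(a<<i);
            if (i>0) q^=(a>>(MIRACL-i));
        }
    }
    *r=p;
    return q;
}
*/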
static int numbits(big x)
{ /* return degree of x */
mr_small *gx=x->w,bit=TOPBIT;
int m,k=x->len;
if (k==0) return 0;
m=k*MIRACL;
while (!(gx[k-1]&bit))
{
m--;
bit>>=1;
}
return m;
}
int degree2(big x)
{ /* returns -1 for x=0 */
return (numbits(x)-1);
}
/*
static int zerobits(big x)
{
int m,n,k;
mr_small *gx,lsb,bit=1;
k=x->len;
if (k==0) return (-1);
gx=x->w;
for (m=0;m<k;m++)
if (gx[m]!=0) break;
n=m*MIRACL;
lsb=gx[m];
while (!(lsb&bit))
{
n++;
bit<<=1;
}
return n;
}
static void shiftrightbits(big x,int m)
{
int i,k=x->len;
int w=m/MIRACL;
int b=m%MIRACL;
mr_small *gx=x->w;
if (k==0 || m==0) return;
if (w>0)
{
for (i=0;i<k-w;i++) gx[i]=gx[i+w];
for (i=k-w;i<k;i++) gx[i]=0;
x->len-=w;
}
if (b!=0)
{
for (i=0;i<k-w-1;i++) gx[i]=(gx[i]>>b)|(gx[i+1]<<(MIRACL-b));
gx[k-w-1]>>=b;
if (gx[k-w-1]==0) x->len--;
}
}
*/
static void shiftleftbits(big x,int m)
{
int i,k=x->len;
mr_small j;
int w=m/MIRACL; /* words */
int b=m%MIRACL; /* bits */
mr_small *gx=x->w;
if (k==0 || m==0) return;
if (w>0)
{
for (i=k+w-1;i>=w;i--)
gx[i]=gx[i-w];
for (i=w-1;i>=0;i--) gx[i]=0;
x->len+=w;
}
/* time critical */
if (b!=0)
{
j=gx[k+w-1]>>(MIRACL-b);
if (j!=0)
{
x->len++;
gx[k+w]=j;
}
for (i=k+w-1;i>w;i--)
{
gx[i]=(gx[i]<<b)|(gx[i-1]>>(MIRACL-b));
}
gx[w]<<=b;
}
}
static void square2(big x,big w)
{ /* w=x*x where x can be NULL so be careful */
int i,j,n,m;
mr_small a,t,r,*gw;
static const mr_small look[16]=
{0,(mr_small)1<<M8,(mr_small)4<<M8,(mr_small)5<<M8,
(mr_small)16<<M8,(mr_small)17<<M8,(mr_small)20<<M8,(mr_small)21<<M8,
(mr_small)64<<M8,(mr_small)65<<M8,(mr_small)68<<M8,(mr_small)69<<M8,
(mr_small)80<<M8,(mr_small)81<<M8,(mr_small)84<<M8,(mr_small)85<<M8};
n=x->len;
if (n==0) return;
m=n+n;
w->len=m;
gw=w->w;
for (i=n-1;i>=0;i--)
{
a=gw[i];
#if MIRACL == 8
#define UNWOUNDS
gw[i+i]=look[a&0xF];
gw[i+i+1]=look[(a>>4)];
#endif
#if MIRACL == 16
#define UNWOUNDS
gw[i+i]=(look[a&0xF]>>8)|look[(a>>4)&0xF];
gw[i+i+1]=(look[(a>>8)&0xF]>>8)|look[(a>>12)];
#endif
#if MIRACL == 32
#define UNWOUNDS
gw[i+i]=(look[a&0xF]>>24)|(look[(a>>4)&0xF]>>16)|(look[(a>>8)&0xF]>>8)|look[(a>>12)&0xF];
gw[i+i+1]=(look[(a>>16)&0xF]>>24)|(look[(a>>20)&0xF]>>16)|(look[(a>>24)&0xF]>>8)|look[(a>>28)];
#endif
#ifndef UNWOUNDS
r=0;
for (j=0;j<MIRACL/8;j++)
{
t=look[a&0xF]; a>>=4;
r>>=8;
r|=t;
}
gw[i+i]=r; r=0;
for (j=0;j<MIRACL/8;j++)
{
t=look[a&0xF]; a>>=4;
r>>=8;
r|=t;
}
gw[i+i+1]=r;
#endif
}
if (gw[m-1]==0)
{
w->len--;
if (gw[m-2]==0)
mr_lzero(w);
}
}
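/* Squaring in GF(2^m) is linear - (a+b)^2 = a^2+b^2 - so squaring just
   spreads the bits of x out with zeros in between. For example
   (t^3+t+1)^2 = t^6+t^2+1, i.e. 0b1011 -> 0b1000101 = 69, which is the
   look[11] entry above (the table stores each spread nibble shifted up
   to the top byte of the word). */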
/* Use karatsuba to multiply two polynomials with coefficients in GF(2^m) */
#ifndef MR_STATIC
void karmul2_poly(_MIPD_ int n,big *t,big *x,big *y,big *z)
{
int m,nd2,nd,md,md2;
if (n==1)
{ /* finished */
modmult2(_MIPP_ *x,*y,*z);
zero(z[1]);
return;
}
if (n==2)
{ /* in-line 2x2 */
modmult2(_MIPP_ x[0],y[0],z[0]);
modmult2(_MIPP_ x[1],y[1],z[2]);
add2(x[0],x[1],t[0]);
add2(y[0],y[1],t[1]);
modmult2(_MIPP_ t[0],t[1],z[1]);
add2(z[1],z[0],z[1]);
add2(z[1],z[2],z[1]);
zero(z[3]);
return;
}
if (n==3)
{
modmult2(_MIPP_ x[0],y[0],z[0]);
modmult2(_MIPP_ x[1],y[1],z[2]);
modmult2(_MIPP_ x[2],y[2],z[4]);
add2(x[0],x[1],t[0]);
add2(y[0],y[1],t[1]);
modmult2(_MIPP_ t[0],t[1],z[1]);
add2(z[1],z[0],z[1]);
add2(z[1],z[2],z[1]);
add2(x[1],x[2],t[0]);
add2(y[1],y[2],t[1]);
modmult2(_MIPP_ t[0],t[1],z[3]);
add2(z[3],z[2],z[3]);
add2(z[3],z[4],z[3]);
add2(x[0],x[2],t[0]);
add2(y[0],y[2],t[1]);
modmult2(_MIPP_ t[0],t[1],t[0]);
add2(z[2],t[0],z[2]);
add2(z[2],z[0],z[2]);
add2(z[2],z[4],z[2]);
zero(z[5]);
return;
}
if (n%2==0)
{
md=nd=n;
md2=nd2=n/2;
}
else
{
nd=n+1;
md=n-1;
nd2=nd/2; md2=md/2;
}
for (m=0;m<md2;m++)
{ /* form the half-length sums in t */
add2(x[m],x[nd2+m],t[m]);
add2(y[m],y[nd2+m],t[nd2+m]);
}
for (m=md2;m<nd2;m++)
{ /* n odd - the top halves are one coefficient short */
copy(x[m],t[m]);
copy(y[m],t[nd2+m]);
}
karmul2_poly(_MIPP_ nd2,&t[2*nd],x,y,z);                  /* low product */
karmul2_poly(_MIPP_ md2,&t[2*nd],&x[nd2],&y[nd2],&z[nd]); /* high product */
karmul2_poly(_MIPP_ nd2,&t[2*nd],t,&t[nd2],&t[nd]);       /* middle product */
for (m=0;m<nd;m++) add2(t[nd+m],z[m],t[nd+m]);
for (m=0;m<md;m++) add2(t[nd+m],z[nd+m],t[nd+m]);
for (m=0;m<nd;m++) add2(z[nd2+m],t[nd+m],z[nd2+m]);
}
#endif

void multiply2(_MIPD_ big x,big y,big w)
{ /* w=x*y - the unreduced product */
#ifndef MR_COMBA2
int i,j,xl,yl,ml,d;
mr_small p,q,lo,hi;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
big w0=mr_mip->w0;
if (x==NULL || y==NULL)
{
zero(w);
return;
}
if (x->len==0 || y->len==0)
{
zero(w);
return;
}
xl=x->len;
yl=y->len;
zero(w0);
#ifdef CLAIRE
/* Comba method */
w0->len=xl+yl;
d=1+mr_mip->M/MIRACL;
hi=lo=0;
for (i=0;i<d;i++)
{
for (j=0;j<=i;j++)
{
q=mr_mul2(x->w[j],y->w[i-j],&p);
hi^=q; lo^=p;
}
w0->w[i]=lo; lo=hi; hi=0;
}
for (i=d;i<2*d-1;i++)
{
for (j=i-d+1;j<d;j++)
{
q=mr_mul2(x->w[j],y->w[i-j],&p);
hi^=q; lo^=p;
}
w0->w[i]=lo; lo=hi; hi=0;
}
w0->w[2*d-1]=lo;
mr_lzero(w0);
copy(w0,w);
#else
/* recommended method as mr_mul2 is so slow... */
if (xl>=MR_KARATSUBA && yl>=MR_KARATSUBA)
{
if (xl>yl) ml=xl;
else ml=yl;
karmul2(ml,mr_mip->w7->w,x->w,y->w,w0->w);
mr_mip->w7->len=w0->len=2*ml+1;
mr_lzero(w0);
mr_lzero(mr_mip->w7);
copy(w0,w);
return;
}
w0->len=xl+yl;
for (i=0;i<xl;i++)
{
for (j=0;j<yl;j++)
{
q=mr_mul2(x->w[i],y->w[j],&p);
w0->w[i+j]^=p;
w0->w[i+j+1]^=q;
}
}
mr_lzero(w0);
copy(w0,w);
#endif
#endif
}
void add2(big x,big y,big z)
{ /* XOR x and y */
int i,lx,ly,lz,lm;
mr_small *gx,*gy,*gz;
if (x==y)
{
zero(z);
return;
}
if (y==NULL)
{
copy(x,z);
return;
}
else if (x==NULL)
{
copy(y,z);
return;
}
if (x==z)
{
gy=y->w; gz=z->w;
ly=y->len; lz=z->len;
lm=lz; if (ly>lz) lm=ly;
for (i=0;i<lm;i++) gz[i]^=gy[i];
z->len=lm;
if (gz[lm-1]==0) mr_lzero(z);
}
else
{
gx=x->w; gy=y->w; gz=z->w;
lx=x->len; ly=y->len; lz=z->len;
lm=lx; if (ly>lx) lm=ly;
for (i=0;i<lm;i++) gz[i]=gx[i]^gy[i];
z->len=lm;
if (gz[lm-1]==0) mr_lzero(z);
}
}
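/* Addition here is just coefficient-wise XOR: for example
   (t^3+t+1) + (t+1) = t^3, i.e. 0xB ^ 0x3 = 0x8. Addition and
   subtraction coincide in GF(2^m), so no separate subtract routine is
   needed. */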
static void remain2(_MIPD_ big y,big x)
{ /* generic "remainder" program. x%=y */
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
int my=numbits(y);
int mx=numbits(x);
while (mx>=my)
{
copy(y,mr_mip->w7);
shiftleftbits(mr_mip->w7,mx-my);
add2(x,mr_mip->w7,x);
mx=numbits(x);
}
return;
}
void gcd2(_MIPD_ big x,big y,big g)
{
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
if (size(y)==0)
{
copy(x,g);
return;
}
copy(x,mr_mip->w1);
copy(y,mr_mip->w2);
forever
{
remain2(_MIPP_ mr_mip->w2,mr_mip->w1);
if (size(mr_mip->w1)==0) break;
copy(mr_mip->w1,mr_mip->w3);
copy(mr_mip->w2,mr_mip->w1);
copy(mr_mip->w3,mr_mip->w2);
}
copy(mr_mip->w2,g);
}
/* See "Elliptic Curves in Cryptography", Blake, Seroussi & Smart,
Cambridge University Press, 1999, page 20, for this fast reduction
routine - algorithm II.9 */
void reduce2(_MIPD_ big y,big x)
{ /* reduction wrt the trinomial or pentanomial modulus *
* Note that this is linear O(n), and thus not time critical */
int k1,k2,k3,k4,ls1,ls2,ls3,ls4,rs1,rs2,rs3,rs4,i;
int M,A,B,C;
int xl;
mr_small top,*gx,w;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
if (x!=y) copy(y,x);
xl=x->len;
gx=x->w;
M=mr_mip->M;
A=mr_mip->AA;
if (A==0)
{
mr_berror(_MIPP_ MR_ERR_NO_BASIS);
return;
}
B=mr_mip->BB;
C=mr_mip->CC;
/* If optimizing aggressively it makes sense to make this code specific to a particular field.
For example code like this can be optimized for the case
m=163. Note that the general purpose code involves lots of branches - these cause breaks in
the pipeline and they are slow. Further loop unrolling would be even faster...
Version 5.10 - optimal code for 32-bit processors and for some NIST curves added
Version 5.22 - some code for a 16-bit processor..
Version 5.23 - Use findbase.cpp to find "best" irreducible polynomial
Version 5.23 - Use utility irp.cpp to automatically generate optimal code for insertion here
*/
#if MIRACL == 8
if (M==163 && A==7 && B==6 && C==3)
{
for (i=xl-1;i>=21;i--)
{
w=gx[i]; gx[i]=0;
gx[i-19]^=(w>>4)^(w>>5);
gx[i-20]^=(w>>3)^(w<<4)^(w<<3)^w;
gx[i-21]^=(w<<5);
} /* XORs= 7 shifts= 6 */
top=gx[20]>>3;
gx[0]^=top;
top<<=3;
gx[0]^=(top<<4)^(top<<3)^top;
gx[1]^=(top>>4)^(top>>5);
gx[20]^=top;
x->len=21;
if (gx[20]==0) mr_lzero(x);
return;
}
if (M==271 && A==201)
{
for (i=xl-1;i>=34;i--)
{
w=gx[i]; gx[i]=0;
gx[i-8]^=(w>>6);
gx[i-9]^=(w<<2);
gx[i-33]^=(w>>7);
gx[i-34]^=(w<<1);
} /* XORs= 4 shifts= 4 */
top=gx[33]>>7;
gx[0]^=top;
top<<=7;
gx[24]^=(top<<2);
gx[25]^=(top>>6);
gx[33]^=top;
x->len=34;
if (gx[33]==0) mr_lzero(x);
return;
}
if (M==271 && A==207 && B==175 && C==111)
{
for (i=xl-1;i>=34;i--)
{
w=gx[i]; gx[i]=0;
gx[i-8]^=w;
gx[i-12]^=w;
gx[i-20]^=w;
gx[i-33]^=(w>>7);
gx[i-34]^=(w<<1);
} /* XORs= 5 shifts= 2 */
top=gx[33]>>7;
gx[0]^=top;
top<<=7;
gx[13]^=top;
gx[21]^=top;
gx[25]^=top;
gx[33]^=top;
x->len=34;
if (gx[33]==0) mr_lzero(x);
return;
}
#endif
#if MIRACL == 16
if (M==163 && A==7 && B==6 && C==3)
{
for (i=xl-1;i>=11;i--)
{
w=gx[i]; gx[i]=0;
gx[i-10]^=(w>>3)^(w<<3)^(w<<4)^w;
gx[i-11]^=(w<<13);
gx[i-9]^=(w>>12)^(w>>13);
}
top=gx[10]>>3;
gx[0]^=top;
top<<=3;
gx[1]^=(top>>12)^(top>>13);
gx[0]^=(top<<4)^(top<<3)^top;
gx[10]^=top;
x->len=11;
if (gx[10]==0) mr_lzero(x);
return;
}
if (M==271 && A==201 && B==0)
{
for (i=xl-1;i>=17;i--)
{
w=gx[i]; gx[i]=0;
gx[i-17]^=(w<<1);
gx[i-16]^=(w>>15);
gx[i-5]^=(w<<10);
gx[i-4]^=(w>>6);
}
top=gx[16]>>15;
gx[0]^=top;
top<<=15;
gx[12]^=(top>>6);
gx[11]^=(top<<10);
gx[16]^=top;
x->len=17;
if (gx[16]==0) mr_lzero(x);
return;
}
if (M==271 && A==207 && B==175 && C==111)
{
for (i=xl-1;i>=17;i--)
{
w=gx[i]; gx[i]=0;
gx[i-4]^=w;
gx[i-6]^=w;
gx[i-10]^=w;
gx[i-16]^=(w>>15);
gx[i-17]^=(w<<1);
} /* XORs= 5 shifts= 2 */
top=gx[16]>>15;
gx[0]^=top;
top<<=15;
gx[6]^=top;
gx[10]^=top;
gx[12]^=top;
gx[16]^=top;
x->len=17;
if (gx[16]==0) mr_lzero(x);
return;
}
#endif
#if MIRACL == 32
if (M==127 && A==63)
{
for (i=xl-1;i>=4;i--)
{
w=gx[i]; gx[i]=0;
gx[i-2]^=w;
gx[i-3]^=(w>>31);
gx[i-4]^=(w<<1);
} /* XORs= 3 shifts= 2 */
top=gx[3]>>31; gx[0]^=top; top<<=31;
gx[1]^=top;
gx[3]^=top;
x->len=4;
if (gx[3]==0) mr_lzero(x);
return;
}
if (M==163 && A==7 && B==6 && C==3)
{
for (i=xl-1;i>=6;i--)
{
w=gx[i]; gx[i]=0;
gx[i-5]^=((w>>3)^(w<<4)^(w<<3)^w);
gx[i-6]^=(w<<29);
gx[i-4]^=((w>>28)^(w>>29));
}
top=gx[5]>>3;
gx[0]^=top;
top<<=3;
gx[1]^=(top>>28)^(top>>29);
gx[0]^=top^(top<<4)^(top<<3);
gx[5]^=top;
x->len=6;
if (gx[5]==0) mr_lzero(x);
return;
}
if (M==163 && A==99 && B==97 && C==3)
{
for (i=xl-1;i>=6;i--)
{
w=gx[i]; gx[i]=0;
gx[i-2]^=w^(w>>2);
gx[i-3]^=(w<<30);
gx[i-5]^=(w>>3)^w;
gx[i-6]^=(w<<29);
}
top=gx[5]>>3;
gx[0]^=top;
top<<=3;
gx[0]^=top;
gx[2]^=(top<<30);
gx[3]^=top^(top>>2);
gx[5]^=top;
x->len=6;
if (gx[5]==0) mr_lzero(x);
return;
}
if (M==233 && A==74 && B==0)
{
for (i=xl-1;i>=8;i--)
{
w=gx[i]; gx[i]=0;
gx[i-8]^=(w<<23);
gx[i-7]^=(w>>9);
gx[i-5]^=(w<<1);
gx[i-4]^=(w>>31);
}
top=gx[7]>>9;
gx[0]^=top;
gx[2]^=(top<<10);
gx[3]^=(top>>22);
gx[7]&=0x1FF;
x->len=8;
if (gx[7]==0) mr_lzero(x);
return;
}
if (M==233 && A==159 && B==0)
{
for (i=xl-1;i>=8;i--)
{
w=gx[i]; gx[i]=0;
gx[i-2]^=(w>>10);
gx[i-3]^=(w<<22);
gx[i-7]^=(w>>9);
gx[i-8]^=(w<<23);
}
top=gx[7]>>9;
gx[0]^=top;
top<<=9;
gx[4]^=(top<<22);
gx[5]^=(top>>10);
gx[7]^=top;
x->len=8;
if (gx[7]==0) mr_lzero(x);
return;
}
if (M==233 && A==201 && B==105 && C==9)
{
for (i=xl-1;i>=8;i--)
{
w=gx[i]; gx[i]=0;
gx[i-1]^=w;
gx[i-4]^=w;
gx[i-7]^=(w>>9)^w;
gx[i-8]^=(w<<23);
}
top=gx[7]>>9;
gx[0]^=top;
top<<=9;
gx[0]^=top;
gx[3]^=top;
gx[6]^=top;
gx[7]^=top;
x->len=8;
if (gx[7]==0) mr_lzero(x);
return;
}
if (M==103 && A==9 && B==0)
{
for (i=xl-1;i>=4;i--)
{
w=gx[i]; gx[i]=0;
gx[i-3]^=((w>>7)^(w<<2));
gx[i-4]^=(w<<25);
gx[i-2]^=(w>>30);
}
top=gx[3]>>7;
gx[0]^=top;
top<<=7;
gx[1]^=(top>>30);
gx[0]^=(top<<2);
gx[3]^=top;
x->len=4;
if (gx[3]==0) mr_lzero(x);
return;
}
if (M==283 && A==12 && B==7 && C==5)
{
for (i=xl-1;i>=9;i--)
{
w=gx[i]; gx[i]=0;
gx[i-9]^=(w<<5)^(w<<10)^(w<<12)^(w<<17);
gx[i-8]^=(w>>27)^(w>>22)^(w>>20)^(w>>15);
}
top=gx[8]>>27;
gx[0]^=top^(top<<5)^(top<<7)^(top<<12);
gx[8]&=0x7FFFFFF;
x->len=9;
if (gx[8]==0) mr_lzero(x);
return;
}
if (M==283 && A==249 && B==219 && C==27)
{
for (i=xl-1;i>=9;i--)
{
w=gx[i]; gx[i]=0;
gx[i-1]^=(w>>2);
gx[i-2]^=(w<<30)^w;
gx[i-8]^=(w>>27)^w;
gx[i-9]^=(w<<5);
} /* XORs= 6 shifts= 4 */
top=gx[8]>>27;
gx[0]^=top;
top<<=27;
gx[0]^=top;
gx[6]^=(top<<30)^top;
gx[7]^=(top>>2);
gx[8]^=top;
x->len=9;
if (gx[8]==0) mr_lzero(x);
return;
}
if (M==313 && A==121 && B==0)
{
for (i=xl-1;i>=10;i--)
{
w=gx[i]; gx[i]=0;
gx[i-6]^=w;
gx[i-9]^=(w>>25);
gx[i-10]^=(w<<7);
}
top=gx[9]>>25;
gx[0]^=top;
top<<=25;
gx[3]^=top;
gx[9]^=top;
x->len=10;
if (gx[9]==0) mr_lzero(x);
return;
}
if (M==379 && A==253 && B==251 && C==59)
{
for (i=xl-1;i>=12;i--)
{
w=gx[i]; gx[i]=0;
gx[i-3]^=(w>>30);
gx[i-4]^=(w<<2)^w;
gx[i-10]^=w;
gx[i-11]^=(w>>27);
gx[i-12]^=(w<<5);
} /* XORs= 6 shifts= 4 */
top=gx[11]>>27; gx[0]^=top; top<<=27;
gx[1]^=top;
gx[7]^=(top<<2)^top;
gx[8]^=(top>>30);
gx[11]^=top;
x->len=12;
if (gx[11]==0) mr_lzero(x);
return;
}
if (M==571 && A==10 && B==5 && C==2)
{
for (i=xl-1;i>=18;i--)
{
w=gx[i]; gx[i]=0;
gx[i-18]^=(w<<5)^(w<<7)^(w<<10)^(w<<15);
gx[i-17]^=(w>>27)^(w>>25)^(w>>22)^(w>>17);
}
top=gx[17]>>27;
gx[0]^=top^(top<<2)^(top<<5)^(top<<10);
gx[17]&=0x7FFFFFF;
x->len=18;
if (gx[17]==0) mr_lzero(x);
return;
}
if (M==571 && A==507 && B==475 && C==417)
{
for (i=xl-1;i>=18;i--)
{
w=gx[i]; gx[i]=0;
gx[i-2]^=w;
gx[i-3]^=w;
gx[i-4]^=(w>>26);
gx[i-5]^=(w<<6);
gx[i-17]^=(w>>27);
gx[i-18]^=(w<<5);
} /* XORs= 6 shifts= 4 */
top=gx[17]>>27;
gx[0]^=top;
top<<=27;
gx[12]^=(top<<6);
gx[13]^=(top>>26);
gx[14]^=top;
gx[15]^=top;
gx[17]^=top;
x->len=18;
if (gx[17]==0) mr_lzero(x);
return;
}
if (M==1223 && A==255)
{
for (i=xl-1;i>=39;i--)
{
w=gx[i]; gx[i]=0;
gx[i-30]^=(w>>8);
gx[i-31]^=(w<<24);
gx[i-38]^=(w>>7);
gx[i-39]^=(w<<25);
} /* XORs= 4 shifts= 4 */
top=gx[38]>>7; gx[0]^=top; top<<=7;
gx[7]^=(top<<24);
gx[8]^=(top>>8);
gx[38]^=top;
x->len=39;
if (gx[38]==0) mr_lzero(x);
return;
}
#endif
#if MIRACL == 64
if (M==1223 && A==255)
{
for (i=xl-1;i>=20;i--)
{
w=gx[i]; gx[i]=0;
gx[i-15]^=(w>>8);
gx[i-16]^=(w<<56);
gx[i-19]^=(w>>7);
gx[i-20]^=(w<<57);
}
top=gx[19]>>7; gx[0]^=top; top<<=7;
gx[3]^=(top<<56);
gx[4]^=(top>>8);
gx[19]^=top;
x->len=20;
if (gx[19]==0) mr_lzero(x);
return;
}
if (M==379 && A==253 && B==251 && C==59)
{
for (i=xl-1;i>=6;i--)
{
w=gx[i]; gx[i]=0;
gx[i-1]^=(w>>62);
gx[i-2]^=(w<<2)^w;
gx[i-5]^=(w>>59)^w;
gx[i-6]^=(w<<5);
} /* XORs= 6 shifts= 4 */
top=gx[5]>>59; gx[0]^=top; top<<=59;
gx[0]^=top;
gx[3]^=(top<<2)^top;
gx[4]^=(top>>62);
gx[5]^=top;
x->len=6;
if (gx[5]==0) mr_lzero(x);
return;
}
#endif
k3=k4=rs3=ls3=rs4=ls4=0;
k1=1+M/MIRACL; /* words from MSB to LSB */
if (xl<=k1)
{
if (numbits(x)<=M) return;
}
rs1=M%MIRACL;
ls1=MIRACL-rs1;
if (M-A < MIRACL)
{ /* slow way */
while (numbits(x)>=M+1)
{
copy(mr_mip->modulus,mr_mip->w7);
shiftleftbits(mr_mip->w7,numbits(x)-M-1);
add2(x,mr_mip->w7,x);
}
return;
}
k2=1+(M-A)/MIRACL; /* words from MSB to bit */
rs2=(M-A)%MIRACL;
ls2=MIRACL-rs2;
if (B)
{ /* Pentanomial */
k3=1+(M-B)/MIRACL;
rs3=(M-B)%MIRACL;
ls3=MIRACL-rs3;
k4=1+(M-C)/MIRACL;
rs4=(M-C)%MIRACL;
ls4=MIRACL-rs4;
}
for (i=xl-1;i>=k1;i--)
{
w=gx[i]; gx[i]=0;
if (rs1==0) gx[i-k1+1]^=w;
else
{
gx[i-k1+1]^=(w>>rs1);
gx[i-k1]^=(w<<ls1);
}
if (rs2==0) gx[i-k2+1]^=w;
else
{
gx[i-k2+1]^=(w>>rs2);
gx[i-k2]^=(w<<ls2);
}
if (B)
{ /* pentanomial */
if (rs3==0) gx[i-k3+1]^=w;
else
{
gx[i-k3+1]^=(w>>rs3);
gx[i-k3]^=(w<<ls3);
}
if (rs4==0) gx[i-k4+1]^=w;
else
{
gx[i-k4+1]^=(w>>rs4);
gx[i-k4]^=(w<<ls4);
}
}
}
top=gx[k1-1]>>rs1;
if (top!=0)
{
gx[0]^=top;
top<<=rs1;
if (rs2==0) gx[k1-k2]^=top;
else
{
gx[k1-k2]^=(top>>rs2);
if (k1>k2) gx[k1-k2-1]^=(top<<ls2);
}
if (B)
{
if (rs3==0) gx[k1-k3]^=top;
else
{
gx[k1-k3]^=(top>>rs3);
if (k1>k3) gx[k1-k3-1]^=(top<<ls3);
}
if (rs4==0) gx[k1-k4]^=top;
else
{
gx[k1-k4]^=(top>>rs4);
if (k1>k4) gx[k1-k4-1]^=(top<<ls4);
}
}
gx[k1-1]^=top; /* clear the bits above M in the top word */
}
x->len=k1;
if (gx[k1-1]==0) mr_lzero(x);
}
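/* The word-by-word folding above uses the identity
   x^M = x^A + x^B + x^C + 1 (mod f): a set bit at position M+k is
   cleared and XORed back in at positions A+k, B+k, C+k and k. For
   example with f(t) = t^163+t^7+t^6+t^3+1, a bit at position 170 = 163+7
   reappears at positions 14, 13, 10 and 7 - which is what the
   hand-unrolled M==163 code above does a whole word at a time. */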
void incr2(big x,int n,big w)
{ /* "increment" x by a small amount - the bits of n are XORed in */
if (x!=w) copy(x,w);
if (n==0) return;
if (w->len==0)
{
w->len=1;
w->w[0]=n;
}
else
{
w->w[0]^=(mr_small)n;
if (w->len==1 && w->w[0]==0) w->len=0;
}
}
void modsquare2(_MIPD_ big x,big w)
{ /* w=x*x mod f */
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
square2(x,mr_mip->w0);
reduce2(_MIPP_ mr_mip->w0,mr_mip->w0);
copy(mr_mip->w0,w);
}
/* Experimental code for GF(2^103) modular multiplication *
* Inspired by Robert Harley's ECDL code */
#ifdef SP103
#ifdef __GNUC__
#include <emmintrin.h>
#else
#include <emmintrin.h>
#endif
void modmult2(_MIPD_ big x,big y,big w)
{
int i,j;
mr_small b;
__m128i t[16];
__m128i m,r,s,p,q,xe,xo;
__m64 a3,a2,a1,a0,top;
if (x==y)
{
modsquare2(_MIPP_ x,w);
return;
}
if (x->len==0 || y->len==0)
{
zero(w);
return;
}
#ifdef MR_COUNT_OPS
fpm2++;
#endif
m=_mm_set_epi32(0,0,0xff<<24,0); /* shifting mask */
/* precompute a small table */
t[0]=_mm_set1_epi32(0);
xe=_mm_set_epi32(0,x->w[2],0,x->w[0]);
xo=_mm_set_epi32(0,x->w[3],0,x->w[1]);
t[1]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[2]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[3]=_mm_xor_si128(t[2],t[1]);
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[4]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[5]=_mm_xor_si128(t[4],t[1]);
t[6]=_mm_xor_si128(t[4],t[2]);
t[7]=_mm_xor_si128(t[4],t[3]);
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[8]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[9]=_mm_xor_si128(t[8],t[1]);
t[10]=_mm_xor_si128(t[8],t[2]);
t[11]=_mm_xor_si128(t[8],t[3]);
t[12]=_mm_xor_si128(t[8],t[4]);
t[13]=_mm_xor_si128(t[8],t[5]);
t[14]=_mm_xor_si128(t[8],t[6]);
t[15]=_mm_xor_si128(t[8],t[7]);
b=y->w[0];
i=b&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4); /* net shift left 4 */
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
p=q=r; q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,1);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,2);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>24)&0xf; j=(b>>28); r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,3);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
b=y->w[1];
i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,4);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,5);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,6);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>24)&0xf; j=(b>>28); r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,7);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
b=y->w[2];
i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,8);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,9);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,10);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>24)&0xf; j=(b>>28); r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,11);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
b=y->w[3];
i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,12);
p=_mm_xor_si128(p,r);
q=_mm_srli_si128(q,4); /* only 103 bits, so we are done */
/* modular reduction - x^103+x^9+1 */
a0=_mm_movepi64_pi64(p);
a1=_mm_movepi64_pi64(_mm_srli_si128(p,8));
a2=_mm_movepi64_pi64(q);
a3=_mm_movepi64_pi64(_mm_srli_si128(q,8));
a2=_m_pxor(a2,_m_psrlqi(a3,39));
a2=_m_pxor(a2,_m_psrlqi(a3,30));
a1=_m_pxor(a1,_m_psllqi(a3,25));
a1=_m_pxor(a1,_m_psllqi(a3,34));
a1=_m_pxor(a1,_m_psrlqi(a2,39));
a1=_m_pxor(a1,_m_psrlqi(a2,30));
a0=_m_pxor(a0,_m_psllqi(a2,25));
a0=_m_pxor(a0,_m_psllqi(a2,34));
top=_m_psrlqi(a1,39);
a0=_m_pxor(a0,top);
top=_m_psllqi(top,39);
a0=_m_pxor(a0,_m_psrlqi(top,30));
a1=_m_pxor(a1,top);
if (w->len>4) zero(w);
w->w[0]=_m_to_int(a0);
a0=_m_psrlqi(a0,32);
w->w[1]=_m_to_int(a0);
w->w[2]=_m_to_int(a1);
a1=_m_psrlqi(a1,32);
w->w[3]=_m_to_int(a1);
w->len=4;
if (w->w[3]==0) mr_lzero(w);
_m_empty();
}
#endif
#ifdef SP79
#ifdef __GNUC__
#include <emmintrin.h>
#else
#include <emmintrin.h>
#endif
void modmult2(_MIPD_ big x,big y,big w)
{
int i,j;
mr_small b;
__m128i t[16];
__m128i m,r,s,p,q,xe,xo;
__m64 a2,a1,a0,top;
if (x==y)
{
modsquare2(_MIPP_ x,w);
return;
}
#ifdef MR_COUNT_OPS
fpm2++;
#endif
if (x->len==0 || y->len==0)
{
zero(w);
return;
}
m=_mm_set_epi32(0,0,0xff<<24,0); /* shifting mask */
/* precompute a small table */
t[0]=_mm_set1_epi32(0);
xe=_mm_set_epi32(0,x->w[2],0,x->w[0]);
xo=_mm_set_epi32(0,0,0,x->w[1]);
t[1]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[2]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[3]=_mm_xor_si128(t[2],t[1]);
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[4]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[5]=_mm_xor_si128(t[4],t[1]);
t[6]=_mm_xor_si128(t[4],t[2]);
t[7]=_mm_xor_si128(t[4],t[3]);
xe=_mm_slli_epi64(xe,1);
xo=_mm_slli_epi64(xo,1);
t[8]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
t[9]=_mm_xor_si128(t[8],t[1]);
t[10]=_mm_xor_si128(t[8],t[2]);
t[11]=_mm_xor_si128(t[8],t[3]);
t[12]=_mm_xor_si128(t[8],t[4]);
t[13]=_mm_xor_si128(t[8],t[5]);
t[14]=_mm_xor_si128(t[8],t[6]);
t[15]=_mm_xor_si128(t[8],t[7]);
b=y->w[0];
i=b&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4); /* net shift left 4 */
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
p=q=r; q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,1);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,2);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>24)&0xf; j=(b>>28); r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,3);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
b=y->w[1];
i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,4);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,5);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,6);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>24)&0xf; j=(b>>28); r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,7);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
b=y->w[2];
i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,8);
p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);
i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,9);
p=_mm_xor_si128(p,r);
q=_mm_srli_si128(q,7); /* only 79 bits, so we are done */
/* modular reduction - x^79+x^9+1 */
a0=_mm_movepi64_pi64(p);
a1=_mm_movepi64_pi64(_mm_srli_si128(p,8));
a2=_mm_movepi64_pi64(q);
a1=_m_pxor(a1,_m_psrlqi(a2,15));
a1=_m_pxor(a1,_m_psrlqi(a2,6));
a0=_m_pxor(a0,_m_psllqi(a2,49));
a0=_m_pxor(a0,_m_psllqi(a2,58));
top=_m_psrlqi(a1,15);
a0=_m_pxor(a0,top);
top=_m_psllqi(top,15);
a0=_m_pxor(a0,_m_psrlqi(top,6));
a1=_m_pxor(a1,top);
w->w[2]=_m_to_int(a1);
if (w->len>3)
{ /* Yes I know it's crazy, but it's needed to fix the broken /O2 optimizer */
for (i=3;i<(int)w->len;i++) w->w[i]=0;
}
w->w[0]=_m_to_int(a0);
a0=_m_psrlqi(a0,32);
w->w[1]=_m_to_int(a0);
w->len=3;
if (w->w[2]==0) mr_lzero(w);
_m_empty();
}
#endif
#ifndef SP103
#ifndef SP79
/*#ifndef SP271 */
void modmult2(_MIPD_ big x,big y,big w)
{ /* w=x*y mod f */
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
if (x==NULL || y==NULL)
{
zero(w);
return;
}
if (x==y)
{
modsquare2(_MIPP_ x,w);
return;
}
if (y->len==0)
{
zero(w);
return;
}
if (y->len==1)
{
if (y->w[0]==1)
{
copy(x,w);
return;
}
}
#ifdef MR_COUNT_OPS
fpm2++;
#endif
multiply2(_MIPP_ x,y,mr_mip->w0);
reduce2(_MIPP_ mr_mip->w0,mr_mip->w0);
copy(mr_mip->w0,w);
}
#endif
#endif
/*#endif*/
/* Will be *much* faster if M,A,(B and C) are all odd */
/* This could/should be optimized for a particular irreducible polynomial and fixed A, B and C */
void sqroot2(_MIPD_ big x,big y)
{
int i,M,A,B,C;
int k,n,h,s,a,aw,ab,bw,bb,cw,cb;
#if MIRACL != 32
int mm,j;
#endif
mr_small *wk,w,we,wo;
BOOL slow;
/* Using Harley's trick */
static const mr_small evens[16]=
{0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15};
static const mr_small odds[16]=
{0,4,1,5,8,12,9,13,2,6,3,7,10,14,11,15};
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
M=mr_mip->M;
A=mr_mip->AA;
if (A==0)
{
mr_berror(_MIPP_ MR_ERR_NO_BASIS);
return;
}
B=mr_mip->BB;
C=mr_mip->CC;
slow=FALSE;
if (B)
{
if (M%2!=1 || A%2!=1 || B%2!=1 || C%2!=1) slow=TRUE;
}
else
{
if (M%2!=1 || A%2!=1) slow=TRUE;
}
if (slow)
{
copy(x,y);
for (i=1;i<mr_mip->M;i++)
modsquare2(_MIPP_ y,y);
return;
}
bb=cb=cw=bw=0;
/* M, A (B and C) are all odd - so use fast
Fong, Hankerson, Lopez and Menezes method */
if (x==y)
{
copy (x,mr_mip->w0);
wk=mr_mip->w0->w;
}
else
{
wk=x->w;
}
zero(y);
#if MIRACL==8
if (M==271 && A==207 && B==175 && C==111)
{
y->len=34;
for (i=0;i<34;i++)
{
n=i/2;
w=wk[i];
we=evens[((w&0x5)+((w&0x50)>>3))];
wo=odds[((w&0xA)+((w&0xA0)>>5))];
i++;
w=wk[i];
we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
y->w[n]^=we;
y->w[n+17]=wo;
y->w[n+13]^=wo;
y->w[n+11]^=wo;
y->w[n+7]^=wo;
}
if (y->w[33]==0) mr_lzero(y);
return;
}
#endif
#if MIRACL==32
if (M==1223 && A==255)
{
y->len=39;
for (i=0;i<39;i++)
{
n=i/2;
w=wk[i];
we=evens[((w&0x5)+((w&0x50)>>3))];
wo=odds[((w&0xA)+((w&0xA0)>>5))];
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
i++;
if (i<39)
{
w=wk[i];
we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
}
y->w[n]^=we;
y->w[20+n-1]^=wo<<4;
y->w[20+n]^=wo>>28;
y->w[n+4]^=wo;
}
if (y->w[38]==0) mr_lzero(y);
return;
}
#endif
#if MIRACL==64
if (M==1223 && A==255)
{
y->len=20;
for (i=0;i<20;i++)
{
n=i/2;
w=wk[i];
we=evens[((w&0x5)+((w&0x50)>>3))];
wo=odds[((w&0xA)+((w&0xA0)>>5))];
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
i++;
w=wk[i];
we|=evens[((w&0x5)+((w&0x50)>>3))]<<32;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<32;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<36;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<36;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<40;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<40;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<44;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<44;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<48;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<48;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<52;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<52;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<56;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<56;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<60;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<60;
y->w[n]^=we;
y->w[10+n-1]^=wo<<36;
y->w[10+n]^=wo>>28;
y->w[n+2]^=wo;
}
if (y->w[19]==0) mr_lzero(y);
return;
}
#endif
k=1+(M/MIRACL);
h=(k+1)/2;
a=(A+1)/2;
aw=a/MIRACL;
ab=a%MIRACL;
if (B)
{
a=(B+1)/2;
bw=a/MIRACL;
bb=a%MIRACL;
a=(C+1)/2;
cw=a/MIRACL;
cb=a%MIRACL;
}
s=h*MIRACL-1-(M-1)/2;
y->len=k;
for (i=0;i<k;i++)
{ /* pack the even bits into we, the odd bits into wo */
n=i/2;
w=wk[i];
#if MIRACL==32
we=evens[((w&0x5)+((w&0x50)>>3))];
wo=odds[((w&0xA)+((w&0xA0)>>5))];
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
#else
mm=0;
we=wo=0;
for (j=0;j<MIRACL/8;j++)
{
we|=evens[((w&0x5)+((w&0x50)>>3))]<<mm;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<mm;
mm+=4;
w>>=8;
}
#endif
i++;
if (i<k)
{
w=wk[i];
#if MIRACL==32
we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
w>>=8;
we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
#else
for (j=0;j<MIRACL/8;j++)
{
we|=evens[((w&0x5)+((w&0x50)>>3))]<<mm;
wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<mm;
mm+=4;
w>>=8;
}
#endif
}
y->w[n]^=we;
if (s==0) y->w[h+n]=wo;
else
{
y->w[h+n-1]^=wo<<(MIRACL-s);
y->w[h+n]^=wo>>s; /* abutt odd bits to even */
}
if (ab==0) y->w[n+aw]^=wo;
else
{
y->w[n+aw]^=wo<<ab;
y->w[n+aw+1]^=wo>>(MIRACL-ab);
}
if (B)
{
if (bb==0) y->w[n+bw]^=wo;
else
{
y->w[n+bw]^=wo<<bb;
y->w[n+bw+1]^=wo>>(MIRACL-bb);
}
if (cb==0) y->w[n+cw]^=wo;
else
{
y->w[n+cw]^=wo<<cb;
y->w[n+cw+1]^=wo>>(MIRACL-cb);
}
}
}
if (y->w[k-1]==0) mr_lzero(y);
}
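/* Why this works: with M odd, write x = E(t)^2 + t*O(t)^2 where E and O
   collect the even- and odd-indexed bits; then sqrt(x) = E + sqrt(t)*O.
   For a trinomial t^M+t^A+1 with M and A odd,
   sqrt(t) = t^((M+1)/2) + t^((A+1)/2), since squaring that sum gives
   t^(M+1) + t^(A+1) = t*(t^M+t^A) = t (mod f). So the odd bits (wo) are
   XORed in at offsets derived from (M+1)/2 and (A+1)/2 - and for a
   pentanomial also (B+1)/2 and (C+1)/2 - which is why M, A (and B, C)
   must all be odd for the fast path. */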
#ifndef MR_STATIC
void power2(_MIPD_ big x,int m,big w)
{ /* w=x^m mod f. Could be optimised a lot, but not time critical for me */
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
copy(x,mr_mip->w1);
convert(_MIPP_ 1,w);
forever
{
if (m%2!=0)
modmult2(_MIPP_ w,mr_mip->w1,w);
m/=2;
if (m==0) break;
modsquare2(_MIPP_ mr_mip->w1,mr_mip->w1);
}
}
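/* Standard right-to-left binary exponentiation. For example with m=6
   (binary 110) the loop leaves w untouched for the low 0 bit, then
   multiplies in x^2 and x^4 for the two 1 bits, giving w = x^6 after
   two squarings of w1. */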
#endif
/* Euclidean Algorithm */
BOOL inverse2(_MIPD_ big x,big w)
{
mr_small bit;
int i,j,n,n3,k,n4,mb,mw;
big t;
BOOL newword;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
if (size(x)==0) return FALSE;
convert(_MIPP_ 1,mr_mip->w1);
zero(mr_mip->w2);
copy(x,mr_mip->w3);
copy(mr_mip->modulus,mr_mip->w4);
n3=numbits(mr_mip->w3);
n4=mr_mip->M+1;
#ifdef MR_COUNT_OPS
fpi2++;
#endif
while (n3!=1)
{
j=n3-n4;
if (j<0)
{
t=mr_mip->w3; mr_mip->w3=mr_mip->w4; mr_mip->w4=t;
t=mr_mip->w1; mr_mip->w1=mr_mip->w2; mr_mip->w2=t;
j=-j; n=n3; n3=n4; n4=n;
}
mw=j/MIRACL; mb=j%MIRACL;
if (n3<=MIRACL)
{ /* the remaining values fit into single words */
mr_mip->w3->w[0]^=mr_mip->w4->w[0]<<mb;
n3--;
bit=((mr_small)1<<(n3-1));
while (!(mr_mip->w3->w[0]&bit))
{
n3--;
bit>>=1;
}
}
else
{
k=mr_mip->w3->len;
if (mb==0)
{
for (i=mw;i<k;i++) mr_mip->w3->w[i]^=mr_mip->w4->w[i-mw];
}
else
{
mr_mip->w3->w[mw]^=mr_mip->w4->w[0]<<mb;
for (i=mw+1;i<k;i++)
mr_mip->w3->w[i]^=((mr_mip->w4->w[i-mw]<<mb)|(mr_mip->w4->w[i-mw-1]>>(MIRACL-mb)));
}
newword=FALSE;
while (mr_mip->w3->w[k-1]==0) {k--; newword=TRUE;}
/*
bit=mr_mip->w3->w[k-1];
ASM mov eax,bit
ASM bsr ecx,eax
ASM mov shift,ecx
n3=(k-1)*MIRACL+shift+1;
*/
if (newword)
{
bit=TOPBIT;
n3=k*MIRACL;
}
else
{
n3--;
bit=((mr_small)1<<((n3-1)%MIRACL));
}
while (!(mr_mip->w3->w[k-1]&bit))
{
n3--;
bit>>=1;
}
mr_mip->w3->len=k;
}
k=mr_mip->w2->len+mw+1;
if ((int)mr_mip->w1->len>k) k=mr_mip->w1->len;
if (mb==0)
{
for (i=mw;i<k;i++) mr_mip->w1->w[i]^=mr_mip->w2->w[i-mw];
}
else
{
mr_mip->w1->w[mw]^=mr_mip->w2->w[0]<<mb;
for (i=mw+1;i<k;i++)
mr_mip->w1->w[i]^=((mr_mip->w2->w[i-mw]<<mb)|(mr_mip->w2->w[i-mw-1]>>(MIRACL-mb)));
}
while (mr_mip->w1->w[k-1]==0) k--;
mr_mip->w1->len=k;
}
copy(mr_mip->w1,w);
return TRUE;
}
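/* Loop invariant: w1*x = w3 (mod f) and w2*x = w4 (mod f) - initially
   1*x = x and 0*x = 0 = f (mod f). Each step XORs a shifted copy of
   (w4,w2) into (w3,w1), cancelling the top bit of w3, so the invariant
   holds while the degrees strictly decrease; when w3 reaches 1, w1
   holds 1/x mod f. */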
/* Schroeppel, Orman, O'Malley, Spatscheck *
* "Almost Inverse" algorithm, Crypto '95 *
* More optimization here and in-lining would *
* speed up AFFINE mode. I observe that *
* pentanomials would be more efficient if C *
* were greater */
/*
BOOL inverse2(_MIPD_ big x,big w)
{
mr_small lsw,*gw;
int i,n,bits,step,n3,n4,k;
int k1,k2,k3,k4,ls1,ls2,ls3,ls4,rs1,rs2,rs3,rs4;
int M,A,B,C;
big t;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
if (size(x)==0) return FALSE;
M=mr_mip->M;
A=mr_mip->AA;
if (A==0)
{
mr_berror(_MIPP_ MR_ERR_NO_BASIS);
return FALSE;
}
B=mr_mip->BB;
C=mr_mip->CC;
convert(_MIPP_ 1,mr_mip->w1);
zero(mr_mip->w2);
copy(x,mr_mip->w3);
copy(mr_mip->modulus,mr_mip->w4);
bits=zerobits(mr_mip->w3);
shiftrightbits(mr_mip->w3,bits);
k=bits;
n3=numbits(mr_mip->w3);
n4=M+1;
if (n3>1) forever
{
if (n3<n4)
{ /* swap them over */
t=mr_mip->w3; mr_mip->w3=mr_mip->w4; mr_mip->w4=t;
t=mr_mip->w1; mr_mip->w1=mr_mip->w2; mr_mip->w2=t;
n=n3; n3=n4; n4=n;
}
add2(mr_mip->w3,mr_mip->w4,mr_mip->w3);
add2(mr_mip->w1,mr_mip->w2,mr_mip->w1);
if (n3==n4) n3=numbits(mr_mip->w3);
bits=zerobits(mr_mip->w3);
k+=bits;
n3-=bits;
if (n3==1) break;
shiftrightbits(mr_mip->w3,bits);
shiftleftbits(mr_mip->w2,bits);
}
copy(mr_mip->w1,w);
if (k==0)
{
mr_lzero(w);
return TRUE;
}
step=MIRACL;
if (A<MIRACL) step=A;
if (B && C<step) step=C;
k1=1+M/MIRACL; rs1=M%MIRACL; ls1=MIRACL-rs1;
k2=1+A/MIRACL; rs2=A%MIRACL; ls2=MIRACL-rs2;
if (B)
{
k3=1+B/MIRACL; rs3=B%MIRACL; ls3=MIRACL-rs3;
k4=1+C/MIRACL; rs4=C%MIRACL; ls4=MIRACL-rs4;
}
gw=w->w;
while (k>0)
{
if (k>step) n=step;
else n=k;
if (n==MIRACL) lsw=gw[0];
else lsw=gw[0]&(((mr_small)1<<n)-1);
w->len=k1;
if (rs1==0) gw[k1-1]^=lsw;
else
{
w->len++;
gw[k1]^=(lsw>>ls1);
gw[k1-1]^=(lsw<<rs1);
}
if (rs2==0) gw[k2-1]^=lsw;
else
{
gw[k2]^=(lsw>>ls2);
gw[k2-1]^=(lsw<<rs2);
}
if (B)
{
if (rs3==0) gw[k3-1]^=lsw;
else
{
gw[k3]^=(lsw>>ls3);
gw[k3-1]^=(lsw<<rs3);
}
if (rs4==0) gw[k4-1]^=lsw;
else
{
gw[k4]^=(lsw>>ls4);
gw[k4-1]^=(lsw<<rs4);
}
}
shiftrightbits(w,n);
k-=n;
}
mr_lzero(w);
return TRUE;
}
*/

/* Find w[i]=1/x[i] mod f for i=0..m-1, using Montgomery's trick - *
 * just one modular inversion and 3(m-1) modular multiplications   */
BOOL multi_inverse2(_MIPD_ int m,big *x,big *w)
{
int i;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
if (m==0) return TRUE;
if (m<0) return FALSE;
if (x==w)
{
mr_berror(_MIPP_ MR_ERR_BAD_PARAMETERS);
return FALSE;
}
if (m==1) return inverse2(_MIPP_ x[0],w[0]);
convert(_MIPP_ 1,w[0]);
copy(x[0],w[1]);
for (i=2;i<m;i++)
modmult2(_MIPP_ w[i-1],x[i-1],w[i]); /* w[i]=x[0]*x[1]*..*x[i-1] */
modmult2(_MIPP_ w[m-1],x[m-1],mr_mip->w6);
if (size(mr_mip->w6)==0)
{
mr_berror(_MIPP_ MR_ERR_DIV_BY_ZERO);
return FALSE;
}
inverse2(_MIPP_ mr_mip->w6,mr_mip->w6); /* y=1/y */
copy(x[m-1],mr_mip->w5);
modmult2(_MIPP_ w[m-1],mr_mip->w6,w[m-1]);
for (i=m-2;;i--)
{
if (i==0)
{
modmult2(_MIPP_ mr_mip->w5,mr_mip->w6,w[0]);
break;
}
modmult2(_MIPP_ w[i],mr_mip->w5,w[i]);
modmult2(_MIPP_ w[i],mr_mip->w6,w[i]);
modmult2(_MIPP_ mr_mip->w5,x[i],mr_mip->w5);
}
return TRUE;
}
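/* Worked example with m=3: the forward pass leaves w = [1, x0, x0*x1]
   and w6 = 1/(x0*x1*x2). Then w[2] = x0*x1*w6 = 1/x2; with w5 = x2,
   w[1] = x0*w5*w6 = 1/x1; w5 becomes x1*x2, and finally
   w[0] = w5*w6 = 1/x0. One inversion, everything else multiplications. */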
#ifndef MR_STATIC
int trace2(_MIPD_ big x)
{
int i;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
copy(x,mr_mip->w1);
for (i=1;i<mr_mip->M;i++)
{
modsquare2(_MIPP_ mr_mip->w1,mr_mip->w1);
add2(mr_mip->w1,x,mr_mip->w1);
}
return (int)(mr_mip->w1->w[0]&1);
}
#endif
#ifndef MR_NO_RAND
void rand2(_MIPD_ big x)
{ /* random number */
int i,k;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
zero(x);
k=1+mr_mip->M/MIRACL;
x->len=k;
for (i=0;i<k;i++) x->w[i]=brand(_MIPPO_ );
mr_lzero(x);
reduce2(_MIPP_ x,x);
}
#endif
int parity2(big x)
{ /* return LSB */
if (x->len==0) return 0;
return (int)(x->w[0]%2);
}
void halftrace2(_MIPD_ big b,big w)
{
int i,M;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
M=mr_mip->M;
if (M%2==0) return;
copy(b,mr_mip->w1);
copy(b,w);
for (i=1;i<=(M-1)/2;i++)
{
modsquare2(_MIPP_ w,w);
modsquare2(_MIPP_ w,w);
add2(w,mr_mip->w1,w);
}
}
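/* For M odd the half-trace H(b) = b + b^4 + b^16 + ... + b^(4^((M-1)/2))
   satisfies H^2 + H = b + Tr(b), so it solves z^2 + z = b whenever the
   trace of b is zero. The loop above computes exactly this sum, using
   two squarings (one fourth power) per term. */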
BOOL quad2(_MIPD_ big b,big w)
{ /* Solves x^2 + x = b for a root w *
* returns TRUE if a solution exists *
* the "other" solution is w+1 */
int i,M;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
M=mr_mip->M;
copy(b,mr_mip->w1);
if (M%2==1) halftrace2(_MIPP_ b,w); /* M is odd, so it's the Half-Trace */
else
{
zero(mr_mip->w2);
forever
{
#ifndef MR_NO_RAND
rand2(_MIPP_ mr_mip->w2);
#else
incr(_MIPP_ mr_mip->w2,1,mr_mip->w2);
#endif
zero(w);
copy(mr_mip->w2,mr_mip->w3);
for (i=1;i<M;i++)
{
modsquare2(_MIPP_ mr_mip->w3,mr_mip->w3);
modmult2(_MIPP_ mr_mip->w3,mr_mip->w1,mr_mip->w4);
modsquare2(_MIPP_ w,w);
add2(w,mr_mip->w4,w);
add2(mr_mip->w3,mr_mip->w2,mr_mip->w3);
}
if (size(mr_mip->w3)!=0) break;
}
}
copy(w,mr_mip->w2);
modsquare2(_MIPP_ mr_mip->w2,mr_mip->w2);
add2(mr_mip->w2,w,mr_mip->w2);
if (mr_compare(mr_mip->w1,mr_mip->w2)==0) return TRUE;
return FALSE;
}
#ifndef MR_STATIC
void gf2m_dotprod(_MIPD_ int n,big *x,big *y,big w)
{ /* dot product - only one reduction! */
int i;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
mr_mip->check=OFF;
zero(mr_mip->w5);
for (i=0;i<n;i++)
{
multiply2(_MIPP_ x[i],y[i],mr_mip->w0);
add2(mr_mip->w5,mr_mip->w0,mr_mip->w5);
}
reduce2(_MIPP_ mr_mip->w5,mr_mip->w5);
copy(mr_mip->w5,w);
mr_mip->check=ON;
}
#endif
BOOL prepare_basis(_MIPD_ int m,int a,int b,int c,BOOL check)
{
int i,k,sh;
#ifdef MR_OS_THREADS
miracl *mr_mip=get_mip();
#endif
if (mr_mip->ERNUM) return FALSE;
if (b==0) c=0;
if (m==mr_mip->M && a==mr_mip->AA && b==mr_mip->BB && c==mr_mip->CC)
return TRUE; /* it's already prepared... */
MR_IN(138)
if (m <=0 || a<=0 || a>=m || b>=a)
{
mr_berror(_MIPP_ MR_ERR_BAD_MODULUS);
MR_OUT
return FALSE;
}
mr_mip->M=m;
mr_mip->AA=a;
mr_mip->BB=0;
mr_mip->CC=0;
zero(mr_mip->modulus);
convert(_MIPP_ 1,mr_mip->one);
k=1+m/MIRACL;
if (k>mr_mip->nib)
{
mr_berror(_MIPP_ MR_ERR_OVERFLOW);
MR_OUT
return FALSE;
}
mr_mip->modulus->len=k;
sh=m%MIRACL;
mr_mip->modulus->w[k-1]=((mr_small)1<<sh);
mr_mip->modulus->w[0]^=1;
mr_mip->modulus->w[a/MIRACL]^=((mr_small)1<<(a%MIRACL));
if (b!=0)
{
mr_mip->BB=b;
mr_mip->CC=c;
mr_mip->modulus->w[b/MIRACL]^=((mr_small)1<<(b%MIRACL));
mr_mip->modulus->w[c/MIRACL]^=((mr_small)1<<(c%MIRACL));
}
if (!check)
{
MR_OUT
return TRUE;
}
/* check for irreducibility of basis */
zero(mr_mip->w4);
mr_mip->w4->len=1;
mr_mip->w4->w[0]=2; /* f(t) = t */
for (i=1;i<=m/2;i++)
{
modsquare2(_MIPP_ mr_mip->w4,mr_mip->w4);
incr2(mr_mip->w4,2,mr_mip->w5);
gcd2(_MIPP_ mr_mip->w5,mr_mip->modulus,mr_mip->w6);
if (size(mr_mip->w6)!=1)
{
mr_berror(_MIPP_ MR_ERR_NOT_IRREDUC);
MR_OUT
return FALSE;
}
}
MR_OUT
return TRUE;
}
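/* This check works because f(t) of degree m is irreducible over GF(2)
   if and only if gcd(t^(2^i) + t, f(t)) = 1 for all i = 1..m/2 - any
   irreducible factor of degree d would divide t^(2^d) + t. Here w4
   accumulates t^(2^i) by repeated squaring, and incr2(w4,2,w5) forms
   w5 = t^(2^i) + t. */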
#endif