KGC_TEST/KGCAPP/3rdparty/miracl/source/mrmuldv.any

2780 lines
72 KiB
Plaintext

/*
* MIRACL - various implementations of routines muldiv, muldvm, muldvd
* muldvd2 and imuldiv
* mrmuldv.c
*
* THIS FILE CONTAINS MANY VERSIONS OF THESE ROUTINES
* COPY THIS FILE TO MRMULDV.C AND DELETE THOSE PARTS IRRELEVANT TO
* YOUR REQUIREMENTS.
*
* NOTE: - This file and its contents are not needed
* if MR_NOASM is defined in mirdef.h
*
* muldiv() calculates (a*b+c)/m and (a*b+c)%m as quickly as possible. Should
* ideally be written in assembly language of target machine for speed
* The problem is to avoid overflow in the calculation of the intermediate
* product a*b+c.
*
* If using a floating-point underlying type, and rounding can be
* controlled, it makes sense to pre-calculate
* the inverse of the modulus m, and multiply instead of divide
* In this situation a function imuldiv() is also needed.
*
* muldvm() and muldvd() routines are necessary to support full-width number
* base working. They are not needed if MR_NOFULLWIDTH is defined in mirdef.h.
*
* muldvm - returns (a*base+c)/m and remainder
* muldvd - returns (a*b+c)/base and remainder
*
* NOTE: New to version 4.2, new routine muldvd2() is required.
* See C version below for specification
* Versions of this are easily developed from existing muldvd() programs
*
* In most applications muldvd2() will be the time critical routine.
*
* Note that full-width base working may not be possible for all processors.
* For example it cannot be used on a VAX, or RS/6000 with mr_utypes defined
* as ints. This is because the instruction set does not support
* unsigned multiply and divide instructions. In such cases ALWAYS use a
* maximum base of MAXBASE in mirsys(), rather than 0.
*
* Since parameter passing and returning is time-consuming, these routines
* should be generated 'inline', if compiler allows it. Parameter passing
* by register will also be faster than via the stack. For even faster
* operation, use in-line assembly to speed up the inner loops of routines
* pmul(), sdiv(), multiply() and divide(). See these routines for details
* of Microsoft/Borland C inline 80x86 assembly, which gives a substantial speed-up.
*
* NOTE: All other things being equal, versions of MIRACL with 32-bit mr_utypes
* will run 3-4 times faster than versions with 16-bit mr_utypes, even for medium
* precision arithmetic, such as used in Public Key systems.
*
* Note that a portable C version of 'muldiv' may not be possible with some
* 32-bit compilers if ints and longs are both 32-bits and there is no
* 64-bit type. Fortunately these days there usually is such a type - called
* perhaps long long, or maybe __int64. See also the Blakely-Sloan
* method below. In any case the portable versions may be used if mr_utypes
* are defined as shorts, usually 16 bits. This would amount however to
* using the 32-bit processor in a 16 bit mode and would be very inefficient
* - up to 4 times slower. See mirdef.haf
*
* First the standard portable versions, for use when there is a double
* length type capable of holding the product of two mr_utype types.
* For example 32 and 16 bits types respectively.
* Note that if MR_NOASM is defined in mirdef.h, these routines are
* implemented in mrcore.c, and do not need to be extracted from here.
*
* This is followed by various other assembly language implementations for
* popular processors, computers and compilers.
*
**************************************************************
/* Standard C version of mrmuldv.c */
#include <stdio.h>
#include "miracl.h"
/*
 * muldiv - portable version for use when a double-length type (mr_large)
 * can hold the product of two mr_small values.
 *
 * Forms dble = a*b+c in mr_large, takes q = MR_LROUND(dble/m), stores the
 * remainder dble - q*m through rp and returns q.
 */
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
    mr_small q;
    mr_large dble=(mr_large)a*b+c;          /* intermediate a*b+c, no overflow */
    q=(mr_small)MR_LROUND(dble/m);
    *rp=(mr_small)(dble-(mr_large)q*m);     /* remainder = total - q*m */
    return q;
}
#ifdef MR_FP_ROUNDING
/*
 * imuldiv - as muldiv(), but multiplies by im instead of dividing by m.
 * NOTE(review): the file header describes im as the pre-calculated inverse
 * of the modulus m; that relationship is the caller's responsibility.
 *
 * Forms dble = a*b+c, takes q = MR_LROUND(dble*im), stores dble - q*m
 * through rp and returns q.
 */
mr_small imuldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_large im,mr_small *rp)
{
    mr_small q;
    mr_large dble=(mr_large)a*b+c;          /* intermediate a*b+c */
    q=(mr_small)MR_LROUND(dble*im);
    *rp=(mr_small)(dble-(mr_large)q*m);     /* remainder = total - q*m */
    return q;
}
#endif
#ifndef MR_NOFULLWIDTH
/*
 * muldvm - full-width helper: builds the double-length value whose top
 * half is a and bottom half is c, divides it by m, stores the remainder
 * through rp and returns the quotient.
 */
mr_small muldvm(mr_small a,mr_small c,mr_small m,mr_small *rp)
{
    union doubleword wide;
    mr_small quot;

    wide.h[MR_TOP]=a;                       /* high half */
    wide.h[MR_BOT]=c;                       /* low half  */
    quot=(mr_small)(wide.d/m);
    *rp=(mr_small)(wide.d-(mr_large)quot*m);
    return quot;
}
/*
 * muldvd - full-width helper: computes a*b+c as a double-length value,
 * stores its bottom half through rp and returns its top half.
 */
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
{
    union doubleword prod;
    mr_small top;

    prod.d=(mr_large)a*b+c;
    top=prod.h[MR_TOP];
    *rp=prod.h[MR_BOT];
    return top;
}
/*
 * muldvd2 - full-width multiply-accumulate: computes a*b + *c + *rp as a
 * double-length value, then writes the bottom half back to *rp and the
 * top half back to *c (in that order).
 */
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
{
    union doubleword acc;

    acc.d=(mr_large)a*b+*c+*rp;
    *rp=acc.h[MR_BOT];      /* low half out first ...  */
    *c=acc.h[MR_TOP];       /* ... then the high half  */
}
#endif
/* version for PowerPC (64-bit G5). Use with Blakely-Sloan C versions of muldiv(.) and muldvm(.) - see below */
/*
 * muldvd2 (PowerPC 64-bit, GCC inline assembly)
 *
 * Computes the 128-bit value a*b + *c + *rp, stores the low 64 bits to *rp
 * and the high 64 bits to *c.
 *
 * The pointer operands use the "b" (address base register) constraint
 * rather than "r": in "ld/std rX,0(rA)" a base register of r0 is read as
 * the constant 0, so the compiler must never be allowed to pick r0 here.
 */
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
{
    __asm__ __volatile__ (
    "mulld %%r16,%0,%1\n"       /* r16 = low  64 bits of a*b */
    "mulhdu %%r17,%0,%1\n"      /* r17 = high 64 bits of a*b */
    "ld %%r18,0(%2)\n"          /* r18 = *c                  */
    "addc %%r16,%%r18,%%r16\n"  /* low += *c, carry out      */
    "addze %%r17,%%r17\n"       /* high += carry             */
    "ld %%r19,0(%3)\n"          /* r19 = *rp                 */
    "addc %%r16,%%r19,%%r16\n"  /* low += *rp, carry out     */
    "addze %%r17,%%r17\n"       /* high += carry             */
    "std %%r16,0(%3)\n"         /* *rp = low                 */
    "std %%r17,0(%2)\n"         /* *c  = high                */
    :
    : "r"(a),"r"(b),"b"(c),"b"(rp)
    : "r16","r17","r18","r19","memory"
    );
}
/*
 * muldvd (PowerPC 64-bit, GCC inline assembly)
 *
 * Computes the 128-bit value a*b + c, stores the low 64 bits to *rp and
 * returns the high 64 bits.
 *
 * rp uses the "b" (address base register) constraint rather than "r":
 * in "std rX,0(rA)" a base register of r0 is read as the constant 0, so
 * the compiler must never be allowed to pick r0 for it.
 */
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
{
    mr_small q;
    __asm__ __volatile__ (
    "mulld %%r16,%1,%2\n"       /* r16 = low  64 bits of a*b */
    "mulhdu %%r17,%1,%2\n"      /* r17 = high 64 bits of a*b */
    "addc %%r16,%3,%%r16\n"     /* low += c, carry out       */
    "addze %%r17,%%r17\n"       /* high += carry             */
    "std %%r16,0(%4)\n"         /* *rp = low                 */
    "or %0,%%r17,%%r17\n"       /* q = high (register move)  */
    : "=r"(q)
    : "r"(a),"r"(b),"r"(c),"b"(rp)
    : "r16","r17","memory"
    );
    return q;
}
****************************************************************
//
// Version of muldiv() for use with underlying type a double
// and using the FP co-processor on a Pentium, and the gcc compiler.
// In this case MR_NOFULLWIDTH is defined.
// This is much better than compiling the above, but fprem and fdiv
// are still very slow.
//
.file "mrmuldv.s"
.text
// _muldiv - x87 version operating on doubles.
// Stack offsets below are relative to %esp after %ebx has been pushed:
//   8=a  16=b  24=c  32=m (8-byte doubles)   40=rp (pointer)
// The remainder is stored through rp; the quotient is left in %st(0).
// NOTE(review): %st(0) as the return location assumes the usual x87
// convention for returning a double - confirm for the target ABI.
.globl _muldiv
_muldiv:
pushl %ebx
// st0 = a
fldl 8(%esp)
// st0 = a*b
fmull 16(%esp)
// ebx = rp
movl 40(%esp),%ebx
// st0 = a*b+c
faddl 24(%esp)
// st0 = m, st1 = a*b+c
fldl 32(%esp)
// st0 = a*b+c, st1 = m, st2 = a*b+c
fld %st(1)
// NOTE: If rounding control is possible, set rounding to "chop"
// and replace lines below with these
// In this case #define MR_FP_ROUNDING will be defined in mirdef.h
//
// fdiv %st(1),%st
// fistpq 8(%esp)
// fildq 8(%esp)
// fmul %st,%st(1)
// fxch %st(2)
// fsubp %st,%st(1)
// fstpl (%ebx)
// st0 = partial remainder of st0 by st1
fprem
// *rp = remainder (value stays on the stack)
fstl (%ebx)
// intended effect: st2 = (a*b+c) - remainder, then pop
// NOTE(review): relies on the assembler's AT&T fsub/fsubr operand
// convention for the %st,%st(i) forms - confirm with the target assembler
fsubrp %st,%st(2)
// intended effect: st1 = ((a*b+c) - remainder)/m, then pop -> st0 = quotient
fdivrp %st,%st(1)
popl %ebx
ret
//
// If MR_FP_ROUNDING is defined, this function will be needed for Pentium
//
// _imuldiv - x87 version that multiplies by im instead of dividing by m.
// Stack offsets below are relative to %esp after %ebx has been pushed:
//   8=a  16=b  24=c  32=m (8-byte doubles)  40=im (loaded with fldt,
//   i.e. as an 80-bit extended value)  52=rp (pointer)
// The remainder is stored through rp; the quotient is left in %st(0).
// NOTE(review): fistpq rounds according to the current FPU rounding mode;
// the comments in the muldiv routine of this section say "chop" is expected.
.globl _imuldiv
_imuldiv:
pushl %ebx
// st0 = a
fldl 8(%esp)
// st0 = a*b
fmull 16(%esp)
// ebx = rp
movl 52(%esp),%ebx
// st0 = a*b+c
faddl 24(%esp)
// st0 = m, st1 = a*b+c
fldl 32(%esp)
// st0 = a*b+c, st1 = m, st2 = a*b+c
fld %st(1)
// st0 = im
fldt 40(%esp)
// st0 = (a*b+c)*im, st1 = m, st2 = a*b+c
fmulp %st,%st(1)
// convert to a 64-bit integer quotient, using a's stack slot as scratch
fistpq 8(%esp)
// st0 = q, st1 = m, st2 = a*b+c
fildq 8(%esp)
// st1 = m*q
fmul %st,%st(1)
// st0 = a*b+c, st1 = m*q, st2 = q
fxch %st(2)
// intended effect: st1 = (a*b+c) - m*q, then pop -> st0 = remainder, st1 = q
// NOTE(review): relies on the assembler's AT&T fsub/fsubr operand
// convention for the %st,%st(i) forms - confirm with the target assembler
fsubp %st,%st(1)
// *rp = remainder, pop -> st0 = q
fstpl (%ebx)
popl %ebx
ret
************************************************************************
/*
* Borland C++ 32-bit compiler (BCC32) version of the above.
* Uses inline assembly feature. Suitable for Win32 Apps
* Also compatible with Microsoft Visual C++ 32-bit compiler
* BUT change TBYTE to QWORD
*/
#include "mirdef.h"
#define ASM _asm
/*
 * muldiv - x87 inline-assembly version operating on doubles.
 * Forms a*b+c on the FPU stack, stores the remainder through rp and
 * leaves the quotient in st(0) (there is no C return statement; the
 * value left on the FPU stack is what the caller receives).
 * With MR_FP_ROUNDING the quotient is obtained by fdiv + integer
 * store/reload; otherwise fprem is used.
 */
double muldiv(double a,double b,double c,double m,double *rp)
{
ASM fld QWORD PTR a           /* st0 = a */
ASM fmul QWORD PTR b          /* st0 = a*b */
ASM mov ebx,DWORD PTR rp      /* ebx = rp */
ASM fadd QWORD PTR c          /* st0 = a*b+c */
ASM fld QWORD PTR m           /* st0 = m, st1 = a*b+c */
ASM fld st(1)                 /* st0 = a*b+c, st1 = m, st2 = a*b+c */
#ifdef MR_FP_ROUNDING
ASM fdiv st,st(1)             /* st0 = (a*b+c)/m */
ASM fistp QWORD PTR [ebx]     /* store as 64-bit integer in *rp (scratch), pop */
ASM fild QWORD PTR [ebx]      /* st0 = q, st1 = m, st2 = a*b+c */
ASM fmul st(1),st             /* st1 = m*q */
ASM fxch st(2)                /* st0 = a*b+c, st1 = m*q, st2 = q */
ASM fsubrp st(1),st           /* st1 = (a*b+c) - m*q, pop -> st0 = remainder */
ASM fstp QWORD PTR [ebx]      /* *rp = remainder, pop -> st0 = q */
#else
ASM fprem                     /* st0 = partial remainder of st0 by st1 */
ASM fst QWORD PTR [ebx]       /* *rp = remainder */
ASM fsubp st(2),st            /* st2 = (a*b+c) - remainder, pop */
ASM fdivp st(1),st            /* st1 = that / m, pop -> st0 = quotient */
#endif
}
#ifdef MR_FP_ROUNDING
/*
 * imuldiv - x87 inline-assembly version that multiplies a*b+c by im
 * instead of dividing by m.  Stores the remainder (a*b+c) - q*m through
 * rp and leaves the quotient q in st(0) (no C return statement).
 * NOTE(review): fistp rounds according to the current FPU rounding mode.
 */
double imuldiv(double a,double b,double c,double m,long double im,double *rp)
{
ASM fld QWORD PTR a           /* st0 = a */
ASM fmul QWORD PTR b          /* st0 = a*b */
ASM fld QWORD PTR m           /* st0 = m, st1 = a*b */
ASM fxch st(1)                /* st0 = a*b, st1 = m */
ASM fadd QWORD PTR c          /* st0 = a*b+c */
ASM mov ebx,DWORD PTR rp      /* ebx = rp */
ASM fxch st(1)                /* st0 = m, st1 = a*b+c */
ASM fld st(1)                 /* st0 = a*b+c, st1 = m, st2 = a*b+c */
ASM fld TBYTE PTR im /* QWORD for Microsoft */
ASM fmulp st(1),st            /* st0 = (a*b+c)*im, st1 = m, st2 = a*b+c */
ASM fistp QWORD PTR [ebx]     /* store as 64-bit integer in *rp (scratch), pop */
ASM fild QWORD PTR [ebx]      /* st0 = q, st1 = m, st2 = a*b+c */
ASM fmul st(1),st             /* st1 = m*q */
ASM fxch st(2)                /* st0 = a*b+c, st1 = m*q, st2 = q */
ASM fsubrp st(1),st           /* st1 = (a*b+c) - m*q, pop -> st0 = remainder */
ASM fstp QWORD PTR [ebx]      /* *rp = remainder, pop -> st0 = q */
}
#endif
*********************************************************************
;
; VAX11 version for Dec C compiler
; with 32 bit int using 64-bit quadword
; for the intermediate product.
;
; Use with mirdef.h32 - but define MR_NOFULLWIDTH
; Use mirsys(...,MAXBASE) instead of mirsys(...,0) in your programs.
;
; Why ...(MIRACL-2) instead of ...(MIRACL-1) ? That's a negative
; number for division by mr_base!
;
; The problem is that the emul and ediv instructions work only
; for signed types
;
;
; muldiv - arguments are addressed through ap:
; 4(ap)=a 8(ap)=b 12(ap)=c 16(ap)=m 20(ap)=rp (pointer)
; emul/ediv are signed instructions (see the section notes above),
; which is why a full-width base cannot be used with this version.
;
.entry muldiv,0
;NOTE(review): 4 bytes of stack are reserved here but not referenced
;below - confirm purpose
subl #4,sp
emul 4(ap),8(ap),12(ap),r0 ;a*b+c
ediv 16(ap),r0,r0,@20(ap) ;quo. in r0, rem. in *rp
ret
.end
;
; Fullwidth base working not possible on VAX, so no muldvm() or muldvd()
;
;
**********************************************************************
#
# Version of muldiv.c for IBM RS/6000
# This processor has no unsigned multiply/divide
# so full-width base not possible, so no muldvm() or muldvd()
#
# Use with mirdef.h32 but define MR_NOFULLWIDTH definition.
# Use mirsys(...,MAXBASE) instead of mirsys(...,0) in your programs.
#
# Note this version was developed from very inadequate RS/6000
# documentation. It may not be optimal, and it may not always work
# (although it works fine for me!)
#
.file "mrmuldv.s"
#
# .muldiv - per the comments below: a,b,c,m,rp arrive in registers
# 3,4,5,6,7; the quotient is returned in register 3 and the remainder
# is stored through register 7.
#
.globl .muldiv[PR]
.csect .muldiv[PR]
# parameters are passed in registers 3,4,5,6 and 7
# the mq register holds the low 32-bits for mul/div
mul 3,4,3 # q=a*b
mfmq 4 # get low part from mq
a 4,5,4 # add in c
aze 3,3 # add carry to high part
mtmq 4 # move low part to mq
div 3,3,6 # q=(a*b+c)/m
mfmq 4 # get remainder
st 4,0(7) # store remainder
# quotient is returned in register 3
brl
************************************************************************
/* Here's another portable method which might be considered for processors
* like the VAX and RS6000. The idea is due to Peter Montgomery. */
#include "mirdef.h"
typedef unsigned mr_utype uint;
uint muldiv(a,b,c,m,rp)
uint a,b,c,m,*rp;
{
int q,r;
q=(int)(0.5+((double)a*(double)b+(double)c)/(double)m);
r=(int)(((uint)a*(uint)b+(uint)c)-(uint)m*(uint)q);
if (r < 0)
{
r+=m;
q--;
}
*rp=r;
return q;
}
**********************************************************************
;
; IBM-PC versions - small memory model only
; Easily modified for other memory models
;
; For large code models (e.g. medium)
;
; change _TEXT to mrmuldv_TEXT (in three places)
; change NEAR to FAR
; change [bp+4] to [bp+6]
; change [bp+6] to [bp+8]
; change [bp+8] to [bp+10]
; change [bp+10] to [bp+12]
; change [bp+12] to [bp+14]
;
; For large data models, see Turbo C version below for required modification
;
; Microsoft C compiler V4.0+
; Written for MS macro-assembler
;
ASSUME CS:_TEXT
_TEXT SEGMENT BYTE PUBLIC 'CODE'
PUBLIC _muldiv
;
; _muldiv: ax = (a*b+c)/m, *rp = (a*b+c) mod m
; stack: [bp+4]=a [bp+6]=b [bp+8]=c [bp+10]=m [bp+12]=rp
;
_muldiv PROC NEAR
        push    bp                      ;set up frame
        mov     bp,sp
        mov     bx,[bp+12]              ;bx = rp (mov leaves flags untouched)
        mov     ax,[bp+4]               ;ax = a
        mul     WORD PTR [bp+6]         ;dx:ax = a*b
        add     ax,[bp+8]               ;low word += c
        adc     dx,0h                   ;carry into high word
        div     WORD PTR [bp+10]        ;ax = quotient, dx = remainder
        mov     [bx],dx                 ;*rp = remainder
        pop     bp
        ret                             ;quotient in ax
_muldiv endP
PUBLIC _muldvm
;
; _muldvm: divides the double word a:c (a high, c low) by m
; stack: [bp+4]=a [bp+6]=c [bp+8]=m [bp+10]=rp
;
_muldvm PROC NEAR
        push    bp                      ;set up frame
        mov     bp,sp
        mov     bx,[bp+10]              ;bx = rp
        mov     dx,[bp+4]               ;high word = a
        mov     ax,[bp+6]               ;low word = c
        div     WORD PTR [bp+8]         ;ax = quotient, dx = remainder
        mov     [bx],dx                 ;*rp = remainder
        pop     bp
        ret                             ;quotient in ax
_muldvm endP
PUBLIC _muldvd
;
; _muldvd: forms a*b+c as a double word; low word -> *rp, high word -> ax
; stack: [bp+4]=a [bp+6]=b [bp+8]=c [bp+10]=rp
;
_muldvd PROC NEAR
        push    bp                      ;set up frame
        mov     bp,sp
        mov     bx,[bp+10]              ;bx = rp
        mov     ax,[bp+4]               ;ax = a
        mul     WORD PTR [bp+6]         ;dx:ax = a*b
        add     ax,[bp+8]               ;low word += c
        adc     dx,0h                   ;carry into high word
        mov     [bx],ax                 ;*rp = low word
        mov     ax,dx                   ;return high word
        pop     bp
        ret                             ;result in ax
_muldvd endP
PUBLIC _muldvd2
;
; _muldvd2: forms a*b + *c + *rp as a double word;
;           low word -> *rp, high word -> *c
; stack: [bp+4]=a [bp+6]=b [bp+8]=c (pointer) [bp+10]=rp (pointer)
;
_muldvd2 PROC NEAR
        push    bp                      ;set up frame
        mov     bp,sp
        push    si                      ;si is used as a second pointer
        mov     bx,[bp+8]               ;bx = c
        mov     si,[bp+10]              ;si = rp
        mov     ax,[bp+4]               ;ax = a
        mul     WORD PTR [bp+6]         ;dx:ax = a*b
        add     ax,[bx]                 ;low word += *c
        adc     dx,0h                   ;carry into high word
        add     ax,[si]                 ;low word += *rp
        adc     dx,0h                   ;carry into high word
        mov     [si],ax                 ;*rp = low word
        mov     [bx],dx                 ;*c = high word
        pop     si
        pop     bp
        ret
_muldvd2 endP
_TEXT ENDS
END
***********************************************************************
/*
* Turbo C compiler V1.5+, Turbo/Borland C++. Microsoft C/C++
* Uses inline assembly feature
* Generates code identical to above version, and
* can be used instead.
*/
#define ASM asm
/* or perhaps #define ASM _asm */
/*
 * muldiv - 16-bit inline-assembly version.
 * Computes a*b+c in dx:ax, divides by m, stores the remainder through
 * rp (near pointer) and leaves the quotient in ax (no C return statement).
 */
unsigned int muldiv(a,b,c,m,rp)
unsigned int a,b,c,m,*rp;
{
ASM mov ax,a ;/* get a */
ASM mul WORD PTR b ;/* multiply by b */
ASM add ax,c ;/* add c to low word */
ASM adc dx,0h ;/* add carry to high word */
ASM div WORD PTR m ;/* divide by m */
ASM mov bx,rp ;/* get address for remainder */
ASM mov [bx],dx ;/* store remainder */
}
/* Replace last two ASM lines when using large data memory models */
/* ASM les bx, DWORD PTR rp ; get address for remainder */
/* ASM mov WORD PTR es:[bx],dx ; store remainder */
/*
 * muldvm - 16-bit inline-assembly version.
 * Divides the double word dx:ax = a:c by m, stores the remainder through
 * rp (near pointer) and leaves the quotient in ax (no C return statement).
 */
unsigned int muldvm(a,c,m,rp)
unsigned int a,c,m,*rp;
{
ASM mov dx,a ;/* get a */
ASM mov ax,c ;/* add in c to low word */
ASM div WORD PTR m ;/* divide by m */
ASM mov bx,rp ;/* get address for remainder */
ASM mov [bx],dx ;/* store remainder */
}
/* Replace last two ASM lines when using large data memory models */
/* ASM les bx, DWORD PTR rp ; get address for remainder */
/* ASM mov WORD PTR es:[bx],dx ; store remainder */
/*
 * muldvd - 16-bit inline-assembly version.
 * Computes a*b+c in dx:ax, stores the low word through rp (near pointer)
 * and leaves the high word in ax (no C return statement).
 */
unsigned int muldvd(a,b,c,rp)
unsigned int a,b,c,*rp;
{
ASM mov ax,a ;/* get a */
ASM mul WORD PTR b ;/* multiply by b */
ASM add ax,c ;/* add c to low word */
ASM adc dx,0h ;/* add carry to high word */
ASM mov bx,rp ;/* get address for remainder */
ASM mov [bx],ax ;/* store remainder */
ASM mov ax,dx ;/* high word is the result */
}
/* Replace second and third last lines if using large data memory models */
/* ASM les bx, DWORD PTR rp ; get address for remainder */
/* ASM mov WORD PTR es:[bx],ax ; store remainder */
/*
 * muldvd2 - 16-bit inline-assembly version.
 * Computes a*b + *c + *rp in dx:ax, stores the low word to *rp and the
 * high word to *c (near pointers).
 */
void muldvd2(a,b,c,rp)
unsigned int a,b,*c,*rp;
{
ASM mov ax,a ;/* get a */
ASM mul WORD PTR b ;/* multiply by b */
ASM mov bx,c ;/* bx = c */
ASM add ax,[bx] ;/* add *c to low word */
ASM adc dx,0h ;/* add carry to high word */
ASM mov si,rp ;/* si = rp */
ASM add ax,[si] ;/* add *rp to low word */
ASM adc dx,0h ;/* add carry to high word */
ASM mov [si],ax ;/* *rp = low word */
ASM mov [bx],dx ;/* *c = high word */
}
/* for large memory model ....
ASM mov ax,a ;/* get a */
ASM mul WORD PTR b ;/* multiply by b */
ASM les bx, DWORD PTR c
ASM add ax, WORD PTR es:[bx]
ASM adc dx,0h ;/* add carry to high word */
ASM les si,DWORD PTR rp
ASM add ax,WORD PTR es:[si]
ASM adc dx,0h
ASM mov WORD PTR es:[si],ax
ASM les bx,DWORD PTR c
ASM mov WORD PTR es:[bx],dx
*/
**********************************************************************
;
; IBM-PC-8087 for Microsoft C compiler V4.0+
; with 'mr_utype' defined as 'long' (32 bit). See mirdef.hpc
; This allows IBM-PC XT to look a bit like a 32-bit computer
; (which it isn't). To make use of this option:
;
; (1) Must have 8087 Maths Co-processor (for speed and to hold 64-bit
; intermediate product).
;
; (2) Must use 'ANSI' enhanced type C compiler, e.g. Microsoft V3.0+
; and must use header 'miracl.h' which declares function
; parameter types.
;
; Note: some compilation warnings may be generated - ignore them.
;
; Note: This is NOT, in most cases, faster, but it does allow
; very high precision calculations, e.g. 1000!
;
; Note: No versions of muldvm(), muldvd() or muldvd2() yet written for
; this method.
;
ASSUME CS:_TEXT
_TEXT SEGMENT BYTE PUBLIC 'CODE'
;
; _muldiv - 32-bit operands handled through the 8087.
; After "push si / push bp" the arguments are at:
; [bp+6]=a [bp+0ah]=b [bp+0eh]=c [bp+12h]=m (32-bit each) [bp+22]=rp
; The argument slots of a and b are reused as scratch for the quotient
; and remainder. Quotient is returned in dx:ax.
; Note that finit resets the coprocessor state on every call.
;
PUBLIC _muldiv
_muldiv PROC NEAR
push si ;standard C linkage
push bp
mov bp,sp
finit ;initialise 8087
fild DWORD PTR [bp+6] ;get a
fimul DWORD PTR [bp+0ah];multiply by b
fiadd DWORD PTR [bp+0eh];add c
fild DWORD PTR [bp+12h];get m
fld st(1) ;duplicate a*b+c on stack
fprem ;get remainder
fist DWORD PTR [bp+0ah];store remainder in b
fsubr st,st(2) ;subtract rem from total
fdiv st,st(1) ;divide by m
fist DWORD PTR [bp+6] ;store quotient in a
wait ;let the 8087 finish before reading its results
mov si,[bp+22] ;get address for remainder
mov ax,[bp+10]
mov dx,[bp+12] ;get remainder
mov [si],ax
mov [si+2],dx ;store remainder
mov ax,[bp+6]
mov dx,[bp+8] ;get quotient in dx:ax
pop bp ;standard C return
pop si
ret
_muldiv endP
_TEXT ENDS
END
**************************************************************************
;
; Intel-80386 pseudo-32 bit version - for Microsoft C V5.0+
; Written for MS macro-assembler V5.0+ by Andrej Sauer
; with 'mr_utype' defined as 'long' (32 bit). See mirdef.hpc
; Same comments apply as above (except for 8087 requirement)
; Note that this version will also work with the latest Zortech and
; Borland 16-bit compilers, specifically Borland C++ V3.1+
;
; For large code models (e.g. medium)
;
; change _TEXT to mrmuldv_TEXT (in three places)
; change NEAR to FAR
; change [bp+4] to [bp+6]
; change [bp+8] to [bp+10]
; change [bp+12] to [bp+14]
; change [bp+16] to [bp+18]
; change [bp+20] to [bp+22]
; etc
;
.386
ASSUME CS:_TEXT
_TEXT SEGMENT USE16 PUBLIC 'CODE'
PUBLIC _muldiv
;
; _muldiv: dx:ax = (a*b+c)/m, *rp = remainder (32-bit operands)
; stack: [bp+4]=a [bp+8]=b [bp+12]=c [bp+16]=m [bp+20]=rp (near pointer)
;
_muldiv PROC NEAR
        push    bp                      ;set up frame
        mov     bp,sp
        mov     bx,WORD PTR [bp+20]     ;bx = rp (mov leaves flags untouched)
        mov     eax,[bp+4]              ;eax = a
        mul     DWORD PTR [bp+8]        ;edx:eax = a*b
        add     eax,DWORD PTR [bp+12]   ;low dword += c
        adc     edx,0h                  ;carry into high dword
        div     DWORD PTR [bp+16]       ;eax = quotient, edx = remainder
        mov     [bx],edx                ;*rp = remainder
        shld    edx,eax,16              ;dx = upper 16 bits of the quotient
        pop     bp
        ret                             ;quotient returned in dx:ax
_muldiv endP
PUBLIC _muldvm
;
; _muldvm: divides the 64-bit value a:c (a high, c low) by m
; stack: [bp+4]=a [bp+8]=c [bp+12]=m [bp+16]=rp (near pointer)
;
_muldvm PROC NEAR
        push    bp                      ;set up frame
        mov     bp,sp
        mov     bx,WORD PTR [bp+16]     ;bx = rp
        mov     edx,[bp+4]              ;high dword = a
        mov     eax,[bp+8]              ;low dword = c
        div     DWORD PTR [bp+12]       ;eax = quotient, edx = remainder
        mov     [bx],edx                ;*rp = remainder
        shld    edx,eax,16              ;dx = upper 16 bits of the quotient
        pop     bp
        ret                             ;quotient returned in dx:ax
_muldvm endP
PUBLIC _muldvd
;
; _muldvd: forms a*b+c in edx:eax; low dword -> *rp, high dword -> dx:ax
; stack: [bp+4]=a [bp+8]=b [bp+12]=c [bp+16]=rp (near pointer)
;
_muldvd PROC NEAR
        push    bp                      ;set up frame
        mov     bp,sp
        mov     bx,WORD PTR [bp+16]     ;bx = rp
        mov     eax,[bp+4]              ;eax = a
        mul     DWORD PTR [bp+8]        ;edx:eax = a*b
        add     eax,DWORD PTR [bp+12]   ;low dword += c
        adc     edx,0h                  ;carry into high dword
        mov     [bx],eax                ;*rp = low dword
        mov     eax,edx                 ;result = high dword
        shld    edx,eax,16              ;dx = upper 16 bits of the result
        pop     bp
        ret                             ;result returned in dx:ax
_muldvd endP
PUBLIC _muldvd2
;
; _muldvd2: forms a*b + *c + *rp in edx:eax;
;           low dword -> *rp, high dword -> *c
;
; This is the NEAR (small data model) module, so c and rp are 16-bit
; near pointers, exactly as rp is in _muldiv/_muldvm/_muldvd above:
; stack: [bp+4]=a [bp+8]=b [bp+12]=c [bp+14]=rp
; (Far "les" loads from [bp+12]/[bp+16] belong to the large-model
; variant only; here they would pick up the wrong words.)
;
_muldvd2 PROC NEAR
        push    bp                      ;standard C linkage
        mov     bp,sp
        push    si                      ;si is used as a second pointer
        mov     eax,[bp+4]              ;get a
        mul     DWORD PTR [bp+8]        ;multiply by b
        mov     bx,WORD PTR [bp+12]     ;bx = c
        add     eax,DWORD PTR [bx]      ;add *c
        adc     edx,0h                  ;add carry to high word
        mov     si,WORD PTR [bp+14]     ;si = rp
        add     eax,DWORD PTR [si]      ;add *rp
        adc     edx,0h                  ;add carry to high word
        mov     DWORD PTR [si],eax      ;*rp = low dword
        mov     DWORD PTR [bx],edx      ;*c = high dword
        pop     si
        pop     bp                      ;standard C return
        ret
_muldvd2 endP
_TEXT ENDS
END
***********************************************************************
;
; Large Memory model version of the above. Useful
; for creating 16-bit DLL on 386+. Microsoft/Borland compatible
;
.386
ASSUME CS:mrmuldv_TEXT
mrmuldv_TEXT SEGMENT USE16 PUBLIC 'CODE'
PUBLIC _muldiv
;
; _muldiv (far code, far data): dx:ax = (a*b+c)/m, *rp = remainder
; stack: [bp+6]=a [bp+10]=b [bp+14]=c [bp+18]=m [bp+22]=rp (far pointer)
;
_muldiv PROC FAR
        push    bp                      ;set up frame
        mov     bp,sp
        les     bx,DWORD PTR [bp+22]    ;es:bx = rp (les leaves flags untouched)
        mov     eax,[bp+6]              ;eax = a
        mul     DWORD PTR [bp+10]       ;edx:eax = a*b
        add     eax,DWORD PTR [bp+14]   ;low dword += c
        adc     edx,0h                  ;carry into high dword
        div     DWORD PTR [bp+18]       ;eax = quotient, edx = remainder
        mov     DWORD PTR es:[bx],edx   ;*rp = remainder
        shld    edx,eax,16              ;dx = upper 16 bits of the quotient
        pop     bp
        ret                             ;quotient returned in dx:ax
_muldiv endP
PUBLIC _muldvm
;
; _muldvm (far code, far data): divides the 64-bit value a:c by m
; stack: [bp+6]=a [bp+10]=c [bp+14]=m [bp+18]=rp (far pointer)
;
_muldvm PROC FAR
        push    bp                      ;set up frame
        mov     bp,sp
        les     bx,DWORD PTR [bp+18]    ;es:bx = rp
        mov     edx,[bp+6]              ;high dword = a
        mov     eax,[bp+10]             ;low dword = c
        div     DWORD PTR [bp+14]       ;eax = quotient, edx = remainder
        mov     DWORD PTR es:[bx],edx   ;*rp = remainder
        shld    edx,eax,16              ;dx = upper 16 bits of the quotient
        pop     bp
        ret                             ;quotient returned in dx:ax
_muldvm endP
PUBLIC _muldvd
;
; _muldvd (far code, far data): forms a*b+c in edx:eax;
;         low dword -> *rp, high dword -> dx:ax
; stack: [bp+6]=a [bp+10]=b [bp+14]=c [bp+18]=rp (far pointer)
;
_muldvd PROC FAR
        push    bp                      ;set up frame
        mov     bp,sp
        les     bx,DWORD PTR [bp+18]    ;es:bx = rp
        mov     eax,[bp+6]              ;eax = a
        mul     DWORD PTR [bp+10]       ;edx:eax = a*b
        add     eax,DWORD PTR [bp+14]   ;low dword += c
        adc     edx,0h                  ;carry into high dword
        mov     DWORD PTR es:[bx],eax   ;*rp = low dword
        mov     eax,edx                 ;result = high dword
        shld    edx,eax,16              ;dx = upper 16 bits of the result
        pop     bp
        ret                             ;result returned in dx:ax
_muldvd endP
PUBLIC _muldvd2
;
; _muldvd2 (far code, far data): forms a*b + *c + *rp in edx:eax;
;          low dword -> *rp, high dword -> *c
; stack: [bp+6]=a [bp+10]=b [bp+14]=c [bp+18]=rp (far pointers)
; es is reloaded before each access because c and rp are loaded
; through the same segment register.
;
_muldvd2 PROC FAR
        push    bp                      ;set up frame
        mov     bp,sp
        push    si
        mov     eax,[bp+6]              ;eax = a
        mul     DWORD PTR [bp+10]       ;edx:eax = a*b
        les     bx,DWORD PTR [bp+14]    ;es:bx = c
        add     eax,DWORD PTR es:[bx]   ;low dword += *c
        adc     edx,0                   ;carry into high dword
        les     si,DWORD PTR [bp+18]    ;es:si = rp
        add     eax,DWORD PTR es:[si]   ;low dword += *rp
        adc     edx,0                   ;carry into high dword
        mov     DWORD PTR es:[si],eax   ;*rp = low dword
        les     bx,DWORD PTR [bp+14]    ;es:bx = c again
        mov     DWORD PTR es:[bx],edx   ;*c = high dword
        pop     si
        pop     bp
        ret
_muldvd2 endP
mrmuldv_TEXT ENDS
END
****************************************************************************
/*
Borland in-line pseudo-32 bit version of the above
Large memory model version.
Use with mirdef.hpc
Unfortunately this cannot be used with Microsoft C,
as its 16 bit compiler will not allow inline 386 opcodes
*/
#define ASM _asm
/*
 * muldiv - 386 instructions from a 16-bit compiler, far data pointers.
 * Computes a*b+c in edx:eax, divides by m, stores the remainder through
 * rp and leaves the quotient in dx:ax (no C return statement).
 */
long muldiv(a,b,c,m,rp)
long a,b,c,m,*rp;
{
ASM mov eax,DWORD PTR a             /* eax = a */
ASM mul DWORD PTR b                 /* edx:eax = a*b */
ASM add eax,DWORD PTR c             /* low dword += c */
ASM adc edx,0h                      /* carry into high dword */
ASM div DWORD PTR m                 /* eax = quotient, edx = remainder */
ASM les bx,DWORD PTR rp             /* es:bx = rp */
ASM mov DWORD PTR es:[bx],edx       /* *rp = remainder */
ASM shld edx,eax,16                 /* dx = upper 16 bits of the quotient */
}
/*
 * muldvm - divides the 64-bit value edx:eax = a:c by m, stores the
 * remainder through rp (far pointer) and leaves the quotient in dx:ax
 * (no C return statement).
 */
long muldvm(a,c,m,rp)
long a,c,m,*rp;
{
ASM mov edx,DWORD PTR a             /* high dword = a */
ASM mov eax,DWORD PTR c             /* low dword = c */
ASM div DWORD PTR m                 /* eax = quotient, edx = remainder */
ASM les bx,DWORD PTR rp             /* es:bx = rp */
ASM mov DWORD PTR es:[bx],edx       /* *rp = remainder */
ASM shld edx,eax,16                 /* dx = upper 16 bits of the quotient */
}
/*
 * muldvd - computes a*b+c in edx:eax, stores the low dword through rp
 * (far pointer) and leaves the high dword in dx:ax (no C return statement).
 */
long muldvd(a,b,c,rp)
long a,b,c,*rp;
{
ASM mov eax,DWORD PTR a             /* eax = a */
ASM mul DWORD PTR b                 /* edx:eax = a*b */
ASM add eax,DWORD PTR c             /* low dword += c */
ASM adc edx,0h                      /* carry into high dword */
ASM les bx,DWORD PTR rp             /* es:bx = rp */
ASM mov DWORD PTR es:[bx],eax       /* *rp = low dword */
ASM mov eax,edx                     /* result = high dword */
ASM shld edx,eax,16                 /* dx = upper 16 bits of the result */
}
/*
 * muldvd2 - computes a*b + *c + *rp in edx:eax, stores the low dword to
 * *rp and the high dword to *c (far pointers; es is reloaded for each).
 */
void muldvd2(a,b,c,rp)
long a,b,*c,*rp;
{
ASM mov eax,DWORD PTR a             /* eax = a */
ASM mul DWORD PTR b                 /* edx:eax = a*b */
ASM les bx,DWORD PTR c              /* es:bx = c */
ASM add eax,DWORD PTR es:[bx]       /* low dword += *c */
ASM adc edx,0h                      /* carry into high dword */
ASM les si,DWORD PTR rp             /* es:si = rp */
ASM add eax,DWORD PTR es:[si]       /* low dword += *rp */
ASM adc edx,0h                      /* carry into high dword */
ASM mov DWORD PTR es:[si],eax       /* *rp = low dword */
ASM les bx,DWORD PTR c              /* es:bx = c again */
ASM mov DWORD PTR es:[bx],edx       /* *c = high dword */
}
***********************************************************************
/*
* Borland C++ 32-bit compiler (BCC32). Use with mirdef.h32
* Uses inline assembly feature. Suitable for Win32 Apps
* Also compatible with Microsoft Visual C++ 32-bit compiler
*/
#define ASM _asm
int muldiv(a,b,c,m,rp)
int a,b,c,m,*rp;
{
ASM mov eax,DWORD PTR a
ASM mul DWORD PTR b
ASM add eax,DWORD PTR c
ASM adc edx,0h
ASM div DWORD PTR m
ASM mov ebx,DWORD PTR rp
ASM mov [ebx],edx
}
int muldvm(a,c,m,rp)
int a,c,m,*rp;
{
ASM mov edx,DWORD PTR a
ASM mov eax,DWORD PTR c
ASM div DWORD PTR m
ASM mov ebx,DWORD PTR rp
ASM mov [ebx],edx
}
int muldvd(a,b,c,rp)
int a,b,c,*rp;
{
ASM mov eax,DWORD PTR a
ASM mul DWORD PTR b
ASM add eax,DWORD PTR c
ASM adc edx,0h
ASM mov ebx,DWORD PTR rp
ASM mov [ebx],eax
ASM mov eax,edx
}
void muldvd2(a,b,c,rp)
int a,b,*c,*rp;
{
ASM mov eax,DWORD PTR a
ASM mul DWORD PTR b
ASM mov ebx,DWORD PTR c
ASM add eax,[ebx]
ASM adc edx,0h
ASM mov esi,DWORD PTR rp
ASM add eax,[esi]
ASM adc edx,0h
ASM mov [esi],eax
ASM mov [ebx],edx
}
*************************************************************************
/
/ Version for 32-bit Sun 386i Workstation
/
.file "mrmuldv.c"
.version "sun386-1.0"
.text
.globl muldiv
/
/ muldiv: %eax = (a*b+c)/m, *rp = remainder
/ stack: 8(%ebp)=a 12(%ebp)=b 16(%ebp)=c 20(%ebp)=m 24(%ebp)=rp
/ The pointer is held in %ecx (a scratch register) so that the
/ callee-saved %ebx is left untouched.
/
muldiv:
        pushl   %ebp
        movl    %esp,%ebp
        movl    8(%ebp),%eax            /get a
        mull    12(%ebp)                /multiply by b
        addl    16(%ebp),%eax           /add c to low word
        adcl    $0,%edx                 /add carry to high word
        divl    20(%ebp)                /divide by m
        movl    24(%ebp),%ecx           /get address for remainder
        movl    %edx,(%ecx)             /store remainder
        popl    %ebp
        ret
.text
.globl muldvm
/
/ muldvm: divides the 64-bit value a:c (a high, c low) by m
/ stack: 8(%ebp)=a 12(%ebp)=c 16(%ebp)=m 20(%ebp)=rp
/ The pointer is held in %ecx (a scratch register) so that the
/ callee-saved %ebx is left untouched.
/
muldvm:
        pushl   %ebp
        movl    %esp,%ebp
        movl    8(%ebp),%edx            /get a
        movl    12(%ebp),%eax           /add in c
        divl    16(%ebp)                /divide by m
        movl    20(%ebp),%ecx           /get address for remainder
        movl    %edx,(%ecx)             /store remainder
        popl    %ebp
        ret
.text
.globl muldvd
/
/ muldvd: forms a*b+c in %edx:%eax; low word -> *rp, high word -> %eax
/ stack: 8(%ebp)=a 12(%ebp)=b 16(%ebp)=c 20(%ebp)=rp
/ The pointer is held in %ecx (a scratch register) so that the
/ callee-saved %ebx is left untouched.
/
muldvd:
        pushl   %ebp
        movl    %esp,%ebp
        movl    8(%ebp),%eax            /get a
        mull    12(%ebp)                /multiply by b
        addl    16(%ebp),%eax           /add c to low word
        adcl    $0,%edx                 /add carry to high word
        movl    20(%ebp),%ecx           /get address for remainder
        movl    %eax,(%ecx)             /store remainder
        movl    %edx,%eax               /get quotient
        popl    %ebp
        ret
**************************************************************************
/
/ DJGPP GNU C version for DOS
/ M. Scott 22/3/98
/
.file "mrmuldv.c"
.text
.globl _muldiv
/
/ _muldiv: %eax = (a*b+c)/m, *rp = remainder
/ stack: 8(%ebp)=a 12(%ebp)=b 16(%ebp)=c 20(%ebp)=m 24(%ebp)=rp
/
_muldiv:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx
/ ebx = rp, fetched first (movl leaves the flags untouched)
        movl    24(%ebp),%ebx
/ edx:eax = a*b+c
        movl    8(%ebp),%eax
        mull    12(%ebp)
        addl    16(%ebp),%eax
        adcl    $0,%edx
/ eax = quotient, edx = remainder
        divl    20(%ebp)
        movl    %edx,(%ebx)
        popl    %ebx
        popl    %ebp
        ret
.globl _muldvm
/
/ _muldvm: divides the 64-bit value a:c (a high, c low) by m
/ stack: 8(%ebp)=a 12(%ebp)=c 16(%ebp)=m 20(%ebp)=rp
/
_muldvm:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx
/ ebx = rp
        movl    20(%ebp),%ebx
/ edx:eax = a:c
        movl    8(%ebp),%edx
        movl    12(%ebp),%eax
/ eax = quotient, edx = remainder
        divl    16(%ebp)
        movl    %edx,(%ebx)
        popl    %ebx
        popl    %ebp
        ret
.globl _muldvd
/
/ _muldvd: forms a*b+c in edx:eax; low word -> *rp, high word -> eax
/ stack: 8(%ebp)=a 12(%ebp)=b 16(%ebp)=c 20(%ebp)=rp
/
_muldvd:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx
/ ebx = rp
        movl    20(%ebp),%ebx
/ edx:eax = a*b+c
        movl    8(%ebp),%eax
        mull    12(%ebp)
        addl    16(%ebp),%eax
        adcl    $0,%edx
/ *rp = low word, return high word
        movl    %eax,(%ebx)
        movl    %edx,%eax
        popl    %ebx
        popl    %ebp
        ret
.globl _muldvd2
/
/ _muldvd2: forms a*b + *c + *rp in edx:eax;
/           low word -> *rp, high word -> *c
/ stack: 8(%ebp)=a 12(%ebp)=b 16(%ebp)=c 20(%ebp)=rp
/
_muldvd2:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx
        pushl   %esi
/ ebx = c, esi = rp
        movl    16(%ebp),%ebx
        movl    20(%ebp),%esi
/ edx:eax = a*b
        movl    8(%ebp),%eax
        mull    12(%ebp)
/ add *c, then *rp, propagating the carry each time
        addl    (%ebx),%eax
        adcl    $0,%edx
        addl    (%esi),%eax
        adcl    $0,%edx
/ *rp = low word, *c = high word
        movl    %eax,(%esi)
        movl    %edx,(%ebx)
        popl    %esi
        popl    %ebx
        popl    %ebp
        ret
*************************************************************************
/
/ GNU C for Linux (and other 386 based Linux/Unix??)
/
/
.file "mrmuldv.s"
.text
.globl muldiv
/
/ muldiv(a,b,c,m,rp): returns (a*b+c)/m in %eax, stores the remainder in *rp
/ args: 8(%ebp)=a 12(%ebp)=b 16(%ebp)=c 20(%ebp)=m 24(%ebp)=rp
/
muldiv:
    pushl %ebp
    movl  %esp,%ebp
    pushl %ebx
/ pointer first; movl does not disturb the flags used below
    movl  24(%ebp),%ebx
/ 64-bit a*b+c in edx:eax
    movl  8(%ebp),%eax
    mull  12(%ebp)
    addl  16(%ebp),%eax
    adcl  $0,%edx
/ divide: quotient -> eax, remainder -> edx
    divl  20(%ebp)
    movl  %edx,(%ebx)
    popl  %ebx
    popl  %ebp
    ret
.globl muldvm
/
/ muldvm(a,c,m,rp): divides the 64-bit value a:c by m;
/ quotient in %eax, remainder in *rp
/ args: 8(%ebp)=a 12(%ebp)=c 16(%ebp)=m 20(%ebp)=rp
/
muldvm:
    pushl %ebp
    movl  %esp,%ebp
    pushl %ebx
/ pointer first
    movl  20(%ebp),%ebx
/ edx:eax = a:c
    movl  8(%ebp),%edx
    movl  12(%ebp),%eax
/ divide: quotient -> eax, remainder -> edx
    divl  16(%ebp)
    movl  %edx,(%ebx)
    popl  %ebx
    popl  %ebp
    ret
.globl muldvd
/
/ muldvd(a,b,c,rp): 64-bit a*b+c; low word stored in *rp,
/ high word returned in %eax
/ args: 8(%ebp)=a 12(%ebp)=b 16(%ebp)=c 20(%ebp)=rp
/
muldvd:
    pushl %ebp
    movl  %esp,%ebp
    pushl %ebx
/ pointer first
    movl  20(%ebp),%ebx
/ 64-bit a*b+c in edx:eax
    movl  8(%ebp),%eax
    mull  12(%ebp)
    addl  16(%ebp),%eax
    adcl  $0,%edx
/ split the result
    movl  %eax,(%ebx)
    movl  %edx,%eax
    popl  %ebx
    popl  %ebp
    ret
.globl muldvd2
/
/ muldvd2(a,b,c,rp): 64-bit a*b + *c + *rp; low word stored in *rp,
/ high word stored in *c
/ args: 8(%ebp)=a 12(%ebp)=b 16(%ebp)=c 20(%ebp)=rp
/
muldvd2:
    pushl %ebp
    movl  %esp,%ebp
    pushl %ebx
    pushl %esi
/ both pointers first: ebx = c, esi = rp
    movl  16(%ebp),%ebx
    movl  20(%ebp),%esi
/ 64-bit a*b in edx:eax
    movl  8(%ebp),%eax
    mull  12(%ebp)
/ accumulate *c then *rp with carry
    addl  (%ebx),%eax
    adcl  $0,%edx
    addl  (%esi),%eax
    adcl  $0,%edx
/ split the result
    movl  %eax,(%esi)
    movl  %edx,(%ebx)
    popl  %esi
    popl  %ebx
    popl  %ebp
    ret
*************************************************************************
/* GCC inline assembly version for Linux/DJGPP */
#include "miracl.h"
/*
 * muldiv (GCC inline assembly, 32-bit x86)
 *
 * Computes a*b+c in edx:eax, divides by m, stores the remainder through
 * rp and returns the quotient.
 *
 * mull/divl overwrite %edx and the arithmetic changes the flags, so both
 * "edx" and "cc" are declared as clobbers alongside eax/ebx; otherwise
 * the compiler may keep a live value (or an operand address) in %edx.
 */
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
    mr_small q;
    __asm__ __volatile__ (
    "movl %1,%%eax\n"           /* eax = a                          */
    "mull %2\n"                 /* edx:eax = a*b                    */
    "addl %3,%%eax\n"           /* low word += c                    */
    "adcl $0,%%edx\n"           /* carry into high word             */
    "divl %4\n"                 /* eax = quotient, edx = remainder  */
    "movl %5,%%ebx\n"           /* ebx = rp                         */
    "movl %%edx,(%%ebx)\n"      /* *rp = remainder                  */
    "movl %%eax,%0\n"           /* q = quotient                     */
    : "=m"(q)
    : "m"(a),"m"(b),"m"(c),"m"(m),"m"(rp)
    : "eax","ebx","edx","cc","memory"
    );
    return q;
}
/*
 * muldvm (GCC inline assembly, 32-bit x86)
 *
 * Divides the 64-bit value edx:eax = a:c by m, stores the remainder
 * through rp and returns the quotient.
 *
 * %edx is written directly and by divl, and divl leaves the flags
 * undefined, so "edx" and "cc" are declared as clobbers.
 */
mr_small muldvm(mr_small a,mr_small c,mr_small m,mr_small *rp)
{
    mr_small q;
    __asm__ __volatile__ (
    "movl %1,%%edx\n"           /* high word = a                    */
    "movl %2,%%eax\n"           /* low word = c                     */
    "divl %3\n"                 /* eax = quotient, edx = remainder  */
    "movl %4,%%ebx\n"           /* ebx = rp                         */
    "movl %%edx,(%%ebx)\n"      /* *rp = remainder                  */
    "movl %%eax,%0\n"           /* q = quotient                     */
    : "=m"(q)
    : "m"(a),"m"(c),"m"(m),"m"(rp)
    : "eax","ebx","edx","cc","memory"
    );
    return q;
}
/*
 * muldvd (GCC inline assembly, 32-bit x86)
 *
 * Computes a*b+c in edx:eax, stores the low word through rp and returns
 * the high word.
 *
 * mull overwrites %edx and the arithmetic changes the flags, so "edx"
 * and "cc" are declared as clobbers.
 */
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
{
    mr_small q;
    __asm__ __volatile__ (
    "movl %1,%%eax\n"           /* eax = a                */
    "mull %2\n"                 /* edx:eax = a*b          */
    "addl %3,%%eax\n"           /* low word += c          */
    "adcl $0,%%edx\n"           /* carry into high word   */
    "movl %4,%%ebx\n"           /* ebx = rp               */
    "movl %%eax,(%%ebx)\n"      /* *rp = low word         */
    "movl %%edx,%0\n"           /* q = high word          */
    : "=m"(q)
    : "m"(a),"m"(b),"m"(c),"m"(rp)
    : "eax","ebx","edx","cc","memory"
    );
    return q;
}
/*
 * muldvd2 (GCC inline assembly, 32-bit x86)
 *
 * Computes a*b + *c + *rp in edx:eax, stores the low word to *rp and the
 * high word to *c.
 *
 * mull overwrites %edx and the arithmetic changes the flags, so "edx"
 * and "cc" are declared as clobbers.
 */
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
{
    __asm__ __volatile__ (
    "movl %0,%%eax\n"           /* eax = a                */
    "mull %1\n"                 /* edx:eax = a*b          */
    "movl %2,%%ebx\n"           /* ebx = c                */
    "addl (%%ebx),%%eax\n"      /* low word += *c         */
    "adcl $0,%%edx\n"           /* carry into high word   */
    "movl %3,%%esi\n"           /* esi = rp               */
    "addl (%%esi),%%eax\n"      /* low word += *rp        */
    "adcl $0,%%edx\n"           /* carry into high word   */
    "movl %%eax,(%%esi)\n"      /* *rp = low word         */
    "movl %%edx,(%%ebx)\n"      /* *c = high word         */
    :
    : "m"(a),"m"(b),"m"(c),"m"(rp)
    : "eax","ebx","edx","esi","cc","memory"
    );
}
***********************************************************
;
; Watcom C/386 32-bit compiler V7.0. Use with mirdef.h32
; Most parameters passed in registers
; Written for Phar Lap 386ASM macro-assembler
;
; V4.0 NOTE! Inline assembly versions of these routines,
; are also available. See miracl.h for details
;
.386
ASSUME CS:_TEXT
_TEXT SEGMENT BYTE PUBLIC 'CODE'
;
; muldiv_ - as used below: eax=a, edx=b, ebx=c, ecx=m; the fifth
; argument rp is read from the stack at [esp+4] and removed by "ret 4".
; Quotient is returned in eax, remainder is stored through rp.
;
PUBLIC muldiv_
muldiv_ PROC NEAR
mul edx ;multiply a*b
add eax,ebx ;add in c
adc edx,0 ;carry
div ecx ;divide by m
mov ebx,[esp+4] ;ebx = rp (c is no longer needed)
mov [ebx],edx ;remainder
ret 4 ;quotient in eax
muldiv_ endP
;
; muldvm_ - as used below: eax=a, edx=c, ebx=m, ecx=rp.
; The xchg puts a in the high half (edx) and c in the low half (eax)
; of the dividend. Quotient in eax, remainder stored through ecx.
;
PUBLIC muldvm_
muldvm_ PROC NEAR
xchg eax,edx ;a*base+c
div ebx ;divide by m
mov [ecx],edx ;store remainder
ret ;quotient in eax
muldvm_ endP
;
; muldvd_ - as used below: eax=a, edx=b, ebx=c, ecx=rp.
; Forms a*b+c in edx:eax; low half stored through ecx, high half
; returned in eax.
;
PUBLIC muldvd_
muldvd_ PROC NEAR
mul edx ;multiply a*b
add eax,ebx ;add in c
adc edx,0 ;carry into high half
mov [ecx],eax ;store remainder
mov eax,edx ;get quotient
ret ;quotient in eax
muldvd_ endP
_TEXT ENDS
END
*******************************************************************
;
; Zortech C/386 32-bit compiler V2.1
; Use with mirdef.h32
; Written for Phar lap 386ASM macro-assembler
;
.386
ASSUME CS:_TEXT
_TEXT SEGMENT BYTE PUBLIC 'CODE'
PUBLIC _muldiv
;
; _muldiv: eax = (a*b+c)/m, *rp = remainder
; stack: [esp+4]=a [esp+8]=b [esp+12]=c [esp+16]=m [esp+20]=rp
; The pointer is held in ecx rather than ebx so that no register a
; C caller may expect to be preserved is modified.
;
_muldiv PROC NEAR
        mov     eax,DWORD PTR [esp+4]   ;eax = a
        mul     DWORD PTR [esp+8]       ;edx:eax = a*b
        add     eax,DWORD PTR [esp+12]  ;low dword += c
        adc     edx,0                   ;carry into high dword
        div     DWORD PTR [esp+16]      ;eax = quotient, edx = remainder
        mov     ecx,DWORD PTR [esp+20]  ;ecx = rp
        mov     [ecx],edx               ;*rp = remainder
        ret
_muldiv endP
PUBLIC _muldvm
;
; _muldvm: divides the 64-bit value a:c (a high, c low) by m
; stack: [esp+4]=a [esp+8]=c [esp+12]=m [esp+16]=rp
; The pointer is held in ecx rather than ebx so that no register a
; C caller may expect to be preserved is modified.
;
_muldvm PROC NEAR
        mov     edx,DWORD PTR [esp+4]   ;high dword = a
        mov     eax,DWORD PTR [esp+8]   ;low dword = c
        div     DWORD PTR [esp+12]      ;eax = quotient, edx = remainder
        mov     ecx,DWORD PTR [esp+16]  ;ecx = rp
        mov     [ecx],edx               ;*rp = remainder
        ret
_muldvm endP
PUBLIC _muldvd
;
; _muldvd: forms a*b+c in edx:eax; low dword -> *rp, high dword -> eax
; stack: [esp+4]=a [esp+8]=b [esp+12]=c [esp+16]=rp
; The pointer is held in ecx rather than ebx so that no register a
; C caller may expect to be preserved is modified.
;
_muldvd PROC NEAR
        mov     eax,DWORD PTR [esp+4]   ;eax = a
        mul     DWORD PTR [esp+8]       ;edx:eax = a*b
        add     eax,DWORD PTR [esp+12]  ;low dword += c
        adc     edx,0                   ;carry into high dword
        mov     ecx,DWORD PTR [esp+16]  ;ecx = rp
        mov     [ecx],eax               ;*rp = low dword
        mov     eax,edx                 ;return high dword
        ret
_muldvd endP
_TEXT ENDS
END
************************************************************************
/*
 * muldiv - 68000 inline-assembly version (16-bit int).
 * Forms a*b+c as a 32-bit value in D1, divides by m; the quotient is
 * left in D0 and the remainder is stored through rp.
 */
unsigned int muldiv(a,b,c,m,rp)
unsigned int a,b,c,m,*rp;
{
asm
{
;
; MACintosh version for Megamax or Lightspeed Think C compiler
; with 16-bit int, 68000 processor
; For a 32 bit version for the 68020, see below
;
move a(A6),D1 ;get a
mulu b(A6),D1 ;multiply by b
clr.l D0 ;clear all of D0 so c is zero-extended
move c(A6),D0 ;get c
add.l D0,D1 ;D1 contains a*b+c
divu m(A6),D1 ;divide by m
move D1,D0 ;return with quotient in D0
swap D1 ;get remainder
move.l rp(A6),A0 ;get address for remainder
move D1,(A0) ;store remainder
}
}
/*
 * muldvm - 68000 inline-assembly version (16-bit int).
 * Builds the 32-bit value with a in the high word and c in the low word,
 * divides by m; quotient is left in D0, remainder stored through rp.
 */
unsigned int muldvm(a,c,m,rp)
unsigned int a,c,m,*rp;
{
asm
{
;
; Version of muldvm for Apple MAC
;
clr.l D1
move a(A6),D1 ;get a
swap D1 ;move a to high word
move c(A6),D1 ;add in c
divu m(A6),D1 ;divide by m
move D1,D0 ;return quotient in D0
swap D1 ;get remainder
move.l rp(A6),A0 ;get address for remainder
move D1,(A0) ;store remainder
}
}
/*
 * muldvd - 68000 inline-assembly version (16-bit int).
 * Forms a*b+c as a 32-bit value; the high word is left in D0 and the
 * low word is stored through rp.
 */
unsigned int muldvd(a,b,c,rp)
unsigned int a,b,c,*rp;
{
asm
{
;
; Version of muldvd for Apple MAC
;
move a(A6),D1 ;get a
mulu b(a6),D1 ;multiply by b
clr.l D0 ;clear all of D0 so c is zero-extended
move c(A6),D0 ;get c
add.l D0,D1 ;add in c
move.l D1,D0 ;copy the 32-bit result
swap D0 ;return quotient in D0
move.l rp(A6),A0 ;get address for remainder
move D1,(A0) ;store remainder
}
}
**********************************************************************
#
# 68020+ versions for Next, and for new 32-bit Macs
# Parameters come off the stack
#
.globl _muldiv,_muldvm,_muldvd
# _muldiv: stack sp@(4)=a sp@(8)=b sp@(12)=c sp@(16)=m sp@(20)=rp
# quotient is left in d0, remainder is stored through rp
_muldiv:
movel sp@(4),d0 # d0 = a
mulul sp@(8),d1:d0 # d1:d0 = a*b
addl sp@(12),d0 # low word += c
negxl d1 # tricky stuff!
negl d1 # the negx/neg pair adds the carry from addl into d1
divul sp@(16),d1:d0 # divide by m
movel sp@(20),a0 # a0 = rp
movel d1,a0@ # *rp = remainder
rts
# _muldvm: stack sp@(4)=a sp@(8)=c sp@(12)=m sp@(16)=rp
# divides the 64-bit value d1:d0 = a:c by m
# quotient is left in d0, remainder is stored through rp
_muldvm:
movel sp@(4),d1 # high word = a
movel sp@(8),d0 # low word = c
divul sp@(12),d1:d0 # divide by m
movel sp@(16),a0 # a0 = rp
movel d1,a0@ # *rp = remainder
rts
# _muldvd: stack sp@(4)=a sp@(8)=b sp@(12)=c sp@(16)=rp
# forms a*b+c in d0:d1; high word is left in d0, low word stored through rp
_muldvd:
movel sp@(4),d1 # d1 = a
mulul sp@(8),d0:d1 # d0:d1 = a*b
addl sp@(12),d1 # low word += c
negxl d0 # same negx/neg trick as in _muldiv:
negl d0 # adds the carry from addl into the high word
movel sp@(16),a0 # a0 = rp
movel d1,a0@ # *rp = low word
rts
*************************************************************************
/*
 * muldiv - NS32016 inline-assembly version (32-bit int).
 * Forms a*b+c as an extended value in R0/R1, divides by m; per the
 * comments below the remainder goes to *rp and the quotient is
 * returned in R0.
 */
unsigned int muldiv(a,b,c,m,rp)
unsigned int a,b,c,m,*rp;
{
asm
{
;
; 32016 processor version for BBC Master Scientific
; with 32-bit int, by Dudley Long, Rutherford-Appleton Labs.
; No muldvm() or muldvd()
;
movd a,0 ;move a to R0
meid b,0 ;multiply by b, result extended
addd c,0 ;add c to extended number in R0 & R1
addcd #0,1 ;add carry into R1
deid m,0 ;divide by m
movd 0,0(rp) ;remainder to *rp
movd 1,0 ;quotient returned in R0
}
}
*******************************************************************
;
; NOTE! This code is obsolete. Newer ARMs support a 32x32 UMULL instruction
; The ARM compiler supports a long long type, so a C only version may be
; faster
;
; Acorn ARM Risc version (32-bit) for Archimedes micro
; Wingpass Macro Assembler
; Use with mirdef.h32
;
.INCLUDE "A.REGNAMES"
.AREA C$$code, .CODE, .READONLY
;
; muldiv - returns (a*b+c)/m in a1 and stores the remainder through rp
; On entry a1=a, a2=b, a3=c, a4=m; rp is fetched from the caller's stack
; through ip. v3 (low word) and v4 (high word) hold a*b+c.
; Two fast paths use shifts instead of multiply/divide when b or m
; equals 0x80000000 (MAXBASE).
;
muldiv::
MOV ip, sp ;standard linkage
STMFD sp!, {v1-v4} ;save the work registers
CMPS a2,#0x80000000 ;check for b=MAXBASE
MOVEQ v3,a1,LSL #31 ;this idea is quicker because
MOVEQ v4,a1,LSR #1 ;of ARM barrel shifting capability
BEQ addin
MOV v1,a1,LSR #16 ;do it the hard way
MOV v2,a2,LSR #16 ;v1 = a.hi, v2 = b.hi (16-bit halves)
BIC a1,a1,v1,LSL #16 ;a1 = a.lo
BIC a2,a2,v2,LSL #16 ;a2 = b.lo
MUL v3,a1,a2 ;form partial products of a*b
MUL v4,v1,v2 ;v3 = a.lo*b.lo, v4 = a.hi*b.hi
SUB v1,v1,a1 ;v1 = a.hi - a.lo
SUB v2,a2,v2 ;v2 = b.lo - b.hi
MLA v1,v2,v1,v3 ;look - only 3 MULs!
ADD v1,v1,v4 ;v1 = a.hi*b.lo + a.lo*b.hi (the cross term)
ADDS v3,v3,v1,LSL #16 ;low word += cross term << 16
ADC v4,v4,v1,LSR #16 ;high word += cross term >> 16, plus carry
addin:
ADDS v3,v3,a3 ;now add in c
ADCCS v4,v4,#0 ;on carry out, increment the high word
CMPS a4,#0x80000000 ;check for m=MAXBASE
MOVEQ a1,v3,LSR #31 ;quotient = (v4:v3) >> 31 ...
ADDEQ a1,a1,v4,LSL #1
BICEQ v4,v3,#0x80000000 ;... and remainder = low 31 bits
BEQ leave
MOV a1,#0 ;do long division by m
divlp:
.REPEAT 32 ;2xfaster than a loop!
MOVS v3,v3,ASL #1 ;get next bit into carry
ADC v4,v4,v4 ;accumulate remainder
CMPS v4,a4 ;carry set if remainder >= m
SUBCS v4,v4,a4 ;if so, subtract m
ADC a1,a1,a1 ;accumulate quotient
.ENDREPEAT
leave:
LDR v3,[ip] ;fetch rp from the caller's stack
STR v4,[v3] ;store remainder
LDMFD sp!, {v1-v4} ;restore the work registers
MOVS pc,lr
;
; muldvm - divides the double-length value a:c by m
; On entry a1=a, a2=c, a3=m, a4=rp
; Returns the quotient in a1 and stores the remainder through rp
;
muldvm::
STMFD sp!, {v1-v2} ;save the work registers
MOV v2,a1 ;'multiply' by 2^32
MOV v1,a2 ;add in c
MOV a1,#0 ;do long division by m
.REPEAT 32 ;2xfaster than a loop!
MOVS v1,v1,ASL #1 ;get next bit into carry
ADCS v2,v2,v2 ;accumulate remainder
CMPCCS v2,a3 ;compare with m only if the shift did not carry out
SUBCS v2,v2,a3 ;subtract m on shift carry-out or remainder >= m
ADC a1,a1,a1 ;accumulate quotient
.ENDREPEAT
STR v2,[a4] ;store remainder
LDMFD sp!, {v1-v2} ;restore the work registers
MOVS pc,lr
;
; muldvd - forms the double-length value a*b+c
; On entry a1=a, a2=b, a3=c, a4=rp
; Returns the high word in a1 and stores the low word through rp
; v1 (low word) and v2 (high word) hold the running result
;
muldvd::
STMFD sp!, {v1-v2} ;save the work registers
MOV ip,a1,LSR #16 ;do it the hard way
MOV v2,a2,LSR #16 ;ip = a.hi, v2 = b.hi (16-bit halves)
BIC a1,a1,ip,LSL #16 ;a1 = a.lo
BIC a2,a2,v2,LSL #16 ;a2 = b.lo
MUL v1,a1,a2 ;form partial products of a*b
MUL a2,ip,a2 ;a2 = a.hi*b.lo
MUL a1,v2,a1 ;a1 = b.hi*a.lo
MUL v2,ip,v2 ;v2 = a.hi*b.hi
ADDS a1,a2,a1 ;a1 = sum of the two cross terms
ADDCS v2,v2,#0x10000 ;a carry out of that sum is worth 2^16 in the high word
ADDS v1,v1,a1,LSL #16 ;low word += cross terms << 16
ADC v2,v2,a1,LSR #16 ;high word += cross terms >> 16, plus carry
ADDS v1,v1,a3 ;now add in c
ADCCS v2,v2,#0 ;on carry out, increment the high word
MOV a1,v2 ;get quotient
STR v1,[a4] ;store remainder
LDMFD sp!, {v1-v2} ;restore the work registers
MOVS pc,lr
********************************************************************
;
; Version for Pyramid 90x and 98x computers
; from Rod Worley, Monash University, Victoria, Australia
;
; No muldvm() or muldvd()
;
.text 0
.globl _muldiv
;
; _muldiv - returns (a*b+c)/m in pr0 and stores the remainder through pr4
; As used below: pr0=a, pr1=b, pr2=c, pr3=m, pr4=rp
;
_muldiv:
movw pr0,pr8 ;save a in reg 8
movw $0x0,pr0 ;zero reg0 so long reg 0,1 is b
emul pr8,pr0 ;extended multiply by a
addw pr2,pr1 ;add c to extended result
addwc $0x0,pr0 ;add the carry into the high word
ediv pr3,pr0 ;extended div by m
movw pr1,(pr4) ;store remainder
ret ;return quotient in pr0
************************************************************************
/* This is the transputer version, by A.H. Pepperdine */
/* Assumes that the result will fit into a 32-bit word */
/* The error flag will be set if */
/* (a*b+c)/m >= 2**32 */
/* ie. equivalently, if */
/* ( (a*b+c) >> 32) >= m */
/* muldiv: returns (a*b+c)/m and stores (a*b+c)%m through rp.
   a, b and c are loaded for the long multiply (lmul), m is then loaded
   for the long divide (ldiv), and the two results are stored to q and
   *rp by stab. */
unsigned int muldiv(unsigned int a, unsigned int b, unsigned int c,
unsigned int m, unsigned int * rp)
{
unsigned int q; /* quotient, written by the stab below */
__asm
{
ldabc a, b, c;
lmul ;
ld m;
ldiv ;
stab q, *rp;
}
return q;
}
/* The base is 2**32, ie a full 32-bit unsigned integer */
/* The error flag will be set if the result will not fit*/
/* into a word, ie. */
/* for muldvm that is if (a >= m) */
/* and for muldvd it cannot happen */
/* muldvm: returns (a*base+c)/m and stores the remainder through rp,
   where base is 2**32. m, c and a are loaded directly for the long
   divide (ldiv); the two results are stored to q and *rp by stab. */
unsigned int muldvm(unsigned int a, unsigned int c,
unsigned int m, unsigned int * rp)
{
unsigned int q; /* quotient, written by the stab below */
__asm
{
ldabc m, c, a;
ldiv ;
stab q, *rp;
}
return q;
}
/* muldvd: returns (a*b+c)/base and stores (a*b+c)%base through rp,
   where base is 2**32. Note that the stab operand order is the reverse
   of that used after ldiv in muldiv/muldvm: here *rp comes first and q
   second. */
unsigned int muldvd(unsigned int a, unsigned int b, unsigned int c,
unsigned int * rp)
{
unsigned int q; /* high word of a*b+c, written by the stab below */
__asm
{
ldabc a, b, c;
lmul ;
stab *rp, q;
}
return q;
}
*********************************************************************
/* Now ... just to confuse you even more ....
Blakeley/Sloan 'portable' method for Modular multiplication IEEE Trans
Computers C-34 March 1985 pp 290-292 eliminates need for double length
product - but will be slow. Might suit some RISC computers with no
multiply/divide instructions. To speed up try completely unravelling for()
loops.
This method should only be used if the mr_utype data type is twice the size
of a "mr_hltype" data-type. This must be defined below.
Note: DON'T define MR_NOASM in mirdef.h if using this method.
*/
#include <stdio.h>
#include "miracl.h"
mr_small muldiv(a,b,c,m,rp)
mr_small a,b,c,m;
mr_small *rp;
{ /* Blakely-Sloan: returns (a*b+c)/m, remainder to *rp.
     b and c are consumed one bit at a time, top bit first */
    mr_small quo=0,rem=0;
    mr_small wrap=m-a;   /* rem-wrap is rem+a-m: add a and reduce once */
    int pass=MIRACL/4;   /* each pass performs four bit-steps */

/* One bit-step: shift the top bit of c into rem; if the top bit of b is
   set, add a; keep rem below m, counting each reduction in quo */
#define MR_BS_STEP \
    rem<<=1; \
    if ((mr_utype)c<0) rem++; \
    c<<=1; \
    quo<<=1; \
    if ((mr_utype)b<0) \
    { \
        if (rem>=m) { rem-=wrap; quo++; } \
        else rem+=a; \
    } \
    if (rem>=m) { rem-=m; quo++; } \
    b<<=1;

    while (pass>0)
    {
        MR_BS_STEP
        MR_BS_STEP
        MR_BS_STEP
        MR_BS_STEP
        pass--;
    }
#undef MR_BS_STEP

    *rp=rem;
    return quo;
}
mr_small muldvm(a,c,m,rp)
mr_small a,c,m;
mr_small *rp;
{ /* modified Blakely-Sloan */
register int i,carry;
register mr_small q=0,r=0;
r=a;
for (i=MIRACL/4;i>0;i--)
{ /* do it bit by bit */
carry=0;
if ((mr_utype)r<0) carry=1;
r<<=1;
if ((mr_utype)c<0) r++;
c<<=1;
q<<=1;
if (carry || r>=m) { r-=m; q++; }
carry=0;
if ((mr_utype)r<0) carry=1;
r<<=1;
if ((mr_utype)c<0) r++;
c<<=1;
q<<=1;
if (carry || r>=m) { r-=m; q++; }
carry=0;
if ((mr_utype)r<0) carry=1;
r<<=1;
if ((mr_utype)c<0) r++;
c<<=1;
q<<=1;
if (carry || r>=m) { r-=m; q++; }
carry=0;
if ((mr_utype)r<0) carry=1;
r<<=1;
if ((mr_utype)c<0) r++;
c<<=1;
q<<=1;
if (carry || r>=m) { r-=m; q++; }
}
*rp=r;
return q;
}
/* define mr_hltype as that C type that is half the size in bits of the
underlying type (mr_utype in mirdef.h). Perhaps short if mr_utype is long?
Possibly int if mr_utype is a 64-bit long long ?? */
#define mr_hltype short
mr_small muldvd(a,b,c,rp)
mr_small a,b,c;
mr_small *rp;
{ /* multiply by parts */
mr_small middle,middle2;
mr_small q,r;
unsigned mr_hltype am,al,bm,bl;
int hshift=(MIRACL>>1);
am=(unsigned mr_hltype)(a>>hshift);
al=(unsigned mr_hltype)a;
bm=(unsigned mr_hltype)(b>>hshift);
bl=(unsigned mr_hltype)b;
/* form partial products */
r= (mr_small)al*bl;
q= (mr_small)am*bm;
middle=(mr_small)al*bm;
middle2=(mr_small)bl*am;
middle+=middle2; /* combine them - carefully */
if (middle<middle2) q+=((mr_small)1<<hshift);
r+=(middle << hshift);
if ((r>>hshift)<(unsigned mr_hltype)middle) q++;
q+=(middle>>hshift);
r+=c;
if (r<c) q++;
*rp=r;
return q;
}
void muldvd2(a,b,c,rp)
mr_small a,b;
mr_small *c,*rp;
{ /* multiply by parts */
mr_small middle,middle2;
mr_small q,r;
unsigned mr_hltype am,al,bm,bl;
int hshift=(MIRACL>>1);
am=(unsigned mr_hltype)(a>>hshift);
al=(unsigned mr_hltype)a;
bm=(unsigned mr_hltype)(b>>hshift);
bl=(unsigned mr_hltype)b;
/* form partial products */
r= (mr_small)al*bl;
q= (mr_small)am*bm;
middle=(mr_small)al*bm;
middle2=(mr_small)bl*am;
middle+=middle2; /* combine them - carefully */
if (middle<middle2) q+=((mr_small)1<<hshift);
r+=(middle << hshift);
if ((r>>hshift)<(unsigned mr_hltype)middle) q++;
q+=(middle>>hshift);
r+=*c;
if (r<*c) q++;
r+=*rp;
if (r<*rp) q++;
*rp=r;
*c=q;
}
*************************************************************************
/* SPARC assembler version of above. Note that when Full-width base
working is used, then muldvd() is the most time-critical of these
three routines. Use with above Blakely-Sloan C versions of muldvm
and muldiv (Assumes mr_utype is 32 bit int) */
.global _muldvd
_muldvd:
mov %o1,%y
andcc %g0,%g0,%o4
nop
nop
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%o0,%o4
mulscc %o4,%g0,%o4
tst %o0
bge 1f
nop
add %o4,%o1,%o4
1:
rd %y,%o1
addcc %o1,%o2,%o1
st %o1,[%o3]
retl
addxcc %o4,%g0,%o0
**************************************************************************
/* If you have a "decent" SPARC which supports UMUL and UDIV instructions
then the following will be much faster. Cut and paste what follows
into mrmuldv.s. See miracl.mak make file
Aside: God, I hate the Sparc, with its slippery ill-defined Instruction
set. Not all implementations support UMUL and UDIV, so it's safer
to use the method above.
Note: Sometimes the routine name needs a preceding underscore,
so it may be necessary to change for example muldvd to _muldvd
through-out. Depends on the Unix version
*/
.global muldvd
/* muldvd: forms the double-length value a*b+c.
   As used below: %o0=a, %o1=b, %o2=c, %o3=rp.
   Returns the high word in %o0 and stores the low word at [rp]. */
muldvd:
umul %o0,%o1,%o0 /* %o0 = low word of a*b, high word left in %y */
rd %y,%o1 /* %o1 = high word */
addcc %o0,%o2,%o0 /* low word += c, setting carry */
st %o0,[%o3] /* store low word at [rp] */
retl
addx %o1,%g0,%o0 /* delay slot: return high word plus carry */
.global muldvd2
/* muldvd2: forms a*b + [c] + [rp].
   As used below: %o0=a, %o1=b, %o2=pointer c, %o3=pointer rp.
   Stores the low word at [rp] and the high word at [c]. */
muldvd2:
umul %o0,%o1,%o0 /* %o0 = low word of a*b, high word left in %y */
rd %y,%o1 /* %o1 = high word */
ld [%o2],%o5 /* %o5 = incoming value at [c] */
addcc %o0,%o5,%o0 /* low word += [c], setting carry */
ld [%o3],%o5 /* %o5 = incoming value at [rp] (ld does not alter the carry) */
addx %o1,%g0,%o1 /* high word += carry from the first add */
addcc %o0,%o5,%o0 /* low word += [rp], setting carry */
st %o0,[%o3] /* store low word at [rp] */
addx %o1,%g0,%o1 /* high word += carry from the second add */
retl
st %o1,[%o2] /* delay slot: store high word at [c] */
.global muldvm
/* muldvm: divides the double-length value a:c by m.
   As used below: %o0=a, %o1=c, %o2=m, %o3=rp.
   Returns the quotient in %o0 and stores the remainder at [rp]. */
muldvm:
mov %o0,%y /* %y = a, the high word of the dividend */
nop /* three nops between the write to %y and the udiv that reads it */
nop
nop
udiv %o1,%o2,%o0 /* %o0 = (%y:c) / m */
umul %o0,%o2,%o2 /* %o2 = low word of quotient*m */
sub %o1,%o2,%o1 /* remainder = c - quotient*m (low words) */
retl
st %o1,[%o3] /* delay slot: store remainder at [rp] */
.global muldiv
/* muldiv: returns (a*b+c)/m in %o0 and stores the remainder at [rp].
   As used below: %o0=a, %o1=b, %o2=c, %o3=m, %o4=rp. */
muldiv:
umul %o0,%o1,%o1 /* %o1 = low word of a*b, high word left in %y */
rd %y,%o0 /* %o0 = high word */
addcc %o1,%o2,%o1 /* low word += c, setting carry */
addx %o0,%g0,%o0 /* high word += carry */
mov %o0,%y /* %y = high word of the dividend */
nop /* three nops between the write to %y and the udiv that reads it */
nop
nop
udiv %o1,%o3,%o0 /* %o0 = (%y:low word) / m */
umul %o0,%o3,%o2 /* %o2 = low word of quotient*m */
sub %o1,%o2,%o1 /* remainder = low word - quotient*m */
retl
st %o1,[%o4] /* delay slot: store remainder at [rp] */
/* In-line assembly for SPARC using double type */
#include <stdio.h>
#include "miracl.h"
/* muldiv for a floating-point (double) mr_small on SPARC, using quad
   precision for the intermediate a*b+c. Returns the quotient
   (a*b+c)/m and stores the remainder through rp.

   The quotient is turned into an integer by adding and then
   subtracting MR_MAGIC. NOTE(review): this presumably relies on the
   FPU rounding mode selected elsewhere in the library - confirm.

   The output operand is declared early-clobber ("=&f"): %0 is written
   by fqtod before the last reads of %4 (m) and %6 (rp), so it must not
   share a register with any input. */
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
    mr_small q;
    static mr_small magic=MR_MAGIC;
    __asm__ __volatile__ (
    "fdmulq %1,%2,%%f0\n"       /* f0 = a*b (quad) */
    "fdtoq %3,%%f4\n"           /* f4 = c as quad */
    "faddq %%f0,%%f4,%%f0\n"    /* f0 = a*b+c */
    "fdtoq %4,%%f4\n"           /* f4 = m as quad */
    "fdivq %%f0,%%f4,%%f4\n"    /* f4 = (a*b+c)/m */
    "fdtoq %5,%%f8\n"           /* f8 = magic as quad */
    "faddq %%f4,%%f8,%%f4\n"    /* add magic ... */
    "fsubq %%f4,%%f8,%%f4\n"    /* ... and take it away again */
    "fqtod %%f4,%0\n"           /* q = quotient as double */
    "fdmulq %0,%4,%%f8\n"       /* f8 = q*m */
    "fsubq %%f0,%%f8,%%f0\n"    /* f0 = (a*b+c) - q*m */
    "fqtod %%f0,%%f10\n"        /* f10 = remainder as double */
    "std %%f10,[%6]\n"          /* *rp = remainder */
    : "=&f"(q)
    : "f"(a),"f"(b),"f"(c),"f"(m),"f"(magic),"r"(rp)
    : "f0","f1","f2","f3","f4","f5","f6","f7","f8","f9","f10","f11","memory"
    );
    return q;
}
#ifdef MR_FP_ROUNDING
/* imuldiv: as muldiv, but multiplies by im (passed by the caller; per
   the notes at the top of this file, the pre-calculated inverse of the
   modulus) instead of dividing by m. Returns the quotient and stores
   the remainder through rp.

   The quotient is turned into an integer by adding and then
   subtracting MR_MAGIC. NOTE(review): this presumably relies on the
   FPU rounding mode selected elsewhere in the library - confirm.

   The output operand is declared early-clobber ("=&f"): %0 is written
   by fqtod before the last reads of %5 (m) and %7 (rp), so it must not
   share a register with any input. */
mr_small imuldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_large im,mr_small *rp)
{
    mr_small q;
    static mr_small magic=MR_MAGIC;
    __asm__ __volatile__ (
    "fdmulq %1,%2,%%f0\n"       /* f0 = a*b (quad) */
    "fdtoq %3,%%f4\n"           /* f4 = c as quad */
    "faddq %%f0,%%f4,%%f0\n"    /* f0 = a*b+c */
    "fmulq %4,%%f0,%%f4\n"      /* f4 = im*(a*b+c) */
    "fdtoq %6,%%f8\n"           /* f8 = magic as quad */
    "faddq %%f4,%%f8,%%f4\n"    /* add magic ... */
    "fsubq %%f4,%%f8,%%f4\n"    /* ... and take it away again */
    "fqtod %%f4,%0\n"           /* q = quotient as double */
    "fdmulq %0,%5,%%f8\n"       /* f8 = q*m */
    "fsubq %%f0,%%f8,%%f0\n"    /* f0 = (a*b+c) - q*m */
    "fqtod %%f0,%%f10\n"        /* f10 = remainder as double */
    "std %%f10,[%7]\n"          /* *rp = remainder */
    : "=&f"(q)
    : "f"(a),"f"(b),"f"(c),"f"(im),"f"(m),"f"(magic),"r"(rp)
    : "f0","f1","f2","f3","f4","f5","f6","f7","f8","f9","f10","f11","memory"
    );
    return q;
}
#endif
/* before leaving the SPARC, here is an interesting idea
Specify the underlying type as the 64-bit long long, as supported by the
GCC compiler. Use the Blakely-Sloan Portable Code above, with mr_hltype
defined as a long. This has been tried and works, getting 64-bit
behaviour from a 32-bit processor! It's slower than the 32-bit code above,
but if the 64-bit mrmuldvd() were rewritten in fast assembler.....? */
************************************************************************
#########################################################################################
#
# mrmuldv.s
# author: G. Garth Feb 1996
#
# implementation of modular multiplication for smalls
# using Blakely-Sloan division algorithm
# for Motorola 601 and 604 RISC PowerPC 32-bit processors
# see IEEE trans. Computers C-34, No. 3, March 1985 pp. 290-292
#
# see also PowerPC Microprocessor Developer's guide
# by Bunda, Potter & Shadowen, SAMS 1995, Appendix A p. 177
#
# intended for use in MIRACL library as assembly language implementation
# of routines muldiv, muldvm and muldvd
# written for Apple MPW PPC Assembler for Macintosh PPC computers
#
# Division Algorithm Pseudo Code
# given: integers A,B,C,D and M where D = A * B + C
# this algorithm computes Q and R such that
# D = M * Q + R
# Constraints:
# A,B,C,M < 2^H where H is word length in bits
# 0 <= Q,R < M; 0 < D < 2^(2*H)
#
# let K = # of bits in D
#
# R = Q = 0;
# for(T = K - 1; T >= 0; T--)
# {
# R <<= 1;
# Q <<= 1;
# if(D[T] == 1)
# {
# R += 1;
# }
# while(R >= M)
# {
# R -= M;
# Q += 1;
# }
# }
#
#########################################################################################
# Each routine is exported under two symbols: name[DS] and .name[PR]
export muldiv[DS]
export .muldiv[PR]
export muldvm[DS]
export .muldvm[PR]
export muldvd[DS]
export .muldvd[PR]
# TOC entries, one per routine, each referring to the routine's [DS] csect
toc
tc muldiv[TC],muldiv[DS]
tc muldvm[TC],muldvm[DS]
tc muldvd[TC],muldvd[DS]
# Each [DS] csect holds two words: the address of the routine's [PR]
# code csect, then TOC[tc0]
csect muldiv[DS]
dc.l .muldiv[PR]
dc.l TOC[tc0]
csect muldvm[DS]
dc.l .muldvm[PR]
dc.l TOC[tc0]
csect muldvd[DS]
dc.l .muldvd[PR]
dc.l TOC[tc0]
#
# unsigned int muldiv(a,b,c,m,rp)
# unsigned int a,b,c,m,*rp;
# returns q = int[(a*b+c)/m] and *rp = (a*b+c) mod m
# when called a -> (r3), b -> (r4), c -> (r5), m -> (r6), rp -> (r7)
# upon return q -> (r3), *rp -> [(r12)]
# registers used: r3 thru r12
#
;muldiv: forms the 64-bit dividend a*b+c, then
; - if its high word is zero, divides with a single divwu
; - otherwise runs a shift-and-subtract loop (divlong) in which
;   shift.hi:shift.lo (r5:r6) is the running remainder, r3 the quotient,
;   and r7:r8 hold the negated divisor so that the compare and the
;   subtract are both done by addition
csect .muldiv[PR]
function .muldiv[PR]
or r12,r7,r7 ;(r12) <- remainder address
mulhwu r8,r3,r4 ;(r8) <- a * b high word
mullw r9,r3,r4 ;(r9 ) <- a * b low word
addc r4,r5,r9 ;(r4) <- a * b + c dividend.lo
addze r3,r8 ;(r3) <- (r8) + XERca dividend.hi
subic. r5,r3,0 ;test for zero dividend.hi
bne divlong ;dividend.hi non-zero, take the long path
;here if dividend is single word
divwu r3,r4,r6 ;(r3) <- quotient
mullw r7,r6,r3; ;(r7) <- r6 * int (r4 / r6)
subf r5,r7,r4 ;(r5) <- remainder.lo
stw r5,0x0000(r12) ;[(r12)] <- remainder
blr ;that's all for single word division
divlong:
xor r7,r7,r7 ;zero divisor.hi
nor r7,r7,r7 ;calc ~divisor.hi
subfic r8,r6,0 ;(r8) <- -divisor.lo, set CA
addze r7,r7 ;(r7) <- ~divisor.hi + CA
or r11,r4,r4 ;(r11) <- dividend.lo
or r4,r3,r3 ;(r4) <- dividend.hi
;try to shift ahead, skipping unnecessary
;shifting loops
cntlzw r10,r4 ;find order of dividend.hi
subfic r9,r10,32 ;calc shift = 32 - order
slw r4,r4,r10 ;shift ahead dividend.hi
srw r3,r11,r9 ;get shifted part of dividend.lo
or r4,r4,r3 ;combine with dividend.hi
slw r11,r11,r10 ;shift ahead dividend.lo
addi r9,r9,33 ;setup for looping
mtctr r9 ;CTR <- loop count
xor r3,r3,r3 ;clear quotient.lo
xor r5,r5,r5 ;clear shift.hi
xor r6,r6,r6 ;clear shift.lo
b ldiff ;skip first round of shifting
align 6 ;align loop to 64-byte boundary
lshift:
rlwinm r5,r5,1,0,30 ;shift.hi <<= 1
rlwimi r5,r6,1,31,31 ;shift.hi[31] = shift.lo[0]
rlwinm r6,r6,1,0,30 ;shift.lo <<= 1
rlwimi r6,r4,1,31,31 ;shift.lo[31] = dividend.hi[0]
rlwinm r4,r4,1,0,30 ;dividend.hi <<= 1
rlwimi r4,r11,1,31,31 ;dividend.hi[31] = dividend.lo[0]
rlwinm r11,r11,1,0,30 ;dividend.lo <<= 1
rlwinm r3,r3,1,0,30 ;quotient.lo <<=1
ldiff:
addc r10,r6,r8 ;diff.lo = shift.lo - divisor.lo, set CA
adde. r9,r5,r7 ;diff.hi = shift.hi - divisor.hi + CA
blt lloop ;loop if diff < 0
or r6,r10,r10 ;shift.lo = diff.lo
or r5,r9,r9 ;shift.hi = diff.hi
ori r3,r3,1 ;set bit in quotient
lloop:
bdnz lshift ;loop until done
stw r6,0x0000(r12) ;store remainder in rp address
blr ;return
#
# unsigned int muldvm(a,c,m,rp)
# unsigned int a,c,m,*rp;
# returns q = int[(a*base+c)/m] and *rp = (a*base+c) mod m
# when called a -> (r3), c -> (r4), m -> (r5), rp -> (r6)
# upon return q -> (r3), *rp -> [(r12)]
# registers used: r3 thru r12
#
;muldvm: divides the 64-bit dividend a:c (a = high word, c = low word)
;by m with a shift-and-subtract loop. shift.hi:shift.lo (r5:r6) is the
;running remainder, r3 the quotient, and r7:r8 hold the negated divisor
;so that the compare and the subtract are both done by addition
csect .muldvm[PR]
function .muldvm[PR]
or r12,r6,r6 ;(r12) <- remainder address
or r6,r5,r5 ;(r6) <- m
xor r7,r7,r7 ;zero divisor.hi
nor r7,r7,r7 ;calc ~divisor.hi
subfic r8,r6,0 ;(r8) <- calc -divisor.lo, set CA
addze r7,r7 ;(r7) <- ~divisor.hi += CA
or r11,r4,r4 ;(r11) <- dividend.lo
or r4,r3,r3 ;(r4) <- dividend.hi
;try to shift ahead, skipping unnecessary
;shifting loops
cntlzw r10,r4 ;find order of dividend.hi
subfic r9,r10,32 ;calc shift = 32 - order
slw r4,r4,r10 ;shift ahead dividend.hi
srw r3,r11,r9 ;get shifted part of dividend.lo
or r4,r4,r3 ;combine with dividend.hi
slw r11,r11,r10 ;shift ahead dividend.lo
addi r9,r9,33 ;setup for looping
mtctr r9 ;CTR <- loop count
xor r3,r3,r3 ;clear quotient.lo
xor r5,r5,r5 ;clear shift.hi
xor r6,r6,r6 ;clear shift.lo
b sdiff ;skip first round of shifting
align 6 ;align loop to 64-byte boundary
sshift:
rlwinm r5,r5,1,0,30 ;shift.hi <<= 1
rlwimi r5,r6,1,31,31 ;shift.hi[31] = shift.lo[0]
rlwinm r6,r6,1,0,30 ;shift.lo <<= 1
rlwimi r6,r4,1,31,31 ;shift.lo[31] = dividend.hi[0]
rlwinm r4,r4,1,0,30 ;dividend.hi <<= 1
rlwimi r4,r11,1,31,31 ;dividend.hi[31] = dividend.lo[0]
rlwinm r11,r11,1,0,30 ;dividend.lo <<= 1
rlwinm r3,r3,1,0,30 ;quotient.lo <<=1
sdiff:
addc r10,r6,r8 ;diff.lo = shift.lo - divisor.lo, set CA
adde. r9,r5,r7 ;diff.hi = shift.hi - divisor.hi + CA
blt sloop ;loop if diff < 0
or r6,r10,r10 ;shift.lo = diff.lo
or r5,r9,r9 ;shift.hi = diff.hi
ori r3,r3,1 ;set bit in quotient
sloop:
bdnz sshift ;loop until done
stw r6,0x0000(r12) ;store remainder in rp address
blr ;return
#
# unsigned int muldvd(a,b,c,rp)
# unsigned int a,b,c,*rp;
# returns q = int[(a*b+c)/base] and *rp = (a*b+c) mod base
# when called a -> (r3), b -> (r4), c -> (r5), rp -> (r6)
# upon return q -> (r3), *rp -> [(r6)]
# registers used: r3 thru r8
#
;muldvd: forms the 64-bit value a*b+c, returns its high word in r3 and
;stores its low word at the address in r6
csect .muldvd[PR]
function .muldvd[PR]
mulhwu r7,r3,r4 ;(r7) <- a * b high word
mullw r8,r3,r4 ;(r8) <- a * b low word
addc r4,r8,r5 ;(r4) <- a * b + c
addze r3,r7 ;(r3) <- (r7) + XERca
stw r4,0x0000(r6) ;store remainder -> (r6)
blr ;return
*****************************************************************************/
/* Itanium code for Intel compiler, with mr_small a 64-bit long */
#include "miracl.h"
mr_small muldiv(a,b,c,m,rp)
mr_small a,b,c,m;
mr_small *rp;
{ /* Blakely-Sloan: returns (a*b+c)/m, remainder to *rp.
     b and c are consumed one bit at a time, top bit first */
    mr_small quo=0,rem=0;
    mr_small wrap=m-a;   /* rem-wrap is rem+a-m: add a and reduce once */
    int pass=MIRACL/4;   /* each pass performs four bit-steps */

/* One bit-step: shift the top bit of c into rem; if the top bit of b is
   set, add a; keep rem below m, counting each reduction in quo */
#define MR_BS_STEP \
    rem<<=1; \
    if ((mr_utype)c<0) rem++; \
    c<<=1; \
    quo<<=1; \
    if ((mr_utype)b<0) \
    { \
        if (rem>=m) { rem-=wrap; quo++; } \
        else rem+=a; \
    } \
    if (rem>=m) { rem-=m; quo++; } \
    b<<=1;

    while (pass>0)
    {
        MR_BS_STEP
        MR_BS_STEP
        MR_BS_STEP
        MR_BS_STEP
        pass--;
    }
#undef MR_BS_STEP

    *rp=rem;
    return quo;
}
mr_small muldvm(a,c,m,rp)
mr_small a,c,m;
mr_small *rp;
{ /* modified Blakely-Sloan */
register int i,carry;
register mr_small q=0,r=0;
r=a;
for (i=MIRACL/4;i>0;i--)
{ /* do it bit by bit */
carry=0;
if ((mr_utype)r<0) carry=1;
r<<=1;
if ((mr_utype)c<0) r++;
c<<=1;
q<<=1;
if (carry || r>=m) { r-=m; q++; }
carry=0;
if ((mr_utype)r<0) carry=1;
r<<=1;
if ((mr_utype)c<0) r++;
c<<=1;
q<<=1;
if (carry || r>=m) { r-=m; q++; }
carry=0;
if ((mr_utype)r<0) carry=1;
r<<=1;
if ((mr_utype)c<0) r++;
c<<=1;
q<<=1;
if (carry || r>=m) { r-=m; q++; }
carry=0;
if ((mr_utype)r<0) carry=1;
r<<=1;
if ((mr_utype)c<0) r++;
c<<=1;
q<<=1;
if (carry || r>=m) { r-=m; q++; }
}
*rp=r;
return q;
}
/* use intrinsics for speed */
/* These are now in-lined - see miracl.h */
/*
#include <ia64intrin.h>
mr_small muldvd(a,b,c,rp)
mr_small a,b,c;
mr_small *rp;
{
*rp=_m64_xmalu(a,b,c);
return _m64_xmahu(a,b,c);
}
void muldvd2(a,b,c,rp)
mr_small a,b;
mr_small *c,*rp;
{
mr_small bot;
bot=_m64_xmalu(a,b,*c);
*c=_m64_xmahu(a,b,*c);
bot+=*rp;
if (bot<*rp) (*c)++;
*rp=bot;
}
*/
/
/ GNU C for Linux (AMD64)
/ Parameters are passed in rdi,rsi,rdx,rcx,r8....
/
.file "mrmuldv.s"
.text
.globl muldiv
/ muldiv(a,b,c,m,rp) - returns (a*b+c)/m, stores (a*b+c)%m at *rp
/ In:  rdi=a  rsi=b  rdx=c  rcx=m  r8=rp
/ Out: rax=quotient
/ Only caller-saved registers (rax, rdx, r9) are written, so nothing
/ needs to be pushed.
/ NOTE(review): divq faults if the quotient does not fit in 64 bits;
/ presumably callers guarantee this - confirm.
muldiv:
        movq    %rdx,%r9        # park c - mulq overwrites rdx
        movq    %rdi,%rax
        mulq    %rsi            # rdx:rax = a*b
        addq    %r9,%rax        # add c to the low word ...
        adcq    $0,%rdx         # ... and the carry to the high word
        divq    %rcx            # rax = quotient, rdx = remainder
        movq    %rdx,(%r8)      # *rp = remainder
        ret
.globl muldvm
/ muldvm(a,c,m,rp) - divides the 128-bit value a:c by m
/ In:  rdi=a (high word)  rsi=c (low word)  rdx=m  rcx=rp
/ Out: rax=quotient, remainder stored at *rp
/ Only caller-saved registers (rax, rdx, r8) are written, so nothing
/ needs to be pushed.
/ NOTE(review): divq faults if the quotient does not fit in 64 bits;
/ presumably callers ensure a < m - confirm.
muldvm:
        movq    %rdx,%r8        # park m - rdx must hold the high word
        movq    %rdi,%rdx
        movq    %rsi,%rax       # rdx:rax = a:c
        divq    %r8             # rax = quotient, rdx = remainder
        movq    %rdx,(%rcx)     # *rp = remainder
        ret
.globl muldvd
/ muldvd(a,b,c,rp) - forms the 128-bit value a*b+c
/ In:  rdi=a  rsi=b  rdx=c  rcx=rp
/ Out: rax=high word, low word stored at *rp
/ Only caller-saved registers (rax, rdx, r8) are written, so nothing
/ needs to be pushed.
muldvd:
        movq    %rdx,%r8        # park c - mulq overwrites rdx
        movq    %rdi,%rax
        mulq    %rsi            # rdx:rax = a*b
        addq    %r8,%rax        # add c to the low word ...
        adcq    $0,%rdx         # ... and the carry to the high word
        movq    %rax,(%rcx)     # *rp = low word
        movq    %rdx,%rax       # return the high word
        ret
.globl muldvd2
/ muldvd2(a,b,c,rp) - forms the 128-bit value a*b + *c + *rp
/ In:  rdi=a  rsi=b  rdx=pointer c  rcx=pointer rp
/ Out: low word stored at *rp, high word stored at *c
/ Only caller-saved registers (rax, rdx, r8) are written, so nothing
/ needs to be pushed.
muldvd2:
        movq    %rdx,%r8        # park the c pointer - mulq overwrites rdx
        movq    %rdi,%rax
        mulq    %rsi            # rdx:rax = a*b
        addq    (%r8),%rax      # add *c to the low word ...
        adcq    $0,%rdx         # ... and the carry to the high word
        addq    (%rcx),%rax     # add *rp to the low word ...
        adcq    $0,%rdx         # ... and the carry to the high word
        movq    %rax,(%rcx)     # *rp = low word
        movq    %rdx,(%r8)      # *c = high word
        ret
; Written by Ed Runnion with full rights granted to Shamus Software.
;
; An implementation of mrmuldv routines for miracl
; for ml64 assembler used by Microsoft Visual Studio (VC8) and X64 processor (AMD 64)
; X64 arguments are passed in RCX, RDX, R8, R9, Stack...
;/*
; * MIRACL compiler/hardware definitions - mirdef.h
; * Copyright (c) 1988-2006 Shamus Software Ltd.
; */
;#define MR_LITTLE_ENDIAN
;#define MIRACL 64
;#define mr_utype __int64
;#define mr_unsign64 unsigned __int64
;#define MR_IBITS 32
;#define MR_LBITS 32
;#define mr_unsign32 unsigned int
;#define MR_FLASH 52
;#define MAXBASE ((mr_small)1<<(MIRACL-1))
;#define MR_BITSINCHAR 8
.code
ALIGN 16
PUBLIC muldiv
; muldiv(a,b,c,m,rp) - returns (a*b+c)/m in rax, stores the remainder at [rp]
; In: rcx=a, rdx=b, r8=c, r9=m; rp is the fifth argument, read from
; the stack at [rsp+28h]
; NOTE(review): div faults if the quotient does not fit in 64 bits;
; presumably callers guarantee this - confirm.
muldiv PROC
mov rax,rcx ; rax = a
mul rdx ; rdx:rax = a*b
add rax,r8 ; add c to the low word ...
adc rdx,0 ; ... and the carry to the high word
div r9 ; rax = quotient, rdx = remainder
mov r10, QWORD PTR [rsp+28h] ; r10 = rp
mov QWORD PTR[r10],rdx ; store remainder
ret
muldiv ENDP
ALIGN 16
PUBLIC muldvm
; muldvm(a,c,m,rp) - divides the 128-bit value a:c by m
; In: rcx=a (high word), rdx=c (low word), r8=m, r9=rp
; Out: rax=quotient, remainder stored at [rp]
; NOTE(review): div faults if the quotient does not fit in 64 bits;
; presumably callers ensure a < m - confirm.
muldvm PROC
mov rax,rdx ; rax = c (low word of dividend)
mov rdx,rcx ; rdx = a (high word of dividend)
div r8 ; rax = quotient, rdx = remainder
mov QWORD PTR[r9],rdx ; store remainder
ret
muldvm ENDP
ALIGN 16
PUBLIC muldvd
; muldvd(a,b,c,rp) - forms the 128-bit value a*b+c
; In: rcx=a, rdx=b, r8=c, r9=rp
; Out: rax=high word, low word stored at [rp]
muldvd PROC
mov rax,rcx ; rax = a
mul rdx ; rdx:rax = a*b
add rax,r8 ; add c to the low word ...
adc rdx,0 ; ... and the carry to the high word
mov QWORD PTR[r9],rax ; store low word
mov rax,rdx ; return high word
ret
muldvd ENDP
ALIGN 16
PUBLIC muldvd2
; muldvd2(a,b,c,rp) - forms the 128-bit value a*b + [c] + [rp]
; In: rcx=a, rdx=b, r8=pointer c, r9=pointer rp
; Out: low word stored at [rp], high word stored at [c]
muldvd2 PROC
mov rax,rcx ; rax = a
mul rdx ; rdx:rax = a*b
add rax,QWORD PTR[r8] ; add [c] to the low word ...
adc rdx,0 ; ... and the carry to the high word
add rax,QWORD PTR[r9] ; add [rp] to the low word ...
adc rdx,0 ; ... and the carry to the high word
mov QWORD PTR[r9],rax ; [rp] = low word
mov QWORD PTR[r8],rdx ; [c] = high word
ret
muldvd2 ENDP
end
/* Win64 C version of mrmuldv.c, for 64-bit Visual Studio apps */
#include "miracl.h"
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{ /* Blakely-Sloan: returns (a*b+c)/m, remainder to *rp.
     b and c are consumed one bit at a time, top bit first */
    mr_small quo=0,rem=0;
    mr_small wrap=m-a;   /* rem-wrap is rem+a-m: add a and reduce once */
    int pass=MIRACL/4;   /* each pass performs four bit-steps */

/* One bit-step: shift the top bit of c into rem; if the top bit of b is
   set, add a; keep rem below m, counting each reduction in quo */
#define MR_BS_STEP \
    rem<<=1; \
    if ((mr_utype)c<0) rem++; \
    c<<=1; \
    quo<<=1; \
    if ((mr_utype)b<0) \
    { \
        if (rem>=m) { rem-=wrap; quo++; } \
        else rem+=a; \
    } \
    if (rem>=m) { rem-=m; quo++; } \
    b<<=1;

    while (pass>0)
    {
        MR_BS_STEP
        MR_BS_STEP
        MR_BS_STEP
        MR_BS_STEP
        pass--;
    }
#undef MR_BS_STEP

    *rp=rem;
    return quo;
}
mr_small muldvm(mr_small a,mr_small c,mr_small m,mr_small *rp)
{ /* modified Blakely-Sloan: divides the double-length value a:c by m,
     returns the quotient, remainder to *rp */
    register int pass,spill;
    register mr_small quo=0,rem=a;

/* One bit-step: note whether the shift will push a set bit out of the
   top of rem, shift in the top bit of c, then subtract m if the bit
   was lost or rem has reached m */
#define MR_BS_STEP \
    spill=((mr_utype)rem<0); \
    rem<<=1; \
    if ((mr_utype)c<0) rem++; \
    c<<=1; \
    quo<<=1; \
    if (spill || rem>=m) { rem-=m; quo++; }

    for (pass=MIRACL/4;pass>0;pass--)
    { /* four bit-steps per pass */
        MR_BS_STEP
        MR_BS_STEP
        MR_BS_STEP
        MR_BS_STEP
    }
#undef MR_BS_STEP

    *rp=rem;
    return quo;
}
#ifndef MR_NOFULLWIDTH
/* These are now in-lined - see miracl.h */
/*
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
{
mr_small q,r;
r=_umul128(a,b,&q);
r+=c;
q+=(r<c);
*rp=r;
return q;
}
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
{
mr_small q,r;
r=_umul128(a,b,&q);
r+=*c;
q+=(r<*c);
r+=*rp;
q+=(r<*rp);
*rp=r;
*c=q;
}
*/
#endif