2780 lines
72 KiB
Plaintext
2780 lines
72 KiB
Plaintext
/*
|
|
* MIRACL - various implementations of routines muldiv, muldvm, muldvd
|
|
* muldvd2 and imuldiv
|
|
* mrmuldv.c
|
|
*
|
|
* THIS FILE CONTAINS MANY VERSIONS OF THESE ROUTINES
|
|
* COPY THIS FILE TO MRMULDV.C AND DELETE THOSE PARTS IRRELEVANT TO
|
|
* YOUR REQUIREMENTS.
|
|
*
|
|
* NOTE: - This file and its contents are not needed
|
|
* if MR_NOASM is defined in mirdef.h
|
|
*
|
|
* muldiv() calculates (a*b+c)/m and (a*b+c)%m as quickly as possible. Should
|
|
* ideally be written in assembly language of target machine for speed
|
|
* The problem is to avoid overflow in the calculation of the intermediate
|
|
* product a*b+c.
|
|
*
|
|
* If using a floating-point underlying type, and rounding can be
|
|
* controlled, it makes sense to pre-calculate
|
|
* the inverse of the modulus m, and multiply instead of divide
|
|
* In this situation a function imuldiv() is also needed.
|
|
*
|
|
* muldvm() and muldvd() routines are necessary to support full-width number
|
|
* base working. They are not needed if MR_NOFULLWIDTH is defined in mirdef.h.
|
|
*
|
|
* muldvm - returns (a*base+c)/m and remainder
|
|
* muldvd - returns (a*b+c)/base and remainder
|
|
*
|
|
* NOTE: New to version 4.2, new routine muldvd2() is required.
|
|
* See C version below for specification
|
|
* Versions of this are easily developed from existing muldvd() programs
|
|
*
|
|
* In most applications muldvd2() will be the time critical routine.
|
|
*
|
|
* Note that full-width base working may not be possible for all processors.
|
|
* For example it cannot be used on a VAX, or RS/6000 with mr_utypes defined
|
|
* as ints. This is because the instruction set does not support
|
|
* unsigned multiply and divide instructions. In such cases ALWAYS use a
|
|
* maximum base of MAXBASE in mirsys(), rather than 0.
|
|
*
|
|
* Since parameter passing and returning is time-consuming, these routines
|
|
* should be generated 'inline', if compiler allows it. Parameter passing
|
|
* by register will also be faster than via the stack. For even faster
|
|
* operation, use in-line assembly to speed up the inner loops of routines
|
|
* pmul(), sdiv(), multiply() and divide(). See these routines for details
|
|
* of Microsoft/Borland C inline 80x86 assembly, which gives a substantial speed-up.
|
|
*
|
|
* NOTE: All other things being equal, versions of MIRACL with 32-bit mr_utypes
|
|
* will run 3-4 times faster than versions with 16-bit mr_utypes, even for medium
|
|
* precision arithmetic, such as used in Public Key systems.
|
|
*
|
|
* Note that a portable C version of 'muldiv' may not be possible with some
|
|
* 32-bit compilers if ints and longs are both 32-bits and there is no
|
|
* 64-bit type. Fortunately these days there usually is such a type - called
|
|
* perhaps long long, or maybe __int64. See also the Blakely-Sloan
|
|
* method below. In any case the portable versions may be used if mr_utypes
|
|
* are defined as shorts, usually 16 bits. This would amount however to
|
|
* using the 32-bit processor in a 16 bit mode and would be very inefficient
|
|
* - up to 4 times slower. See mirdef.haf
|
|
*
|
|
* First the standard portable versions, for use when there is a double
|
|
* length type capable of holding the product of two mr_utype types.
|
|
* For example 32 and 16 bits types respectively.
|
|
* Note that if MR_NOASM is defined in mirdef.h, these routines are
|
|
* implemented in mrcore.c, and do not need to be extracted from here.
|
|
*
|
|
* This is followed by various other assembly language implementations for
|
|
* popular processors, computers and compilers.
|
|
*
|
|
|
|
|
|
**************************************************************
|
|
|
|
/* Standard C version of mrmuldv.c */
|
|
|
|
#include <stdio.h>
|
|
#include "miracl.h"
|
|
|
|
/*
 * muldiv - portable C version.
 * Builds the double-length value a*b+c, returns its quotient by m and
 * stores the matching remainder (total - q*m) in *rp.  The quotient is
 * passed through MR_LROUND (defined outside this file) before narrowing.
 */
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
    mr_small q;
    mr_large dble=(mr_large)a*b+c;      /* widen before multiplying so the product fits */
    q=(mr_small)MR_LROUND(dble/m);
    *rp=(mr_small)(dble-(mr_large)q*m); /* remainder = total - q*m */
    return q;
}
|
|
|
|
#ifdef MR_FP_ROUNDING
|
|
|
|
/*
 * imuldiv - as muldiv, but the caller supplies im, a pre-computed inverse
 * of m, so the quotient is obtained by multiplying instead of dividing.
 * Returns the quotient and stores total - q*m in *rp.
 */
mr_small imuldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_large im,mr_small *rp)
{
    mr_small q;
    mr_large dble=(mr_large)a*b+c;      /* double-length a*b+c */
    q=(mr_small)MR_LROUND(dble*im);     /* multiply by inverse of m */
    *rp=(mr_small)(dble-(mr_large)q*m); /* remainder = total - q*m */
    return q;
}
|
|
|
|
#endif
|
|
|
|
|
|
#ifndef MR_NOFULLWIDTH
|
|
|
|
/*
 * muldvm - divide the double-length value whose top half is a and whose
 * bottom half is c by m.  Returns the quotient; the remainder goes to *rp.
 */
mr_small muldvm(mr_small a,mr_small c,mr_small m,mr_small *rp)
{
    union doubleword wide;
    mr_small quot;

    /* assemble the double-length dividend from its two halves */
    wide.h[MR_TOP]=a;
    wide.h[MR_BOT]=c;

    quot=(mr_small)(wide.d/m);
    *rp=(mr_small)(wide.d-(mr_large)quot*m);
    return quot;
}
|
|
|
|
/*
 * muldvd - form the double-length value a*b+c, store its bottom half in
 * *rp and return its top half.
 */
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
{
    union doubleword wide;
    mr_small top;

    wide.d=(mr_large)a*b+c;
    top=wide.h[MR_TOP];
    *rp=wide.h[MR_BOT];
    return top;
}
|
|
|
|
/*
 * muldvd2 - form the double-length value a*b + *c + *rp; its bottom half
 * replaces *rp and its top half replaces *c.
 */
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
{
    union doubleword wide;

    wide.d=(mr_large)a*b+*c+*rp;

    *rp=wide.h[MR_BOT];   /* low half back into *rp */
    *c=wide.h[MR_TOP];    /* high half becomes the new carry in *c */
}
|
|
|
|
#endif
|
|
|
|
/* version for PowerPC (64-bit G5). Use with Blakely-Sloan C versions of muldiv(.) and muldvm(.) - see below */
|
|
|
|
/*
 * muldvd2 (64-bit PowerPC) - computes a*b + *c + *rp as a 128-bit value;
 * the low 64 bits are stored to *rp and the high 64 bits to *c.
 *
 * c and rp are used as base registers in ld/std D(RA) addressing, where
 * RA = r0 is read as the constant 0.  They therefore use the "b"
 * (address base register) constraint so the compiler never picks r0.
 */
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
{
    __asm__ __volatile__ (
        "mulld %%r16,%0,%1\n"        /* r16 = low  64 bits of a*b            */
        "mulhdu %%r17,%0,%1\n"       /* r17 = high 64 bits of a*b (unsigned) */
        "ld %%r18,0(%2)\n"           /* r18 = *c                             */
        "addc %%r16,%%r18,%%r16\n"   /* low += *c, carry recorded            */
        "addze %%r17,%%r17\n"        /* high += carry                        */
        "ld %%r19,0(%3)\n"           /* r19 = *rp                            */
        "addc %%r16,%%r19,%%r16\n"   /* low += *rp, carry recorded           */
        "addze %%r17,%%r17\n"        /* high += carry                        */
        "std %%r16,0(%3)\n"          /* *rp = low                            */
        "std %%r17,0(%2)\n"          /* *c  = high                           */
        :
        : "r"(a),"r"(b),"b"(c),"b"(rp)
        : "r16","r17","r18","r19","memory"
    );
}
|
|
|
|
/*
 * muldvd (64-bit PowerPC) - computes a*b + c as a 128-bit value, stores
 * the low 64 bits to *rp and returns the high 64 bits.
 *
 * rp is the base register of a std D(RA) store, where RA = r0 is read as
 * the constant 0, so it uses the "b" (address base register) constraint.
 */
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
{
    mr_small q;
    __asm__ __volatile__ (
        "mulld %%r16,%1,%2\n"        /* r16 = low  64 bits of a*b            */
        "mulhdu %%r17,%1,%2\n"       /* r17 = high 64 bits of a*b (unsigned) */
        "addc %%r16,%3,%%r16\n"      /* low += c, carry recorded             */
        "addze %%r17,%%r17\n"        /* high += carry                        */
        "std %%r16,0(%4)\n"          /* *rp = low                            */
        "or %0,%%r17,%%r17\n"        /* q = high (register move)             */
        : "=r"(q)
        : "r"(a),"r"(b),"r"(c),"b"(rp)
        : "r16","r17","memory"
    );
    return q;
}
|
|
|
|
|
|
****************************************************************
|
|
|
|
//
|
|
// Version of muldiv() for use with underlying type a double
|
|
// and using the FP co-processor on a Pentium, and the gcc compiler.
|
|
// In this case MR_NOFULLWIDTH is defined.
|
|
// This is much better than compiling the above, but fprem and fdiv
|
|
// are still very slow.
|
|
//
|
|
|
|
.file "mrmuldv.s"
        .text
        .globl  _muldiv
//
// _muldiv(a,b,c,m,rp) with a,b,c,m passed as doubles on the stack.
// After the push of %ebx the arguments sit at 8,16,24,32 and rp at 40.
//
_muldiv:

        pushl   %ebx                    // %ebx will carry rp

        fldl    8(%esp)                 // st = a
        fmull   16(%esp)                // st = a*b
        movl    40(%esp),%ebx           // ebx = rp
        faddl   24(%esp)                // st = a*b+c
        fldl    32(%esp)                // push m
        fld     %st(1)                  // push a second copy of a*b+c

// NOTE: If rounding control is possible, set rounding to "chop"
// and replace lines below with these
// In this case #define MR_FP_ROUNDING will be defined in mirdef.h
//
//      fdiv    %st(1),%st
//      fistpq  8(%esp)
//      fildq   8(%esp)
//      fmul    %st,%st(1)
//      fxch    %st(2)
//      fsubp   %st,%st(1)
//      fstpl   (%ebx)

        fprem                           // remainder of (a*b+c) by m
        fstl    (%ebx)                  // *rp = remainder (kept on the stack)
        fsubrp  %st,%st(2)              // intended: take the remainder off a*b+c, pop
        fdivrp  %st,%st(1)              // intended: divide by m, pop; quotient left in st

        popl    %ebx
        ret
|
|
|
|
//
|
|
// If MR_FP_ROUNDING is defined, this function will be needed for Pentium
|
|
//
|
|
.globl _imuldiv
//
// _imuldiv(a,b,c,m,im,rp): as _muldiv, but multiplies by the supplied
// inverse im (loaded as a ten-byte real from 40(%esp)); rp is at 52(%esp).
//
_imuldiv:

        pushl   %ebx                    // %ebx will carry rp

        fldl    8(%esp)                 // st = a
        fmull   16(%esp)                // st = a*b
        movl    52(%esp),%ebx           // ebx = rp
        faddl   24(%esp)                // st = a*b+c
        fldl    32(%esp)                // push m
        fld     %st(1)                  // push a second copy of a*b+c

        fldt    40(%esp)                // push im
        fmulp   %st,%st(1)              // (a*b+c)*im, pop
        fistpq  8(%esp)                 // convert to integer in a's argument slot, pop
        fildq   8(%esp)                 // reload the integer quotient
        fmul    %st,%st(1)              // st(1) = m*quotient
        fxch    %st(2)                  // bring a*b+c to the top
        fsubp   %st,%st(1)              // intended: a*b+c less m*quotient, pop
        fstpl   (%ebx)                  // *rp = remainder, pop; quotient left in st

        popl    %ebx
        ret
|
|
|
|
|
|
************************************************************************
|
|
|
|
/*
|
|
* Borland C++ 32-bit compiler (BCC32) version of the above.
|
|
* Uses inline assembly feature. Suitable for Win32 Apps
|
|
* Also compatible with Microsoft Visual C++ 32-bit compiler
|
|
* BUT change TBYTE to QWORD
|
|
*/
|
|
#include "mirdef.h"
|
|
|
|
#define ASM _asm
|
|
|
|
double muldiv(double a,double b,double c,double m,double *rp)
|
|
{
|
|
ASM fld QWORD PTR a
|
|
ASM fmul QWORD PTR b
|
|
ASM mov ebx,DWORD PTR rp
|
|
ASM fadd QWORD PTR c
|
|
ASM fld QWORD PTR m
|
|
ASM fld st(1)
|
|
|
|
#ifdef MR_FP_ROUNDING
|
|
ASM fdiv st,st(1)
|
|
ASM fistp QWORD PTR [ebx]
|
|
ASM fild QWORD PTR [ebx]
|
|
ASM fmul st(1),st
|
|
ASM fxch st(2)
|
|
ASM fsubrp st(1),st
|
|
ASM fstp QWORD PTR [ebx]
|
|
#else
|
|
ASM fprem
|
|
ASM fst QWORD PTR [ebx]
|
|
ASM fsubp st(2),st
|
|
ASM fdivp st(1),st
|
|
#endif
|
|
}
|
|
|
|
#ifdef MR_FP_ROUNDING
|
|
|
|
double imuldiv(double a,double b,double c,double m,long double im,double *rp)
|
|
{
|
|
ASM fld QWORD PTR a
|
|
ASM fmul QWORD PTR b
|
|
ASM fld QWORD PTR m
|
|
ASM fxch st(1)
|
|
ASM fadd QWORD PTR c
|
|
ASM mov ebx,DWORD PTR rp
|
|
ASM fxch st(1)
|
|
ASM fld st(1)
|
|
|
|
ASM fld TBYTE PTR im /* QWORD for Microsoft */
|
|
ASM fmulp st(1),st
|
|
ASM fistp QWORD PTR [ebx]
|
|
ASM fild QWORD PTR [ebx]
|
|
ASM fmul st(1),st
|
|
ASM fxch st(2)
|
|
ASM fsubrp st(1),st
|
|
ASM fstp QWORD PTR [ebx]
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
*********************************************************************
|
|
|
|
;
|
|
; VAX11 version for Dec C compiler
|
|
; with 32 bit int using 64-bit quadword
|
|
; for the intermediate product.
|
|
;
|
|
; Use with mirdef.h32 - but define MR_NOFULLWIDTH
|
|
; Use mirsys(...,MAXBASE) instead of mirsys(...,0) in your programs.
|
|
;
|
|
; Why ...(MIRACL-2) instead of ...(MIRACL-1) ? That's a negative
|
|
; number for division by mr_base!
|
|
;
|
|
; The problem is that the emul and ediv instructions work only
|
|
; for signed types
|
|
;
|
|
.entry muldiv,0
;
; Arguments are addressed through ap: a=4(ap) b=8(ap) c=12(ap) m=16(ap) rp=20(ap)
; NOTE(review): the 4 bytes taken from sp below are not referenced by the visible code.
;
        subl    #4,sp
        emul    4(ap),8(ap),12(ap),r0   ;quadword a*b+c, starting at r0
        ediv    16(ap),r0,r0,@20(ap)    ;quotient to r0, remainder to *rp
        ret
        .end
|
|
;
|
|
; Fullwidth base working not possible on VAX, so no muldvm() or muldvd()
|
|
;
|
|
;
|
|
|
|
|
|
**********************************************************************
|
|
|
|
|
|
#
|
|
# Version of muldiv.c for IBM RS/6000
|
|
# This processor has no unsigned multiply/divide
|
|
# so full-width base not possible, so no muldvm() or muldvd()
|
|
#
|
|
# Use with mirdef.h32 but define MR_NOFULLWIDTH definition.
|
|
# Use mirsys(...,MAXBASE) instead of mirsys(...,0) in your programs.
|
|
#
|
|
# Note this version was developed from very inadequate RS/6000
|
|
# documentation. It may not be optimal, and it may not always work
|
|
# (although it works fine for me!)
|
|
#
|
|
|
|
|
|
.file "mrmuldv.s"
        .globl .muldiv[PR]
        .csect .muldiv[PR]

# Arguments arrive in registers 3 (a), 4 (b), 5 (c), 6 (m) and 7 (rp).
# The mq register carries the low 32 bits for mul and div.

        mul   3,4,3         # high part of a*b into r3, low part into mq
        mfmq  4             # r4 = low part
        a     4,5,4         # low part += c
        aze   3,3           # high part += carry
        mtmq  4             # put the low part back in mq
        div   3,3,6         # r3 = (a*b+c)/m
        mfmq  4             # r4 = remainder
        st    4,0(7)        # *rp = remainder

# quotient is returned in register 3

        brl
|
|
|
|
************************************************************************
|
|
|
|
/* Here's another portable method which might be considered for processors
|
|
* like the VAX and RS6000. The idea is due to Peter Montgomery. */
|
|
|
|
#include "mirdef.h"
|
|
|
|
typedef unsigned mr_utype uint;
|
|
|
|
uint muldiv(a,b,c,m,rp)
|
|
uint a,b,c,m,*rp;
|
|
{
|
|
int q,r;
|
|
q=(int)(0.5+((double)a*(double)b+(double)c)/(double)m);
|
|
r=(int)(((uint)a*(uint)b+(uint)c)-(uint)m*(uint)q);
|
|
if (r < 0)
|
|
{
|
|
r+=m;
|
|
q--;
|
|
}
|
|
*rp=r;
|
|
return q;
|
|
}
|
|
|
|
**********************************************************************
|
|
|
|
|
|
;
|
|
; IBM-PC versions - small memory model only
|
|
; Easily modified for other memory models
|
|
;
|
|
; For large code models (e.g. medium)
|
|
;
|
|
; change _TEXT to mrmuldv_TEXT (in three places)
|
|
; change NEAR to FAR
|
|
; change [bp+4] to [bp+6]
|
|
; change [bp+6] to [bp+8]
|
|
; change [bp+8] to [bp+10]
|
|
; change [bp+10] to [bp+12]
|
|
; change [bp+12] to [bp+14]
|
|
;
|
|
; For large data models, see Turbo C version below for required modification
|
|
;
|
|
; Microsoft C compiler V4.0+
|
|
; Written for MS macro-assembler
|
|
;
|
|
ASSUME CS:_TEXT
_TEXT   SEGMENT BYTE PUBLIC 'CODE'

;
; _muldiv(a,b,c,m,rp) - 16-bit words, near call, near data pointer.
; Returns the quotient of (a*b+c) by m in ax; remainder is stored at [rp].
;
        PUBLIC  _muldiv
_muldiv PROC    NEAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     ax,[bp+4]               ;ax = a
        mul     WORD PTR [bp+6]         ;dx:ax = a*b
        add     ax,[bp+8]               ;low word += c
        adc     dx,0                    ;carry into high word
        div     WORD PTR [bp+10]        ;ax = quotient, dx = remainder
        mov     bx,[bp+12]              ;bx = rp
        mov     [bx],dx                 ;*rp = remainder

        pop     bp                      ;tear down frame
        ret                             ;quotient in ax

_muldiv ENDP
|
|
|
|
PUBLIC  _muldvm
;
; _muldvm(a,c,m,rp) - divides the double word dx:ax = a:c by m.
; Quotient returned in ax; remainder stored at [rp].
;
_muldvm PROC    NEAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     dx,[bp+4]               ;high word = a
        mov     ax,[bp+6]               ;low word  = c
        div     WORD PTR [bp+8]         ;ax = quotient, dx = remainder
        mov     bx,[bp+10]              ;bx = rp
        mov     [bx],dx                 ;*rp = remainder

        pop     bp                      ;tear down frame
        ret                             ;quotient in ax

_muldvm ENDP
|
|
|
|
PUBLIC  _muldvd
;
; _muldvd(a,b,c,rp) - forms dx:ax = a*b+c.
; Low word is stored at [rp]; high word is returned in ax.
;
_muldvd PROC    NEAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     ax,[bp+4]               ;ax = a
        mul     WORD PTR [bp+6]         ;dx:ax = a*b
        add     ax,[bp+8]               ;low word += c
        adc     dx,0                    ;carry into high word
        mov     bx,[bp+10]              ;bx = rp
        mov     [bx],ax                 ;*rp = low word
        mov     ax,dx                   ;return value = high word
        pop     bp                      ;tear down frame
        ret

_muldvd ENDP
|
|
|
|
PUBLIC  _muldvd2
;
; _muldvd2(a,b,c,rp) - forms dx:ax = a*b + *c + *rp.
; Low word replaces *rp, high word replaces *c.  si is saved and restored.
;
_muldvd2 PROC   NEAR
        push    bp                      ;set up stack frame
        mov     bp,sp
        push    si

        mov     ax,[bp+4]               ;ax = a
        mul     WORD PTR [bp+6]         ;dx:ax = a*b
        mov     bx,[bp+8]               ;bx = c (pointer)
        add     ax,[bx]                 ;low word += *c
        adc     dx,0                    ;carry into high word

        mov     si,[bp+10]              ;si = rp
        add     ax,[si]                 ;low word += *rp
        adc     dx,0                    ;carry into high word

        mov     [si],ax                 ;*rp = low word
        mov     [bx],dx                 ;*c  = high word

        pop     si
        pop     bp                      ;tear down frame
        ret

_muldvd2 ENDP

_TEXT   ENDS
END
|
|
|
|
|
|
***********************************************************************
|
|
|
|
|
|
/*
|
|
* Turbo C compiler V1.5+, Turbo/Borland C++. Microsoft C/C++
|
|
* Uses inline assembly feature
|
|
* Generates code identical to above version, and
|
|
* can be used instead.
|
|
*/
|
|
|
|
#define ASM asm
|
|
|
|
/* or perhaps #define ASM _asm */
|
|
|
|
unsigned int muldiv(a,b,c,m,rp)
|
|
unsigned int a,b,c,m,*rp;
|
|
{
|
|
ASM mov ax,a ;/* get a */
|
|
ASM mul WORD PTR b ;/* multiply by b */
|
|
ASM add ax,c ;/* add c to low word */
|
|
ASM adc dx,0h ;/* add carry to high word */
|
|
ASM div WORD PTR m ;/* divide by m */
|
|
ASM mov bx,rp ;/* get address for remainder */
|
|
ASM mov [bx],dx ;/* store remainder */
|
|
}
|
|
/* Replace last two ASM lines when using large data memory models */
|
|
/* ASM les bx, DWORD PTR rp ; get address for remainder */
|
|
/* ASM mov WORD PTR es:[bx],dx ; store remainder */
|
|
|
|
unsigned int muldvm(a,c,m,rp)
|
|
unsigned int a,c,m,*rp;
|
|
{
|
|
ASM mov dx,a ;/* get a */
|
|
ASM mov ax,c ;/* add in c to low word */
|
|
ASM div WORD PTR m ;/* divide by m */
|
|
ASM mov bx,rp ;/* get address for remainder */
|
|
ASM mov [bx],dx ;/* store remainder */
|
|
}
|
|
/* Replace last two ASM lines when using large data memory models */
|
|
/* ASM les bx, DWORD PTR rp ; get address for remainder */
|
|
/* ASM mov WORD PTR es:[bx],dx ; store remainder */
|
|
|
|
unsigned int muldvd(a,b,c,rp)
|
|
unsigned int a,b,c,*rp;
|
|
{
|
|
ASM mov ax,a ;/* get a */
|
|
ASM mul WORD PTR b ;/* multiply by b */
|
|
ASM add ax,c ;/* add c to low word */
|
|
ASM adc dx,0h ;/* add carry to high word */
|
|
ASM mov bx,rp ;/* get address for remainder */
|
|
ASM mov [bx],ax ;/* store remainder */
|
|
ASM mov ax,dx
|
|
}
|
|
/* Replace second and third last lines if using large data memory models */
|
|
/* ASM les bx, DWORD PTR rp ; get address for remainder */
|
|
/* ASM mov WORD PTR es:[bx],ax ; store remainder */
|
|
|
|
void muldvd2(a,b,c,rp)
|
|
unsigned int a,b,*c,*rp;
|
|
{
|
|
ASM mov ax,a ;/* get a */
|
|
ASM mul WORD PTR b ;/* multiply by b */
|
|
ASM mov bx,c
|
|
ASM add ax,[bx]
|
|
ASM adc dx,0h ;/* add carry to high word */
|
|
ASM mov si,rp
|
|
ASM add ax,[si]
|
|
ASM adc dx,0h
|
|
ASM mov [si],ax
|
|
ASM mov [bx],dx
|
|
}
|
|
|
|
/* for large memory model ....

  ASM mov ax,a                         ; get a
  ASM mul WORD PTR b                   ; multiply by b
  ASM les bx, DWORD PTR c
  ASM add ax, WORD PTR es:[bx]
  ASM adc dx,0h                        ; add carry to high word
  ASM les si,DWORD PTR rp
  ASM add ax,WORD PTR es:[si]
  ASM adc dx,0h
  ASM mov WORD PTR es:[si],ax
  ASM les bx,DWORD PTR c
  ASM mov WORD PTR es:[bx],dx

*/
|
|
|
|
|
|
|
|
**********************************************************************
|
|
|
|
|
|
;
|
|
; IBM-PC-8087 for Microsoft C compiler V4.0+
|
|
; with 'mr_utype' defined as 'long' (32 bit). See mirdef.hpc
|
|
; This allows IBM-PC XT to look a bit like a 32-bit computer
|
|
; (which it isn't). To make use of this option:
|
|
;
|
|
; (1) Must have 8087 Maths Co-processor (for speed and to hold 64-bit
|
|
; intermediate product).
|
|
;
|
|
; (2) Must use 'ANSI' enhanced type C compiler, e.g. Microsoft V3.0+
|
|
; and must use header 'miracl.h' which declares function
|
|
; parameter types.
|
|
;
|
|
; Note: some compilation warnings may be generated - ignore them.
|
|
;
|
|
; Note: This is NOT, in most cases, faster, but it does allow
|
|
; very high precision calculations, e.g. 1000!
|
|
;
|
|
; Note: No versions of muldvm(), muldvd() or muldvd2() yet written for
|
|
; this method.
|
|
;
|
|
ASSUME CS:_TEXT
_TEXT   SEGMENT BYTE PUBLIC 'CODE'

;
; _muldiv(a,b,c,m,rp) with 32-bit arguments, computed on the 8087.
; With si and bp pushed, a is at [bp+6], b at [bp+10], c at [bp+14],
; m at [bp+18] and rp at [bp+22].  The argument slots of a and b are
; reused as scratch for the quotient and remainder.
; Quotient is returned in dx:ax; remainder is stored at [rp].
;
        PUBLIC  _muldiv
_muldiv PROC    NEAR
        push    si                      ;si is used for rp below
        push    bp
        mov     bp,sp

        finit                           ;initialise 8087
        fild    DWORD PTR [bp+6]        ;st = a
        fimul   DWORD PTR [bp+0ah]      ;st = a*b
        fiadd   DWORD PTR [bp+0eh]      ;st = a*b+c
        fild    DWORD PTR [bp+12h]      ;push m
        fld     st(1)                   ;push a second copy of a*b+c
        fprem                           ;st = remainder
        fist    DWORD PTR [bp+0ah]      ;park remainder in b's slot
        fsubr   st,st(2)                ;take remainder off the total
        fdiv    st,st(1)                ;divide by m
        fist    DWORD PTR [bp+6]        ;park quotient in a's slot
        wait                            ;let the 8087 finish its stores

        mov     si,[bp+22]              ;si = rp
        mov     ax,[bp+10]
        mov     dx,[bp+12]              ;dx:ax = remainder
        mov     [si],ax
        mov     [si+2],dx               ;*rp = remainder
        mov     ax,[bp+6]
        mov     dx,[bp+8]               ;dx:ax = quotient

        pop     bp
        pop     si
        ret

_muldiv ENDP

_TEXT   ENDS
END
|
|
|
|
|
|
|
|
**************************************************************************
|
|
|
|
|
|
;
|
|
; Intel-80386 pseudo-32 bit version - for Microsoft C V5.0+
|
|
; Written for MS macro-assembler V5.0+ by Andrej Sauer
|
|
; with 'mr_utype' defined as 'long' (32 bit). See mirdef.hpc
|
|
; Same comments apply as above (except for 8087 requirement)
|
|
; Note that this version will also work with the latest Zortech and
|
|
; Borland 16-bit compilers, specifically Borland C++ V3.1+
|
|
;
|
|
; For large code models (e.g. medium)
|
|
;
|
|
; change _TEXT to mrmuldv_TEXT (in three places)
|
|
; change NEAR to FAR
|
|
; change [bp+4] to [bp+6]
|
|
; change [bp+8] to [bp+10]
|
|
; change [bp+12] to [bp+14]
|
|
; change [bp+16] to [bp+18]
|
|
; change [bp+20] to [bp+22]
|
|
; etc
|
|
;
|
|
.386
ASSUME CS:_TEXT
_TEXT   SEGMENT USE16 PUBLIC 'CODE'

;
; _muldiv(a,b,c,m,rp) - 32-bit values in 16-bit code, near data pointer.
; Quotient of (a*b+c) by m is returned in dx:ax; remainder stored at [rp].
;
        PUBLIC  _muldiv
_muldiv PROC    NEAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     eax,[bp+4]              ;eax = a
        mul     DWORD PTR [bp+8]        ;edx:eax = a*b
        add     eax,DWORD PTR [bp+12]   ;low dword += c
        adc     edx,0                   ;carry into high dword
        div     DWORD PTR [bp+16]       ;eax = quotient, edx = remainder
        mov     bx,WORD PTR [bp+20]     ;bx = rp
        mov     [bx],edx                ;*rp = remainder
        shld    edx,eax,16              ;dx = upper 16 bits of the quotient

        pop     bp                      ;tear down frame
        ret                             ;quotient in dx:ax

_muldiv ENDP
|
|
|
|
PUBLIC  _muldvm
;
; _muldvm(a,c,m,rp) - divides edx:eax = a:c by m.
; Quotient returned in dx:ax; remainder stored at [rp].
;
_muldvm PROC    NEAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     edx,[bp+4]              ;high dword = a
        mov     eax,[bp+8]              ;low dword  = c
        div     DWORD PTR [bp+12]       ;eax = quotient, edx = remainder
        mov     bx,WORD PTR [bp+16]     ;bx = rp
        mov     [bx],edx                ;*rp = remainder
        shld    edx,eax,16              ;dx = upper 16 bits of the quotient
        pop     bp                      ;tear down frame
        ret                             ;quotient in dx:ax

_muldvm ENDP
|
|
|
|
|
|
PUBLIC  _muldvd
;
; _muldvd(a,b,c,rp) - forms edx:eax = a*b+c.
; Low dword is stored at [rp]; high dword is returned in dx:ax.
;
_muldvd PROC    NEAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     eax,[bp+4]              ;eax = a
        mul     DWORD PTR [bp+8]        ;edx:eax = a*b
        add     eax,DWORD PTR [bp+12]   ;low dword += c
        adc     edx,0                   ;carry into high dword
        mov     bx,WORD PTR [bp+16]     ;bx = rp
        mov     [bx],eax                ;*rp = low dword
        mov     eax,edx                 ;result = high dword
        shld    edx,eax,16              ;dx = upper 16 bits of the result

        pop     bp                      ;tear down frame
        ret                             ;result in dx:ax

_muldvd ENDP
|
|
|
|
|
|
PUBLIC  _muldvd2
;
; _muldvd2(a,b,c,rp) - forms edx:eax = a*b + *c + *rp.
; Low dword replaces *rp, high dword replaces *c.
;
; c and rp are NEAR (16-bit) data pointers here, matching _muldiv,
; _muldvm and _muldvd in this segment: c is at [bp+12], rp at [bp+14].
; Far pointers (les / es:) belong to the large memory model version.
; For FAR code, add 2 to every [bp+n] offset.
;
_muldvd2 PROC   NEAR
        push    bp                      ;set up stack frame
        mov     bp,sp
        push    si                      ;si is used for rp below

        mov     eax,[bp+4]              ;eax = a
        mul     DWORD PTR [bp+8]        ;edx:eax = a*b
        mov     bx,WORD PTR [bp+12]     ;bx = c (near pointer)
        add     eax,DWORD PTR [bx]      ;low dword += *c
        adc     edx,0h                  ;carry into high dword
        mov     si,WORD PTR [bp+14]     ;si = rp (near pointer)
        add     eax,DWORD PTR [si]      ;low dword += *rp
        adc     edx,0h                  ;carry into high dword

        mov     DWORD PTR [si],eax      ;*rp = low dword
        mov     DWORD PTR [bx],edx      ;*c  = high dword
        pop     si
        pop     bp                      ;tear down frame
        ret

_muldvd2 endP

_TEXT   ENDS
END
|
|
|
|
|
|
***********************************************************************
|
|
|
|
|
|
;
|
|
; Large Memory model version of the above. Useful
|
|
; for creating 16-bit DLL on 386+. Microsoft/Borland compatible
|
|
;
|
|
.386
ASSUME CS:mrmuldv_TEXT
mrmuldv_TEXT SEGMENT USE16 PUBLIC 'CODE'

;
; _muldiv(a,b,c,m,rp) - large model: FAR call, rp is a far pointer.
; Quotient of (a*b+c) by m is returned in dx:ax; remainder stored at *rp.
;
        PUBLIC  _muldiv
_muldiv PROC    FAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     eax,[bp+6]              ;eax = a
        mul     DWORD PTR [bp+10]       ;edx:eax = a*b
        add     eax,DWORD PTR [bp+14]   ;low dword += c
        adc     edx,0                   ;carry into high dword
        div     DWORD PTR [bp+18]       ;eax = quotient, edx = remainder
        les     bx,DWORD PTR [bp+22]    ;es:bx = rp
        mov     DWORD PTR es:[bx],edx   ;*rp = remainder
        shld    edx,eax,16              ;dx = upper 16 bits of the quotient
        pop     bp                      ;tear down frame
        ret                             ;quotient in dx:ax

_muldiv ENDP
|
|
|
|
PUBLIC  _muldvm
;
; _muldvm(a,c,m,rp) - large model: divides edx:eax = a:c by m.
; Quotient returned in dx:ax; remainder stored at *rp (far pointer).
;
_muldvm PROC    FAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     edx,[bp+6]              ;high dword = a
        mov     eax,[bp+10]             ;low dword  = c
        div     DWORD PTR [bp+14]       ;eax = quotient, edx = remainder
        les     bx,DWORD PTR [bp+18]    ;es:bx = rp
        mov     DWORD PTR es:[bx],edx   ;*rp = remainder
        shld    edx,eax,16              ;dx = upper 16 bits of the quotient
        pop     bp                      ;tear down frame
        ret                             ;quotient in dx:ax

_muldvm ENDP
|
|
|
|
|
|
PUBLIC  _muldvd
;
; _muldvd(a,b,c,rp) - large model: forms edx:eax = a*b+c.
; Low dword stored at *rp (far pointer); high dword returned in dx:ax.
;
_muldvd PROC    FAR
        push    bp                      ;set up stack frame
        mov     bp,sp

        mov     eax,[bp+6]              ;eax = a
        mul     DWORD PTR [bp+10]       ;edx:eax = a*b
        add     eax,DWORD PTR [bp+14]   ;low dword += c
        adc     edx,0                   ;carry into high dword
        les     bx,DWORD PTR [bp+18]    ;es:bx = rp
        mov     DWORD PTR es:[bx],eax   ;*rp = low dword
        mov     eax,edx                 ;result = high dword
        shld    edx,eax,16              ;dx = upper 16 bits of the result

        pop     bp                      ;tear down frame
        ret                             ;result in dx:ax

_muldvd ENDP
|
|
|
|
PUBLIC  _muldvd2
;
; _muldvd2(a,b,c,rp) - large model: forms edx:eax = a*b + *c + *rp.
; Low dword replaces *rp, high dword replaces *c (both far pointers).
;
_muldvd2 PROC   FAR
        push    bp                      ;set up stack frame
        mov     bp,sp
        push    si                      ;si is used for rp below

        mov     eax,[bp+6]              ;eax = a
        mul     DWORD PTR [bp+10]       ;edx:eax = a*b
        les     bx,DWORD PTR [bp+14]    ;es:bx = c
        add     eax,DWORD PTR es:[bx]   ;low dword += *c
        adc     edx,0                   ;carry into high dword

        les     si,DWORD PTR [bp+18]    ;es:si = rp
        add     eax,DWORD PTR es:[si]   ;low dword += *rp
        adc     edx,0                   ;carry into high dword

        mov     DWORD PTR es:[si],eax   ;*rp = low dword
        les     bx,DWORD PTR [bp+14]    ;reload es:bx = c (es was changed for rp)
        mov     DWORD PTR es:[bx],edx   ;*c = high dword
        pop     si
        pop     bp                      ;tear down frame
        ret

_muldvd2 ENDP

mrmuldv_TEXT ENDS
END
|
|
|
|
|
|
****************************************************************************
|
|
|
|
|
|
/*
|
|
Borland in-line pseudo-32 bit version of the above
|
|
Large memory model version.
|
|
Use with mirdef.hpc
|
|
|
|
Unfortunately this cannot be used with Microsoft C,
|
|
as its 16 bit compiler will not allow inline 386 opcodes
|
|
*/
|
|
|
|
#define ASM _asm
|
|
|
|
long muldiv(a,b,c,m,rp)
|
|
long a,b,c,m,*rp;
|
|
{
|
|
ASM mov eax,DWORD PTR a
|
|
ASM mul DWORD PTR b
|
|
ASM add eax,DWORD PTR c
|
|
ASM adc edx,0h
|
|
ASM div DWORD PTR m
|
|
ASM les bx,DWORD PTR rp
|
|
ASM mov DWORD PTR es:[bx],edx
|
|
ASM shld edx,eax,16
|
|
}
|
|
|
|
long muldvm(a,c,m,rp)
|
|
long a,c,m,*rp;
|
|
{
|
|
ASM mov edx,DWORD PTR a
|
|
ASM mov eax,DWORD PTR c
|
|
ASM div DWORD PTR m
|
|
ASM les bx,DWORD PTR rp
|
|
ASM mov DWORD PTR es:[bx],edx
|
|
ASM shld edx,eax,16
|
|
}
|
|
|
|
long muldvd(a,b,c,rp)
|
|
long a,b,c,*rp;
|
|
{
|
|
ASM mov eax,DWORD PTR a
|
|
ASM mul DWORD PTR b
|
|
ASM add eax,DWORD PTR c
|
|
ASM adc edx,0h
|
|
ASM les bx,DWORD PTR rp
|
|
ASM mov DWORD PTR es:[bx],eax
|
|
ASM mov eax,edx
|
|
ASM shld edx,eax,16
|
|
}
|
|
|
|
void muldvd2(a,b,c,rp)
|
|
long a,b,*c,*rp;
|
|
{
|
|
ASM mov eax,DWORD PTR a
|
|
ASM mul DWORD PTR b
|
|
ASM les bx,DWORD PTR c
|
|
ASM add eax,DWORD PTR es:[bx]
|
|
ASM adc edx,0h
|
|
ASM les si,DWORD PTR rp
|
|
ASM add eax,DWORD PTR es:[si]
|
|
ASM adc edx,0h
|
|
ASM mov DWORD PTR es:[si],eax
|
|
ASM les bx,DWORD PTR c
|
|
ASM mov DWORD PTR es:[bx],edx
|
|
}
|
|
|
|
|
|
|
|
***********************************************************************
|
|
|
|
|
|
/*
|
|
* Borland C++ 32-bit compiler (BCC32). Use with mirdef.h32
|
|
* Uses inline assembly feature. Suitable for Win32 Apps
|
|
* Also compatible with Microsoft Visual C++ 32-bit compiler
|
|
*/
|
|
|
|
#define ASM _asm
|
|
|
|
int muldiv(a,b,c,m,rp)
|
|
int a,b,c,m,*rp;
|
|
{
|
|
ASM mov eax,DWORD PTR a
|
|
ASM mul DWORD PTR b
|
|
ASM add eax,DWORD PTR c
|
|
ASM adc edx,0h
|
|
ASM div DWORD PTR m
|
|
ASM mov ebx,DWORD PTR rp
|
|
ASM mov [ebx],edx
|
|
}
|
|
|
|
int muldvm(a,c,m,rp)
|
|
int a,c,m,*rp;
|
|
{
|
|
ASM mov edx,DWORD PTR a
|
|
ASM mov eax,DWORD PTR c
|
|
ASM div DWORD PTR m
|
|
ASM mov ebx,DWORD PTR rp
|
|
ASM mov [ebx],edx
|
|
}
|
|
|
|
int muldvd(a,b,c,rp)
|
|
int a,b,c,*rp;
|
|
{
|
|
ASM mov eax,DWORD PTR a
|
|
ASM mul DWORD PTR b
|
|
ASM add eax,DWORD PTR c
|
|
ASM adc edx,0h
|
|
ASM mov ebx,DWORD PTR rp
|
|
ASM mov [ebx],eax
|
|
ASM mov eax,edx
|
|
}
|
|
|
|
|
|
void muldvd2(a,b,c,rp)
|
|
int a,b,*c,*rp;
|
|
{
|
|
ASM mov eax,DWORD PTR a
|
|
ASM mul DWORD PTR b
|
|
ASM mov ebx,DWORD PTR c
|
|
ASM add eax,[ebx]
|
|
ASM adc edx,0h
|
|
ASM mov esi,DWORD PTR rp
|
|
ASM add eax,[esi]
|
|
ASM adc edx,0h
|
|
ASM mov [esi],eax
|
|
ASM mov [ebx],edx
|
|
}
|
|
|
|
|
|
*************************************************************************
|
|
|
|
|
|
|
|
/
|
|
/ Version for 32-bit Sun 386i Workstation
|
|
/
|
|
.file "mrmuldv.c"
        .version "sun386-1.0"
        .text
        .globl  muldiv
/
/ muldiv(a,b,c,m,rp): quotient of (a*b+c) by m returned in %eax,
/ remainder stored at *rp.  The pointer is held in %ecx so that %ebx,
/ which this routine does not save, is left untouched.
/
muldiv:
        pushl   %ebp
        movl    %esp,%ebp

        movl    8(%ebp),%eax    /get a
        mull    12(%ebp)        /multiply by b
        addl    16(%ebp),%eax   /add c to low word
        adcl    $0,%edx         /add carry to high word

        divl    20(%ebp)        /divide by m
        movl    24(%ebp),%ecx   /get address for remainder
        movl    %edx,(%ecx)     /store remainder

        popl    %ebp
        ret
|
|
|
|
.text
        .globl  muldvm
/
/ muldvm(a,c,m,rp): divides %edx:%eax = a:c by m.  Quotient returned in
/ %eax, remainder stored at *rp.  The pointer is held in %ecx so that
/ %ebx, which this routine does not save, is left untouched.
/
muldvm:
        pushl   %ebp
        movl    %esp,%ebp

        movl    8(%ebp),%edx    /get a
        movl    12(%ebp),%eax   /add in c
        divl    16(%ebp)        /divide by m

        movl    20(%ebp),%ecx   /get address for remainder
        movl    %edx,(%ecx)     /store remainder

        popl    %ebp
        ret
|
|
|
|
.text
        .globl  muldvd
/
/ muldvd(a,b,c,rp): forms %edx:%eax = a*b+c.  Low word stored at *rp,
/ high word returned in %eax.  The pointer is held in %ecx so that
/ %ebx, which this routine does not save, is left untouched.
/
muldvd:
        pushl   %ebp
        movl    %esp,%ebp

        movl    8(%ebp),%eax    /get a
        mull    12(%ebp)        /multiply by b
        addl    16(%ebp),%eax   /add c to low word
        adcl    $0,%edx         /add carry to high word
        movl    20(%ebp),%ecx   /get address for remainder
        movl    %eax,(%ecx)     /store remainder
        movl    %edx,%eax       /get quotient

        popl    %ebp
        ret
|
|
|
|
|
|
**************************************************************************
|
|
|
|
|
|
/
|
|
/ DJGPP GNU C version for DOS
|
|
/ M. Scott 22/3/98
|
|
/
|
|
|
|
|
|
.file "mrmuldv.c"
        .text
        .globl  _muldiv
/* _muldiv(a,b,c,m,rp): quotient of (a*b+c) by m in %eax, remainder at *rp */
_muldiv:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    /* %ebx carries rp; restored before ret */

        movl    8(%ebp),%eax            /* eax = a */
        mull    12(%ebp)                /* edx:eax = a*b */
        addl    16(%ebp),%eax           /* low word += c */
        adcl    $0,%edx                 /* carry into high word */

        divl    20(%ebp)                /* eax = quotient, edx = remainder */
        movl    24(%ebp),%ebx           /* ebx = rp */
        movl    %edx,(%ebx)             /* *rp = remainder */

        popl    %ebx
        popl    %ebp
        ret
|
|
|
|
.globl _muldvm
/* _muldvm(a,c,m,rp): divides edx:eax = a:c by m; quotient in %eax, remainder at *rp */
_muldvm:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    /* %ebx carries rp; restored before ret */

        movl    8(%ebp),%edx            /* high word = a */
        movl    12(%ebp),%eax           /* low word  = c */
        divl    16(%ebp)                /* eax = quotient, edx = remainder */

        movl    20(%ebp),%ebx           /* ebx = rp */
        movl    %edx,(%ebx)             /* *rp = remainder */

        popl    %ebx
        popl    %ebp
        ret
|
|
|
|
.globl _muldvd
/* _muldvd(a,b,c,rp): forms edx:eax = a*b+c; low word at *rp, high word in %eax */
_muldvd:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    /* %ebx carries rp; restored before ret */

        movl    8(%ebp),%eax            /* eax = a */
        mull    12(%ebp)                /* edx:eax = a*b */
        addl    16(%ebp),%eax           /* low word += c */
        adcl    $0,%edx                 /* carry into high word */
        movl    20(%ebp),%ebx           /* ebx = rp */
        movl    %eax,(%ebx)             /* *rp = low word */
        movl    %edx,%eax               /* result = high word */

        popl    %ebx
        popl    %ebp
        ret
|
|
|
|
.globl _muldvd2
/* _muldvd2(a,b,c,rp): forms edx:eax = a*b + *c + *rp; low word to *rp, high word to *c */
_muldvd2:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    /* %ebx carries c  */
        pushl   %esi                    /* %esi carries rp */

        movl    8(%ebp),%eax            /* eax = a */
        mull    12(%ebp)                /* edx:eax = a*b */
        movl    16(%ebp),%ebx           /* ebx = c (pointer) */
        addl    (%ebx),%eax             /* low word += *c */
        adcl    $0,%edx                 /* carry into high word */
        movl    20(%ebp),%esi           /* esi = rp */
        addl    (%esi),%eax             /* low word += *rp */
        adcl    $0,%edx                 /* carry into high word */

        movl    %eax,(%esi)             /* *rp = low word */
        movl    %edx,(%ebx)             /* *c  = high word */

        popl    %esi
        popl    %ebx
        popl    %ebp
        ret
|
|
|
|
|
|
*************************************************************************
|
|
|
|
/
|
|
/ GNU C for Linux (and other 386 based Linux/Unix??)
|
|
/
|
|
/
|
|
|
|
.file "mrmuldv.s"
        .text
        .globl  muldiv
/* muldiv(a,b,c,m,rp): quotient of (a*b+c) by m in %eax, remainder at *rp */
muldiv:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    /* %ebx carries rp; restored before ret */

        movl    8(%ebp),%eax            /* eax = a */
        mull    12(%ebp)                /* edx:eax = a*b */
        addl    16(%ebp),%eax           /* low word += c */
        adcl    $0,%edx                 /* carry into high word */

        divl    20(%ebp)                /* eax = quotient, edx = remainder */
        movl    24(%ebp),%ebx           /* ebx = rp */
        movl    %edx,(%ebx)             /* *rp = remainder */

        popl    %ebx
        popl    %ebp
        ret
|
|
|
|
.globl muldvm
/* muldvm(a,c,m,rp): divides edx:eax = a:c by m; quotient in %eax, remainder at *rp */
muldvm:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    /* %ebx carries rp; restored before ret */

        movl    8(%ebp),%edx            /* high word = a */
        movl    12(%ebp),%eax           /* low word  = c */
        divl    16(%ebp)                /* eax = quotient, edx = remainder */

        movl    20(%ebp),%ebx           /* ebx = rp */
        movl    %edx,(%ebx)             /* *rp = remainder */

        popl    %ebx
        popl    %ebp
        ret
|
|
|
|
.globl muldvd
/* muldvd(a,b,c,rp): forms edx:eax = a*b+c; low word at *rp, high word in %eax */
muldvd:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    /* %ebx carries rp; restored before ret */

        movl    8(%ebp),%eax            /* eax = a */
        mull    12(%ebp)                /* edx:eax = a*b */
        addl    16(%ebp),%eax           /* low word += c */
        adcl    $0,%edx                 /* carry into high word */
        movl    20(%ebp),%ebx           /* ebx = rp */
        movl    %eax,(%ebx)             /* *rp = low word */
        movl    %edx,%eax               /* result = high word */

        popl    %ebx
        popl    %ebp
        ret
|
|
|
|
.globl muldvd2
/* muldvd2(a,b,c,rp): forms edx:eax = a*b + *c + *rp; low word to *rp, high word to *c */
muldvd2:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    /* %ebx carries c  */
        pushl   %esi                    /* %esi carries rp */

        movl    8(%ebp),%eax            /* eax = a */
        mull    12(%ebp)                /* edx:eax = a*b */
        movl    16(%ebp),%ebx           /* ebx = c (pointer) */
        addl    (%ebx),%eax             /* low word += *c */
        adcl    $0,%edx                 /* carry into high word */
        movl    20(%ebp),%esi           /* esi = rp */
        addl    (%esi),%eax             /* low word += *rp */
        adcl    $0,%edx                 /* carry into high word */

        movl    %eax,(%esi)             /* *rp = low word */
        movl    %edx,(%ebx)             /* *c  = high word */

        popl    %esi
        popl    %ebx
        popl    %ebp
        ret
|
|
|
|
|
|
*************************************************************************
|
|
|
|
|
|
/* GCC inline assembly version for Linux/DJGPP */
|
|
|
|
#include "miracl.h"
|
|
|
|
|
|
/*
 * muldiv (GCC inline assembly, 32-bit x86).
 * Returns the quotient of (a*b+c) by m and stores the remainder in *rp.
 * mull/divl write %edx and the flags, so both appear in the clobber list
 * alongside the explicitly used %eax and %ebx.
 */
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
    mr_small q;
    __asm__ __volatile__ (
        "movl %1,%%eax\n"          /* eax = a                          */
        "mull %2\n"                /* edx:eax = a*b                    */
        "addl %3,%%eax\n"          /* low word += c                    */
        "adcl $0,%%edx\n"          /* carry into high word             */
        "divl %4\n"                /* eax = quotient, edx = remainder  */
        "movl %5,%%ebx\n"          /* ebx = rp                         */
        "movl %%edx,(%%ebx)\n"     /* *rp = remainder                  */
        "movl %%eax,%0\n"          /* q = quotient                     */
        : "=m"(q)
        : "m"(a),"m"(b),"m"(c),"m"(m),"m"(rp)
        : "eax","ebx","edx","cc","memory"
    );
    return q;
}
|
|
|
|
/*
 * muldvm (GCC inline assembly, 32-bit x86).
 * Divides the double word a:c by m; returns the quotient and stores the
 * remainder in *rp.  %edx is loaded and then overwritten by divl, so it
 * is listed as clobbered together with the flags.
 */
mr_small muldvm(mr_small a,mr_small c,mr_small m,mr_small *rp)
{
    mr_small q;
    __asm__ __volatile__ (
        "movl %1,%%edx\n"          /* high word = a                    */
        "movl %2,%%eax\n"          /* low word  = c                    */
        "divl %3\n"                /* eax = quotient, edx = remainder  */
        "movl %4,%%ebx\n"          /* ebx = rp                         */
        "movl %%edx,(%%ebx)\n"     /* *rp = remainder                  */
        "movl %%eax,%0\n"          /* q = quotient                     */
        : "=m"(q)
        : "m"(a),"m"(c),"m"(m),"m"(rp)
        : "eax","ebx","edx","cc","memory"
    );
    return q;
}
|
|
|
|
/*
 * muldvd (GCC inline assembly, 32-bit x86).
 * Forms the double word a*b+c, stores its low word in *rp and returns its
 * high word.  mull writes %edx and the flags, so both are listed as
 * clobbered.
 */
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
{
    mr_small q;
    __asm__ __volatile__ (
        "movl %1,%%eax\n"          /* eax = a                          */
        "mull %2\n"                /* edx:eax = a*b                    */
        "addl %3,%%eax\n"          /* low word += c                    */
        "adcl $0,%%edx\n"          /* carry into high word             */
        "movl %4,%%ebx\n"          /* ebx = rp                         */
        "movl %%eax,(%%ebx)\n"     /* *rp = low word                   */
        "movl %%edx,%0\n"          /* q = high word                    */
        : "=m"(q)
        : "m"(a),"m"(b),"m"(c),"m"(rp)
        : "eax","ebx","edx","cc","memory"
    );
    return q;
}
|
|
|
|
/*
 * muldvd2 - computes a*b + *c + *rp as a double-word value; the low word is
 * written back to *rp and the high word to *c.
 *
 * edx receives the high word of the product from mull, so it is listed as
 * clobbered alongside eax, ebx and esi.
 */
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
{
    __asm__ __volatile__ (
    "movl %0,%%eax\n"           /* eax = a                       */
    "mull %1\n"                 /* edx:eax = a*b                 */
    "movl %2,%%ebx\n"           /* ebx = c                       */
    "addl (%%ebx),%%eax\n"      /* low word += *c                */
    "adcl $0,%%edx\n"           /* carry into the high word      */
    "movl %3,%%esi\n"           /* esi = rp                      */
    "addl (%%esi),%%eax\n"      /* low word += *rp               */
    "adcl $0,%%edx\n"           /* carry into the high word      */
    "movl %%eax,(%%esi)\n"      /* *rp = low word                */
    "movl %%edx,(%%ebx)\n"      /* *c = high word                */
    :
    : "m"(a),"m"(b),"m"(c),"m"(rp)
    : "eax","ebx","edx","esi","cc","memory"
    );
}
|
|
|
|
***********************************************************
|
|
|
|
;
|
|
; Watcom C/386 32-bit compiler V7.0. Use with mirdef.h32
|
|
; Most parameters passed in registers
|
|
; Written for Phar Lap 386ASM macro-assembler
|
|
;
|
|
; V4.0 NOTE! Inline assembly versions of these routines,
|
|
; are also available. See miracl.h for details
|
|
;
|
|
|
|
.386
|
|
ASSUME CS:_TEXT
|
|
_TEXT SEGMENT BYTE PUBLIC 'CODE'
|
|
|
|
PUBLIC muldiv_
|
|
muldiv_ PROC NEAR
|
|
|
|
mul edx ;multiply a*b
|
|
add eax,ebx ;add in c
|
|
adc edx,0 ;carry
|
|
div ecx ;divide by m
|
|
mov ebx,[esp+4]
|
|
mov [ebx],edx ;remainder
|
|
ret 4 ;quotient in eax
|
|
|
|
muldiv_ endP
|
|
|
|
PUBLIC muldvm_
|
|
muldvm_ PROC NEAR
|
|
|
|
xchg eax,edx ;a*base+c
|
|
div ebx ;divide by m
|
|
mov [ecx],edx ;store remainder
|
|
ret ;quotient in eax
|
|
|
|
muldvm_ endP
|
|
|
|
PUBLIC muldvd_
|
|
muldvd_ PROC NEAR
|
|
|
|
mul edx ;multiply a*b
|
|
add eax,ebx ;add in c
|
|
adc edx,0
|
|
mov [ecx],eax ;store remainder
|
|
mov eax,edx ;get quotient
|
|
ret ;quotient in eax
|
|
|
|
muldvd_ endP
|
|
|
|
_TEXT ENDS
|
|
END
|
|
|
|
|
|
*******************************************************************
|
|
|
|
|
|
;
|
|
; Zortech C/386 32-bit compiler V2.1
|
|
; Use with mirdef.h32
|
|
; Written for Phar lap 386ASM macro-assembler
|
|
;
|
|
|
|
.386
|
|
ASSUME CS:_TEXT
|
|
_TEXT SEGMENT BYTE PUBLIC 'CODE'
|
|
|
|
PUBLIC _muldiv
|
|
; _muldiv(a,b,c,m,rp): returns (a*b+c)/m in eax, stores the remainder in *rp
; Stack arguments: [esp+4]=a, [esp+8]=b, [esp+12]=c, [esp+16]=m, [esp+20]=rp
; Note: ebx is overwritten here without being saved
_muldiv PROC NEAR
|
|
|
|
mov eax,DWORD PTR [esp+4]       ;eax = a
|
|
mul DWORD PTR [esp+8]           ;edx:eax = a*b
|
|
add eax,DWORD PTR [esp+12]      ;add c to the low word
|
|
adc edx,0                       ;carry into the high word
|
|
div DWORD PTR [esp+16]          ;divide by m: eax = quotient, edx = remainder
|
|
mov ebx,DWORD PTR [esp+20]      ;ebx = rp
|
|
mov [ebx],edx                   ;*rp = remainder
|
|
ret                             ;quotient in eax
|
|
|
|
_muldiv endP
|
|
|
|
PUBLIC _muldvm
|
|
; _muldvm(a,c,m,rp): returns (a*base+c)/m in eax, stores the remainder in *rp
; Stack arguments: [esp+4]=a, [esp+8]=c, [esp+12]=m, [esp+16]=rp
; Note: ebx is overwritten here without being saved
_muldvm PROC NEAR
|
|
|
|
mov edx,DWORD PTR [esp+4]       ;high word of dividend = a
|
|
mov eax,DWORD PTR [esp+8]       ;low word of dividend = c
|
|
div DWORD PTR [esp+12]          ;divide by m: eax = quotient, edx = remainder
|
|
mov ebx,DWORD PTR [esp+16]      ;ebx = rp
|
|
mov [ebx],edx                   ;*rp = remainder
|
|
ret                             ;quotient in eax
|
|
|
|
_muldvm endP
|
|
|
|
PUBLIC _muldvd
|
|
; _muldvd(a,b,c,rp): returns the high word of a*b+c in eax, stores the low word in *rp
; Stack arguments: [esp+4]=a, [esp+8]=b, [esp+12]=c, [esp+16]=rp
; Note: ebx is overwritten here without being saved
_muldvd PROC NEAR
|
|
|
|
mov eax,DWORD PTR [esp+4]       ;eax = a
|
|
mul DWORD PTR [esp+8]           ;edx:eax = a*b
|
|
add eax,DWORD PTR [esp+12]      ;add c to the low word
|
|
adc edx,0                       ;carry into the high word
|
|
mov ebx,DWORD PTR [esp+16]      ;ebx = rp
|
|
mov [ebx],eax                   ;*rp = low word
|
|
mov eax,edx                     ;return the high word
|
|
ret
|
|
|
|
_muldvd endP
|
|
|
|
_TEXT ENDS
|
|
END
|
|
|
|
|
|
|
|
************************************************************************
|
|
|
|
|
|
unsigned int muldiv(a,b,c,m,rp)
|
|
unsigned int a,b,c,m,*rp;
|
|
{
|
|
asm
|
|
{
|
|
;
|
|
; MACintosh version for Megamax or Lightspeed Think C compiler
|
|
; with 16-bit int, 68000 processor
|
|
; For a 32 bit version for the 68020, see below
|
|
;
|
|
move a(A6),D1 ;get a
|
|
mulu b(A6),D1 ;multiply by b
|
|
clr.l D0
|
|
move c(A6),D0 ;get c
|
|
add.l D0,D1 ;D1 contains a*b+c
|
|
divu m(A6),D1 ;divide by m
|
|
move D1,D0 ;return with quotient in D0
|
|
swap D1 ;get remainder
|
|
move.l rp(A6),A0 ;get address for remainder
|
|
move D1,(A0) ;store remainder
|
|
}
|
|
}
|
|
|
|
unsigned int muldvm(a,c,m,rp)
|
|
unsigned int a,c,m,*rp;
|
|
{
|
|
asm
|
|
{
|
|
;
|
|
; Version of muldvm for Apple MAC
|
|
;
|
|
clr.l D1
|
|
move a(A6),D1 ;get a
|
|
swap D1 ;move a to high word
|
|
move c(A6),D1 ;add in c
|
|
divu m(A6),D1 ;divide by m
|
|
move D1,D0 ;return quotient in D0
|
|
swap D1 ;get remainder
|
|
move.l rp(A6),A0 ;get address for remainder
|
|
move D1,(A0) ;store remainder
|
|
}
|
|
}
|
|
|
|
unsigned int muldvd(a,b,c,rp)
|
|
unsigned int a,b,c,*rp;
|
|
{
|
|
asm
|
|
{
|
|
;
|
|
; Version of muldvd for Apple MAC
|
|
;
|
|
move a(A6),D1 ;get a
|
|
mulu b(a6),D1 ;multiply by b
|
|
clr.l D0
|
|
move c(A6),D0 ;get c
|
|
add.l D0,D1 ;add in c
|
|
move.l D1,D0
|
|
swap D0 ;return quotient in D0
|
|
move.l rp(A6),A0 ;get address for remainder
|
|
move D1,(A0) ;store remainder
|
|
}
|
|
}
|
|
|
|
|
|
**********************************************************************
|
|
|
|
|
|
#
|
|
# 68020+ versions for Next, and for new 32-bit Macs
|
|
# Parameters come off the stack
|
|
#
|
|
|
|
.globl _muldiv,_muldvm,_muldvd
|
|
|
|
# _muldiv(a,b,c,m,rp): returns (a*b+c)/m in d0, stores the remainder in *rp
# Stack arguments: sp@(4)=a, sp@(8)=b, sp@(12)=c, sp@(16)=m, sp@(20)=rp
_muldiv:
|
|
movel sp@(4),d0              # d0 = a
|
|
mulul sp@(8),d1:d0           # d1:d0 = a*b (d1 high word, d0 low word)
|
|
addl sp@(12),d0              # add c to the low word
|
|
negxl d1 # tricky stuff! d1 = 0 - d1 - X, where X is the carry from the add
|
|
negl d1                      # d1 = 0 - d1, so net effect is d1 = d1 + X
|
|
divul sp@(16),d1:d0          # divide d1:d0 by m: quotient in d0, remainder in d1
|
|
movel sp@(20),a0             # a0 = rp
|
|
movel d1,a0@                 # *rp = remainder
|
|
rts                          # quotient returned in d0
|
|
|
|
# _muldvm(a,c,m,rp): returns (a*base+c)/m in d0, stores the remainder in *rp
# Stack arguments: sp@(4)=a, sp@(8)=c, sp@(12)=m, sp@(16)=rp
_muldvm:
|
|
movel sp@(4),d1              # high word of dividend = a
|
|
movel sp@(8),d0              # low word of dividend = c
|
|
divul sp@(12),d1:d0          # divide d1:d0 by m: quotient in d0, remainder in d1
|
|
movel sp@(16),a0             # a0 = rp
|
|
movel d1,a0@                 # *rp = remainder
|
|
rts                          # quotient returned in d0
|
|
|
|
# _muldvd(a,b,c,rp): returns the high word of a*b+c in d0, stores the low word in *rp
# Stack arguments: sp@(4)=a, sp@(8)=b, sp@(12)=c, sp@(16)=rp
_muldvd:
|
|
movel sp@(4),d1              # d1 = a
|
|
mulul sp@(8),d0:d1           # d0:d1 = a*b (d0 high word, d1 low word)
|
|
addl sp@(12),d1              # add c to the low word
|
|
negxl d0                     # d0 = 0 - d0 - X, where X is the carry from the add
|
|
negl d0                      # d0 = 0 - d0, so net effect is d0 = d0 + X
|
|
movel sp@(16),a0             # a0 = rp
|
|
movel d1,a0@                 # *rp = low word
|
|
rts                          # high word returned in d0
|
|
|
|
|
|
|
|
*************************************************************************
|
|
|
|
|
|
unsigned int muldiv(a,b,c,m,rp)
|
|
unsigned int a,b,c,m,*rp;
|
|
{
|
|
asm
|
|
{
|
|
;
|
|
; 32016 processor version for BBC Master Scientific
|
|
; with 32-bit int, by Dudley Long, Rutherford-Appleton Labs.
|
|
; No muldvm() or muldvd()
|
|
;
|
|
movd a,0 ;move a to R0
|
|
meid b,0 ;multiply by b, result extended
|
|
addd c,0 ;add c to extended number in R0 & R1
|
|
addcd #0,1
|
|
deid m,0 ;divide by m
|
|
movd 0,0(rp) ;remainder to *rp
|
|
movd 1,0 ;quotient returned in R0
|
|
}
|
|
}
|
|
|
|
|
|
*******************************************************************
|
|
|
|
|
|
;
|
|
; NOTE! This code is obsolete. Newer ARMs support a 32x32 UMULL instruction
|
|
; The ARM compiler supports a long long type, so a C only version may be
|
|
; faster
|
|
;
|
|
; Acorn ARM Risc version (32-bit) for Archimedes micro
|
|
; Wingpass Macro Assembler
|
|
; Use with mirdef.h32
|
|
;
|
|
.INCLUDE "A.REGNAMES"
|
|
|
|
.AREA C$$code, .CODE, .READONLY
|
|
|
|
muldiv::
|
|
MOV ip, sp ;standard linkage
|
|
STMFD sp!, {v1-v4}
|
|
|
|
CMPS a2,#0x80000000 ;check for b=MAXBASE
|
|
MOVEQ v3,a1,LSL #31 ;this idea is quicker because
|
|
MOVEQ v4,a1,LSR #1 ;of ARM barrel shifting capability
|
|
BEQ addin
|
|
MOV v1,a1,LSR #16 ;do it the hard way
|
|
MOV v2,a2,LSR #16
|
|
BIC a1,a1,v1,LSL #16
|
|
BIC a2,a2,v2,LSL #16
|
|
MUL v3,a1,a2 ;form partial products of a*b
|
|
MUL v4,v1,v2
|
|
SUB v1,v1,a1
|
|
SUB v2,a2,v2
|
|
MLA v1,v2,v1,v3 ;look - only 3 MULs!
|
|
ADD v1,v1,v4
|
|
ADDS v3,v3,v1,LSL #16
|
|
ADC v4,v4,v1,LSR #16
|
|
addin:
|
|
ADDS v3,v3,a3 ;now add in c
|
|
ADCCS v4,v4,#0
|
|
|
|
CMPS a4,#0x80000000 ;check for m=MAXBASE
|
|
MOVEQ a1,v3,LSR #31
|
|
ADDEQ a1,a1,v4,LSL #1
|
|
BICEQ v4,v3,#0x80000000
|
|
BEQ leave
|
|
MOV a1,#0 ;do long division by m
|
|
|
|
divlp:
|
|
|
|
.REPEAT 32 ;2xfaster than a loop!
|
|
MOVS v3,v3,ASL #1 ;get next bit into carry
|
|
ADC v4,v4,v4 ;accumulate remainder
|
|
CMPS v4,a4
|
|
SUBCS v4,v4,a4
|
|
ADC a1,a1,a1 ;accumulate quotient
|
|
.ENDREPEAT
|
|
|
|
leave:
|
|
LDR v3,[ip]
|
|
STR v4,[v3] ;store remainder
|
|
LDMFD sp!, {v1-v4}
|
|
MOVS pc,lr
|
|
|
|
muldvm::
|
|
STMFD sp!, {v1-v2}
|
|
|
|
MOV v2,a1 ;'multiply' by 2^32
|
|
MOV v1,a2 ;add in c
|
|
|
|
MOV a1,#0 ;do long division by m
|
|
|
|
.REPEAT 32 ;2xfaster than a loop!
|
|
MOVS v1,v1,ASL #1 ;get next bit into carry
|
|
ADCS v2,v2,v2 ;accumulate remainder
|
|
CMPCCS v2,a3
|
|
SUBCS v2,v2,a3
|
|
ADC a1,a1,a1 ;accumulate quotient
|
|
.ENDREPEAT
|
|
|
|
STR v2,[a4] ;store remainder
|
|
LDMFD sp!, {v1-v2}
|
|
MOVS pc,lr
|
|
|
|
|
|
muldvd::
|
|
STMFD sp!, {v1-v2}
|
|
|
|
MOV ip,a1,LSR #16 ;do it the hard way
|
|
MOV v2,a2,LSR #16
|
|
BIC a1,a1,ip,LSL #16
|
|
BIC a2,a2,v2,LSL #16
|
|
MUL v1,a1,a2 ;form partial products of a*b
|
|
MUL a2,ip,a2
|
|
MUL a1,v2,a1
|
|
MUL v2,ip,v2
|
|
ADDS a1,a2,a1
|
|
ADDCS v2,v2,#0x10000
|
|
ADDS v1,v1,a1,LSL #16
|
|
ADC v2,v2,a1,LSR #16
|
|
|
|
ADDS v1,v1,a3 ;now add in c
|
|
ADCCS v2,v2,#0
|
|
MOV a1,v2 ;get quotient
|
|
|
|
STR v1,[a4] ;store remainder
|
|
LDMFD sp!, {v1-v2}
|
|
MOVS pc,lr
|
|
|
|
|
|
|
|
********************************************************************
|
|
|
|
|
|
;
|
|
; Version for Pyramid 90x and 98x computers
|
|
; from Rod Worley, Monash University, Victoria, Australia
|
|
;
|
|
; No muldvm() or muldvd()
|
|
;
|
|
.text 0
|
|
.globl _muldiv
|
|
; _muldiv(a,b,c,m,rp): returns (a*b+c)/m in pr0, stores the remainder in *rp
; On entry: pr0=a, pr1=b, pr2=c, pr3=m, pr4=rp
_muldiv:
|
|
movw pr0,pr8 ;save a in reg 8
|
|
movw $0x0,pr0 ;zero reg0 so long reg 0,1 is b
|
|
emul pr8,pr0 ;extended multiply by a
|
|
addw pr2,pr1 ;add c to extended result
|
|
addwc $0x0,pr0 ;presumably add-with-carry: propagate carry into the high word
|
|
ediv pr3,pr0 ;extended div by m
|
|
movw pr1,(pr4) ;store remainder
|
|
ret ;return quotient in pr0
|
|
|
|
|
|
************************************************************************
|
|
|
|
|
|
/* This is the transputer version, by A.H. Pepperdine */
|
|
/* Assumes that the result will fit into a 32-bit word */
|
|
/* The error flag will be set if */
|
|
/* (a*b+c)/m >= 2**32 */
|
|
/* ie. equivalently, if */
|
|
/* ( (a*b+c) >> 32) >= m */
|
|
|
|
unsigned int muldiv(unsigned int a, unsigned int b, unsigned int c,
|
|
unsigned int m, unsigned int * rp)
|
|
{
|
|
unsigned int q;
|
|
__asm
|
|
{
|
|
ldabc a, b, c;
|
|
lmul ;
|
|
ld m;
|
|
ldiv ;
|
|
stab q, *rp;
|
|
}
|
|
return q;
|
|
}
|
|
|
|
/* The base is 2**32, ie a full 32-bit unsigned integer */
|
|
/* The error flag will be set if the result will not fit*/
|
|
/* into a word, ie. */
|
|
/* for muldvm that is if (a >= m) */
|
|
/* and for muldvd it cannot happen */
|
|
|
|
unsigned int muldvm(unsigned int a, unsigned int c,
|
|
unsigned int m, unsigned int * rp)
|
|
{
|
|
unsigned int q;
|
|
__asm
|
|
{
|
|
ldabc m, c, a;
|
|
ldiv ;
|
|
stab q, *rp;
|
|
}
|
|
return q;
|
|
}
|
|
|
|
unsigned int muldvd(unsigned int a, unsigned int b, unsigned int c,
|
|
unsigned int * rp)
|
|
{
|
|
unsigned int q;
|
|
__asm
|
|
{
|
|
ldabc a, b, c;
|
|
lmul ;
|
|
stab *rp, q;
|
|
}
|
|
return q;
|
|
}
|
|
|
|
|
|
*********************************************************************
|
|
|
|
|
|
/* Now ... just to confuse you even more ....
|
|
|
|
Blakeley/Sloan 'portable' method for Modular multiplication IEEE Trans
|
|
Computers C-34 March 1985 pp 290-292 eliminates need for double length
|
|
product - but will be slow. Might suit some RISC computers with no
|
|
multiply/divide instructions. To speed up try completely unravelling for()
|
|
loops.
|
|
|
|
This method should only be used if the mr_utype data type is twice the size
|
|
of a "mr_hltype" data-type. This must be defined below.
|
|
|
|
Note: DON'T define MR_NOASM in mirdef.h if using this method.
|
|
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include "miracl.h"
|
|
|
|
mr_small muldiv(a,b,c,m,rp)
|
|
mr_small a,b,c,m;
|
|
mr_small *rp;
|
|
{
|
|
int i;
|
|
mr_small d,q=0,r=0;
|
|
d=m-a;
|
|
for (i=MIRACL/4;i>0;i--)
|
|
{ /* do it bit by bit */
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if ((mr_utype)b<0)
|
|
{
|
|
if (r>=m) { r-=d; q++; }
|
|
else r+=a;
|
|
}
|
|
if (r>=m) { r-=m; q++; }
|
|
b<<=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if ((mr_utype)b<0)
|
|
{
|
|
if (r>=m) { r-=d; q++; }
|
|
else r+=a;
|
|
}
|
|
if (r>=m) { r-=m; q++; }
|
|
b<<=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if ((mr_utype)b<0)
|
|
{
|
|
if (r>=m) { r-=d; q++; }
|
|
else r+=a;
|
|
}
|
|
if (r>=m) { r-=m; q++; }
|
|
b<<=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if ((mr_utype)b<0)
|
|
{
|
|
if (r>=m) { r-=d; q++; }
|
|
else r+=a;
|
|
}
|
|
if (r>=m) { r-=m; q++; }
|
|
b<<=1;
|
|
}
|
|
*rp=r;
|
|
return q;
|
|
}
|
|
|
|
mr_small muldvm(a,c,m,rp)
|
|
mr_small a,c,m;
|
|
mr_small *rp;
|
|
{ /* modified Blakely-Sloan */
|
|
register int i,carry;
|
|
register mr_small q=0,r=0;
|
|
r=a;
|
|
for (i=MIRACL/4;i>0;i--)
|
|
{ /* do it bit by bit */
|
|
carry=0;
|
|
if ((mr_utype)r<0) carry=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if (carry || r>=m) { r-=m; q++; }
|
|
carry=0;
|
|
if ((mr_utype)r<0) carry=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if (carry || r>=m) { r-=m; q++; }
|
|
carry=0;
|
|
if ((mr_utype)r<0) carry=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if (carry || r>=m) { r-=m; q++; }
|
|
carry=0;
|
|
if ((mr_utype)r<0) carry=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if (carry || r>=m) { r-=m; q++; }
|
|
}
|
|
*rp=r;
|
|
return q;
|
|
}
|
|
|
|
/* define mr_hltype as that C type that is half the size in bits of the
|
|
underlying type (mr_utype in mirdef.h). Perhaps short if mr_utype is long?
|
|
Possibly int if mr_utype is 64-bit long long ?? */
|
|
|
|
#define mr_hltype short
|
|
|
|
mr_small muldvd(a,b,c,rp)
|
|
mr_small a,b,c;
|
|
mr_small *rp;
|
|
{ /* multiply by parts */
|
|
mr_small middle,middle2;
|
|
mr_small q,r;
|
|
unsigned mr_hltype am,al,bm,bl;
|
|
int hshift=(MIRACL>>1);
|
|
am=(unsigned mr_hltype)(a>>hshift);
|
|
al=(unsigned mr_hltype)a;
|
|
bm=(unsigned mr_hltype)(b>>hshift);
|
|
bl=(unsigned mr_hltype)b;
|
|
/* form partial products */
|
|
r= (mr_small)al*bl;
|
|
q= (mr_small)am*bm;
|
|
middle=(mr_small)al*bm;
|
|
middle2=(mr_small)bl*am;
|
|
middle+=middle2; /* combine them - carefully */
|
|
if (middle<middle2) q+=((mr_small)1<<hshift);
|
|
r+=(middle << hshift);
|
|
if ((r>>hshift)<(unsigned mr_hltype)middle) q++;
|
|
q+=(middle>>hshift);
|
|
r+=c;
|
|
if (r<c) q++;
|
|
*rp=r;
|
|
return q;
|
|
}
|
|
|
|
void muldvd2(a,b,c,rp)
|
|
mr_small a,b;
|
|
mr_small *c,*rp;
|
|
{ /* multiply by parts */
|
|
mr_small middle,middle2;
|
|
mr_small q,r;
|
|
unsigned mr_hltype am,al,bm,bl;
|
|
int hshift=(MIRACL>>1);
|
|
am=(unsigned mr_hltype)(a>>hshift);
|
|
al=(unsigned mr_hltype)a;
|
|
bm=(unsigned mr_hltype)(b>>hshift);
|
|
bl=(unsigned mr_hltype)b;
|
|
/* form partial products */
|
|
r= (mr_small)al*bl;
|
|
q= (mr_small)am*bm;
|
|
middle=(mr_small)al*bm;
|
|
middle2=(mr_small)bl*am;
|
|
middle+=middle2; /* combine them - carefully */
|
|
if (middle<middle2) q+=((mr_small)1<<hshift);
|
|
r+=(middle << hshift);
|
|
if ((r>>hshift)<(unsigned mr_hltype)middle) q++;
|
|
q+=(middle>>hshift);
|
|
r+=*c;
|
|
if (r<*c) q++;
|
|
r+=*rp;
|
|
if (r<*rp) q++;
|
|
*rp=r;
|
|
*c=q;
|
|
}
|
|
|
|
*************************************************************************
|
|
|
|
|
|
/* SPARC assembler version of above. Note that when Full-width base
|
|
working is used, then muldvd() is the most time-critical of these
|
|
three routines. Use with above Blakely-Sloan C versions of muldvm
|
|
and muldiv (Assumes mr_utype is 32 bit int) */
|
|
.global _muldvd
|
|
_muldvd:
|
|
mov %o1,%y
|
|
andcc %g0,%g0,%o4
|
|
nop
|
|
nop
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%o0,%o4
|
|
mulscc %o4,%g0,%o4
|
|
tst %o0
|
|
bge 1f
|
|
nop
|
|
add %o4,%o1,%o4
|
|
1:
|
|
rd %y,%o1
|
|
addcc %o1,%o2,%o1
|
|
st %o1,[%o3]
|
|
retl
|
|
addxcc %o4,%g0,%o0
|
|
|
|
|
|
**************************************************************************
|
|
|
|
|
|
/* If you have a "decent" SPARC which supports UMUL and UDIV instructions
|
|
then the following will be much faster. Cut and paste what follows
|
|
into mrmuldv.s. See miracl.mak make file
|
|
|
|
Aside: God, I hate the Sparc, with its slippery ill-defined Instruction
|
|
set. Not all implementations support UMUL and UDIV, so it's safer
|
|
to use the method above.
|
|
|
|
Note: Sometimes the routine name needs a preceding underscore,
|
|
so it may be necessary to change for example muldvd to _muldvd
|
|
through-out. Depends on the Unix version
|
|
*/
|
|
|
|
.global muldvd
|
|
/* muldvd(a,b,c,rp): returns the high word of a*b+c, stores the low word in *rp */
/* On entry: %o0=a, %o1=b, %o2=c, %o3=rp */
muldvd:
|
|
umul %o0,%o1,%o0                /* %o0 = low word of a*b, %y = high word */
|
|
rd %y,%o1                       /* %o1 = high word */
|
|
addcc %o0,%o2,%o0               /* low word += c, setting carry */
|
|
st %o0,[%o3]                    /* *rp = low word */
|
|
retl
|
|
addx %o1,%g0,%o0                /* (delay slot) return high word + carry */
|
|
|
|
.global muldvd2
|
|
/* muldvd2(a,b,c,rp): computes a*b + *c + *rp; low word goes to *rp, high word to *c */
/* On entry: %o0=a, %o1=b, %o2=c (pointer), %o3=rp (pointer) */
muldvd2:
|
|
umul %o0,%o1,%o0                /* %o0 = low word of a*b, %y = high word */
|
|
rd %y,%o1                       /* %o1 = high word */
|
|
ld [%o2],%o5                    /* %o5 = *c */
|
|
addcc %o0,%o5,%o0               /* low word += *c, setting carry */
|
|
ld [%o3],%o5                    /* %o5 = *rp */
|
|
addx %o1,%g0,%o1                /* high word += carry */
|
|
addcc %o0,%o5,%o0               /* low word += *rp, setting carry */
|
|
st %o0,[%o3]                    /* *rp = low word */
|
|
addx %o1,%g0,%o1                /* high word += carry */
|
|
retl
|
|
st %o1,[%o2]                    /* (delay slot) *c = high word */
|
|
|
|
.global muldvm
|
|
/* muldvm(a,c,m,rp): returns (a*base+c)/m, stores the remainder in *rp */
/* On entry: %o0=a, %o1=c, %o2=m, %o3=rp */
muldvm:
|
|
mov %o0,%y                      /* %y = a, the high word of the dividend */
|
|
nop                             /* presumably padding so the write to %y completes before udiv */
|
|
nop
|
|
nop
|
|
udiv %o1,%o2,%o0                /* %o0 = (%y:%o1)/m, the quotient */
|
|
umul %o0,%o2,%o2                /* %o2 = low word of quotient*m */
|
|
sub %o1,%o2,%o1                 /* remainder = c - quotient*m (low word only) */
|
|
retl
|
|
st %o1,[%o3]                    /* (delay slot) *rp = remainder */
|
|
|
|
.global muldiv
|
|
/* muldiv(a,b,c,m,rp): returns (a*b+c)/m, stores the remainder in *rp */
/* On entry: %o0=a, %o1=b, %o2=c, %o3=m, %o4=rp */
muldiv:
|
|
umul %o0,%o1,%o1                /* %o1 = low word of a*b, %y = high word */
|
|
rd %y,%o0                       /* %o0 = high word */
|
|
addcc %o1,%o2,%o1               /* low word += c, setting carry */
|
|
addx %o0,%g0,%o0                /* high word += carry */
|
|
mov %o0,%y                      /* %y = high word of the dividend */
|
|
nop                             /* presumably padding so the write to %y completes before udiv */
|
|
nop
|
|
nop
|
|
udiv %o1,%o3,%o0                /* %o0 = (%y:%o1)/m, the quotient */
|
|
umul %o0,%o3,%o2                /* %o2 = low word of quotient*m */
|
|
sub %o1,%o2,%o1                 /* remainder = low word - quotient*m */
|
|
retl
|
|
st %o1,[%o4]                    /* (delay slot) *rp = remainder */
|
|
|
|
|
|
/* In-line assembly for SPARC using double type */
|
|
|
|
#include <stdio.h>
|
|
#include "miracl.h"
|
|
|
|
/*
 * muldiv - returns the quotient (a*b+c)/m and stores the remainder in *rp,
 * for a floating-point mr_small, using quad-precision intermediates.
 *
 * The output q is written (fqtod ...,%0) before the input m (%4) is read
 * for the last time, so it is declared earlyclobber ("=&f"). Without that
 * the compiler may place q in the same register as one of the inputs.
 */
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{
    mr_small q;
    static mr_small magic=MR_MAGIC;
    __asm__ __volatile__ (
    "fdmulq %1,%2,%%f0\n"           /* f0 = a*b (quad)                      */
    "fdtoq %3,%%f4\n"               /* f4 = c                               */
    "faddq %%f0,%%f4,%%f0\n"        /* f0 = a*b+c                           */
    "fdtoq %4,%%f4\n"               /* f4 = m                               */
    "fdivq %%f0,%%f4,%%f4\n"        /* f4 = (a*b+c)/m                       */
    "fdtoq %5,%%f8\n"               /* f8 = magic                           */
    "faddq %%f4,%%f8,%%f4\n"        /* add then subtract magic - presumably */
    "fsubq %%f4,%%f8,%%f4\n"        /* strips the fraction; see MR_MAGIC    */
    "fqtod %%f4,%0\n"               /* q = integer quotient                 */
    "fdmulq %0,%4,%%f8\n"           /* f8 = q*m                             */
    "fsubq %%f0,%%f8,%%f0\n"        /* f0 = (a*b+c) - q*m, the remainder    */
    "fqtod %%f0,%%f10\n"
    "std %%f10,[%6]\n"              /* *rp = remainder                      */
    : "=&f"(q)
    : "f"(a),"f"(b),"f"(c),"f"(m),"f"(magic),"r"(rp)
    : "f0","f1","f2","f3","f4","f5","f6","f7","f8","f9","f10","f11","memory"
    );
    return q;
}
|
|
|
|
#ifdef MR_FP_ROUNDING
|
|
|
|
/*
 * imuldiv - as muldiv, but multiplies by im (the caller-supplied
 * precomputed inverse of m) instead of dividing by m. Returns the quotient
 * and stores the remainder in *rp.
 *
 * The output q is written (fqtod ...,%0) before the input m (%5) is read,
 * so it is declared earlyclobber ("=&f"). Without that the compiler may
 * place q in the same register as one of the inputs.
 */
mr_small imuldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_large im,mr_small *rp)
{
    mr_small q;
    static mr_small magic=MR_MAGIC;
    __asm__ __volatile__ (
    "fdmulq %1,%2,%%f0\n"           /* f0 = a*b (quad)                      */
    "fdtoq %3,%%f4\n"               /* f4 = c                               */
    "faddq %%f0,%%f4,%%f0\n"        /* f0 = a*b+c                           */

    "fmulq %4,%%f0,%%f4\n"          /* f4 = im*(a*b+c)                      */
    "fdtoq %6,%%f8\n"               /* f8 = magic                           */
    "faddq %%f4,%%f8,%%f4\n"        /* add then subtract magic - presumably */
    "fsubq %%f4,%%f8,%%f4\n"        /* strips the fraction; see MR_MAGIC    */
    "fqtod %%f4,%0\n"               /* q = integer quotient                 */

    "fdmulq %0,%5,%%f8\n"           /* f8 = q*m                             */
    "fsubq %%f0,%%f8,%%f0\n"        /* f0 = (a*b+c) - q*m, the remainder    */
    "fqtod %%f0,%%f10\n"
    "std %%f10,[%7]\n"              /* *rp = remainder                      */
    : "=&f"(q)
    : "f"(a),"f"(b),"f"(c),"f"(im),"f"(m),"f"(magic),"r"(rp)
    : "f0","f1","f2","f3","f4","f5","f6","f7","f8","f9","f10","f11","memory"
    );
    return q;
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* before leaving the SPARC, here is an interesting idea
|
|
Specify the underlying type as the 64-bit long long, as supported by the
|
|
GCC compiler. Use the Blakely-Sloan Portable Code above, with mr_hltype
|
|
defined as a long. This has been tried and works, getting 64-bit
|
|
behaviour from a 32-bit processor! It's slower than the 32-bit code above,
|
|
but if the 64-bit mrmuldvd() were rewritten in fast assembler.....? */
|
|
|
|
|
|
************************************************************************
|
|
|
|
#########################################################################################
|
|
#
|
|
# mrmuldv.s
|
|
# author: G. Garth Feb 1996
|
|
#
|
|
# implementation of modular multiplication for smalls
|
|
# using Blakely-Sloan division algorithm
|
|
# for Motorola 601 and 604 RISC PowerPC 32-bit processors
|
|
# see IEEE trans. Computers C-34, No. 3, March 1985 pp. 290-292
|
|
#
|
|
# see also PowerPC Microprocessor Developer's guide
|
|
# by Bunda, Potter & Shadowen, SAMS 1995, Appendix A p. 177
|
|
#
|
|
# intended for use in MIRACL library as assembly language implementation
|
|
# of routines muldiv, muldvm and muldvd
|
|
# written for Apple MPW PPC Assembler for Macintosh PPC computers
|
|
#
|
|
# Division Algorithm Pseudo Code
|
|
# given: integers A,B,C,D and M where D = A * B + C
|
|
# this algorithm computes Q and R such that
|
|
# D = M * Q + R
|
|
# Constraints:
|
|
# A,B,C,M < 2^H where H is word length in bits
|
|
# 0 <= Q,R < M; 0 < D < 2^(2*H)
|
|
#
|
|
# let K = # of bits in D
|
|
#
|
|
# R = Q = 0;
|
|
# for(T = K - 1; T >= 0; T--)
|
|
# {
|
|
# R <<= 1;
|
|
# Q <<= 1;
|
|
# if(D[T] == 1)
|
|
# {
|
|
# R += 1;
|
|
# }
|
|
# while(R >= M)
|
|
# {
|
|
# R -= M;
|
|
# Q += 1;
|
|
# }
|
|
# }
|
|
#
|
|
#########################################################################################
|
|
|
|
export muldiv[DS]
|
|
export .muldiv[PR]
|
|
export muldvm[DS]
|
|
export .muldvm[PR]
|
|
export muldvd[DS]
|
|
export .muldvd[PR]
|
|
|
|
toc
|
|
tc muldiv[TC],muldiv[DS]
|
|
tc muldvm[TC],muldvm[DS]
|
|
tc muldvd[TC],muldvd[DS]
|
|
|
|
csect muldiv[DS]
|
|
dc.l .muldiv[PR]
|
|
dc.l TOC[tc0]
|
|
csect muldvm[DS]
|
|
dc.l .muldvm[PR]
|
|
dc.l TOC[tc0]
|
|
csect muldvd[DS]
|
|
dc.l .muldvd[PR]
|
|
dc.l TOC[tc0]
|
|
|
|
#
|
|
# unsigned int muldiv(a,b,c,m,rp)
|
|
# unsigned int a,b,c,m,*rp;
|
|
# returns q = int[(a*b+c)/m] and *rp = (a*b+c) mod m
|
|
# when called a -> (r3), b -> (r4), c -> (r5), m -> (r6), rp -> (r7)
|
|
# upon return q -> (r3), *rp -> [(r12)]
|
|
# registers used: r3 thru r12
|
|
#
|
|
|
|
csect .muldiv[PR]
|
|
function .muldiv[PR]
|
|
|
|
or r12,r7,r7 ;(r12) <- remainder address
|
|
mulhwu r8,r3,r4 ;(r8) <- a * b high word
|
|
mullw r9,r3,r4 ;(r9 ) <- a * b low word
|
|
addc r4,r5,r9 ;(r4) <- a * b + c dividend.lo
|
|
addze r3,r8 ;(r3) <- (r8) + XERca dividend.hi
|
|
subic. r5,r3,0 ;test for zero dividend.hi
|
|
bne divlong ;
|
|
;here if dividend is single word
|
|
divwu r3,r4,r6 ;(r3) <- quotient
|
|
mullw r7,r6,r3; ;(r7) <- r6 * int (r4 / r6)
|
|
subf r5,r7,r4 ;(r5) <- remainder.lo
|
|
stw r5,0x0000(r12) ;[(r12)] <- remainder
|
|
blr ;that's all for single word division
|
|
|
|
divlong:
|
|
xor r7,r7,r7 ;zero divisor.hi
|
|
nor r7,r7,r7 ;calc ~divisor.hi
|
|
subfic r8,r6,0 ;(r8) <- -divisor.lo, set CA
|
|
addze r7,r7 ;(r7) <- ~divisor.hi + CA
|
|
or r11,r4,r4 ;(r11) <- dividend.lo
|
|
or r4,r3,r3 ;(r4) <- dividend.hi
|
|
;try to shift ahead, skipping unnecessary
|
|
;shifting loops
|
|
cntlzw r10,r4 ;find order of dividend.hi
|
|
subfic r9,r10,32 ;calc shift = 32 - order
|
|
slw r4,r4,r10 ;shift ahead dividend.hi
|
|
srw r3,r11,r9 ;get shifted part of dividend.lo
|
|
or r4,r4,r3 ;combine with dividend.hi
|
|
slw r11,r11,r10 ;shift ahead dividend.lo
|
|
addi r9,r9,33 ;setup for looping
|
|
mtctr r9 ;
|
|
xor r3,r3,r3 ;clear quotient.lo
|
|
xor r5,r5,r5 ;clear shift.hi
|
|
xor r6,r6,r6 ;clear shift.lo
|
|
b ldiff ;skip first round of shifting
|
|
align 6 ;align loop to 64-byte boundary
|
|
lshift:
|
|
rlwinm r5,r5,1,0,30 ;shift.hi <<= 1
|
|
rlwimi r5,r6,1,31,31 ;shift.hi[31] = shift.lo[0]
|
|
rlwinm r6,r6,1,0,30 ;shift.lo <<= 1
|
|
rlwimi r6,r4,1,31,31 ;shift.lo[31] = dividend.hi[0]
|
|
rlwinm r4,r4,1,0,30 ;dividend.hi <<= 1
|
|
rlwimi r4,r11,1,31,31 ;dividend.hi[31] = dividend.lo[0]
|
|
rlwinm r11,r11,1,0,30 ;dividend.lo <<= 1
|
|
rlwinm r3,r3,1,0,30 ;quotient.lo <<=1
|
|
ldiff:
|
|
addc r10,r6,r8 ;diff.lo = shift.lo - divisor.lo, set CA
|
|
adde. r9,r5,r7 ;diff.hi = shift.hi - divisor.hi + CA
|
|
blt lloop ;loop if diff < 0
|
|
or r6,r10,r10 ;shift.lo = diff.lo
|
|
or r5,r9,r9 ;shift.hi = diff.hi
|
|
ori r3,r3,1 ;set bit in quotient
|
|
lloop:
|
|
bdnz lshift ;loop until done
|
|
stw r6,0x0000(r12) ;store remainder in rp address
|
|
blr ;return
|
|
|
|
#
|
|
# unsigned int muldvm(a,c,m,rp)
|
|
# unsigned int a,c,m,*rp;
|
|
# returns q = int[(a*base+c)/m] and *rp = (a*base+c) mod m
|
|
# when called a -> (r3), c -> (r4), m -> (r5), rp -> (r6)
|
|
# upon return q -> (r3), *rp -> [(r12)]
|
|
# registers used: r3 thru r12
|
|
#
|
|
|
|
csect .muldvm[PR]
|
|
function .muldvm[PR]
|
|
|
|
or r12,r6,r6 ;(r12) <- remainder address
|
|
or r6,r5,r5 ;(r6) <- m
|
|
xor r7,r7,r7 ;zero divisor.hi
|
|
nor r7,r7,r7 ;calc ~divisor.hi
|
|
subfic r8,r6,0 ;(r8) <- calc -divisor.lo, set CA
|
|
addze r7,r7 ;(r7) <- ~divisor.hi += CA
|
|
or r11,r4,r4 ;(r11) <- dividend.lo
|
|
or r4,r3,r3 ;(r4) <- dividend.hi
|
|
;try to shift ahead, skipping unnecessary
|
|
;shifting loops
|
|
cntlzw r10,r4 ;find order of dividend.hi
|
|
subfic r9,r10,32 ;calc shift = 32 - order
|
|
slw r4,r4,r10 ;shift ahead dividend.hi
|
|
srw r3,r11,r9 ;get shifted part of dividend.lo
|
|
or r4,r4,r3 ;combine with dividend.hi
|
|
slw r11,r11,r10 ;shift ahead dividend.lo
|
|
addi r9,r9,33 ;setup for looping
|
|
mtctr r9 ;
|
|
xor r3,r3,r3 ;clear quotient.lo
|
|
xor r5,r5,r5 ;clear shift.hi
|
|
xor r6,r6,r6 ;clear shift.lo
|
|
b sdiff ;skip first round of shifting
|
|
align 6 ;align loop to 64-byte boundary
|
|
sshift:
|
|
rlwinm r5,r5,1,0,30 ;shift.hi <<= 1
|
|
rlwimi r5,r6,1,31,31 ;shift.hi[31] = shift.lo[0]
|
|
rlwinm r6,r6,1,0,30 ;shift.lo <<= 1
|
|
rlwimi r6,r4,1,31,31 ;shift.lo[31] = dividend.hi[0]
|
|
rlwinm r4,r4,1,0,30 ;dividend.hi <<= 1
|
|
rlwimi r4,r11,1,31,31 ;dividend.hi[31] = dividend.lo[0]
|
|
rlwinm r11,r11,1,0,30 ;dividend.lo <<= 1
|
|
rlwinm r3,r3,1,0,30 ;quotient.lo <<=1
|
|
sdiff:
|
|
addc r10,r6,r8 ;diff.lo = shift.lo - divisor.lo, set CA
|
|
adde. r9,r5,r7 ;diff.hi = shift.hi - divisor.hi + CA
|
|
blt sloop ;loop if diff < 0
|
|
or r6,r10,r10 ;shift.lo = diff.lo
|
|
or r5,r9,r9 ;shift.hi = diff.hi
|
|
ori r3,r3,1 ;set bit in quotient
|
|
sloop:
|
|
bdnz sshift
|
|
stw r6,0x0000(r12) ;store remainder in rp address
|
|
blr ;return
|
|
|
|
#
|
|
# unsigned int muldvd(a,b,c,rp)
|
|
# unsigned int a,b,c,*rp;
|
|
# returns q = int[(a*b+c)/base] and *rp = (a*b+c) mod base
|
|
# when called a -> (r3), b -> (r4), c -> (r5), rp -> (r6)
|
|
# upon return q -> (r3), *rp -> [(r6)]
|
|
# registers used: r3 thru r8
|
|
#
|
|
|
|
csect .muldvd[PR]
|
|
function .muldvd[PR]
|
|
|
|
mulhwu r7,r3,r4 ;(r7) <- a * b high word
|
|
mullw r8,r3,r4 ;(r8) <- a * b low word
|
|
addc r4,r8,r5 ;(r4) <- a * b + c
|
|
addze r3,r7 ;(r3) <- (r7) + XERca
|
|
stw r4,0x0000(r6) ;store remainder -> (r6)
|
|
blr ;return
|
|
|
|
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
/* Itanium code for Intel compiler, with mr_small a 64-bit long */
|
|
|
|
#include "miracl.h"
|
|
|
|
mr_small muldiv(a,b,c,m,rp)
|
|
mr_small a,b,c,m;
|
|
mr_small *rp;
|
|
{ /* Blakely-Sloan */
|
|
int i;
|
|
mr_small d,q=0,r=0;
|
|
d=m-a;
|
|
for (i=MIRACL/4;i>0;i--)
|
|
{ /* do it bit by bit */
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if ((mr_utype)b<0)
|
|
{
|
|
if (r>=m) { r-=d; q++; }
|
|
else r+=a;
|
|
}
|
|
if (r>=m) { r-=m; q++; }
|
|
b<<=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if ((mr_utype)b<0)
|
|
{
|
|
if (r>=m) { r-=d; q++; }
|
|
else r+=a;
|
|
}
|
|
if (r>=m) { r-=m; q++; }
|
|
b<<=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if ((mr_utype)b<0)
|
|
{
|
|
if (r>=m) { r-=d; q++; }
|
|
else r+=a;
|
|
}
|
|
if (r>=m) { r-=m; q++; }
|
|
b<<=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if ((mr_utype)b<0)
|
|
{
|
|
if (r>=m) { r-=d; q++; }
|
|
else r+=a;
|
|
}
|
|
if (r>=m) { r-=m; q++; }
|
|
b<<=1;
|
|
}
|
|
*rp=r;
|
|
return q;
|
|
}
|
|
|
|
mr_small muldvm(a,c,m,rp)
|
|
mr_small a,c,m;
|
|
mr_small *rp;
|
|
{ /* modified Blakely-Sloan */
|
|
register int i,carry;
|
|
register mr_small q=0,r=0;
|
|
r=a;
|
|
for (i=MIRACL/4;i>0;i--)
|
|
{ /* do it bit by bit */
|
|
carry=0;
|
|
if ((mr_utype)r<0) carry=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if (carry || r>=m) { r-=m; q++; }
|
|
carry=0;
|
|
if ((mr_utype)r<0) carry=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if (carry || r>=m) { r-=m; q++; }
|
|
carry=0;
|
|
if ((mr_utype)r<0) carry=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if (carry || r>=m) { r-=m; q++; }
|
|
carry=0;
|
|
if ((mr_utype)r<0) carry=1;
|
|
r<<=1;
|
|
if ((mr_utype)c<0) r++;
|
|
c<<=1;
|
|
q<<=1;
|
|
if (carry || r>=m) { r-=m; q++; }
|
|
}
|
|
*rp=r;
|
|
return q;
|
|
}
|
|
|
|
/* use intrinsics for speed */
|
|
|
|
/* These are now in-lined - see miracl.h */
|
|
|
|
/*
|
|
|
|
#include <ia64intrin.h>
|
|
|
|
mr_small muldvd(a,b,c,rp)
|
|
mr_small a,b,c;
|
|
mr_small *rp;
|
|
{
|
|
*rp=_m64_xmalu(a,b,c);
|
|
return _m64_xmahu(a,b,c);
|
|
}
|
|
|
|
void muldvd2(a,b,c,rp)
|
|
mr_small a,b;
|
|
mr_small *c,*rp;
|
|
{
|
|
mr_small bot;
|
|
bot=_m64_xmalu(a,b,*c);
|
|
*c=_m64_xmahu(a,b,*c);
|
|
bot+=*rp;
|
|
if (bot<*rp) (*c)++;
|
|
*rp=bot;
|
|
}
|
|
|
|
*/
|
|
|
|
|
|
/
|
|
/ GNU C for Linux (AMD64)
|
|
/ Parameters are passed in rdi,rsi,rdx,rcx,r8....
|
|
/
|
|
|
|
.file "mrmuldv.s"
|
|
.text
|
|
.globl muldiv
|
|
/* muldiv(a,b,c,m,rp): returns (a*b+c)/m in rax, stores the remainder in *rp */
/* On entry: rdi=a, rsi=b, rdx=c, rcx=m, r8=rp */
muldiv:
|
|
|
|
pushq %rbx                      /* rbx is used as scratch, so save it */
|
|
movq %rdi,%rax                  /* rax = a */
|
|
movq %rdx,%rbx                  /* keep c in rbx: mulq overwrites rdx */
|
|
mulq %rsi                       /* rdx:rax = a*b */
|
|
addq %rbx,%rax                  /* add c to the low word */
|
|
adcq $0,%rdx                    /* carry into the high word */
|
|
|
|
divq %rcx                       /* divide by m: rax = quotient, rdx = remainder */
|
|
movq %r8,%rbx                   /* rbx = rp */
|
|
movq %rdx,(%rbx)                /* *rp = remainder */
|
|
popq %rbx                       /* restore rbx */
|
|
|
|
ret
|
|
|
|
.globl muldvm
|
|
/* muldvm(a,c,m,rp): returns (a*base+c)/m in rax, stores the remainder in *rp */
/* On entry: rdi=a, rsi=c, rdx=m, rcx=rp */
muldvm:
|
|
|
|
pushq %rbx                      /* rbx is used as scratch, so save it */
|
|
movq %rdx,%rbx                  /* rbx = m (rdx is needed for the dividend) */
|
|
movq %rdi,%rdx                  /* high word of dividend = a */
|
|
movq %rsi,%rax                  /* low word of dividend = c */
|
|
divq %rbx                       /* divide by m: rax = quotient, rdx = remainder */
|
|
|
|
movq %rcx,%rbx                  /* rbx = rp */
|
|
movq %rdx,(%rbx)                /* *rp = remainder */
|
|
popq %rbx                       /* restore rbx */
|
|
|
|
ret
|
|
|
|
.globl muldvd
|
|
/* muldvd(a,b,c,rp): returns the high word of a*b+c in rax, stores the low word in *rp */
/* On entry: rdi=a, rsi=b, rdx=c, rcx=rp */
muldvd:
|
|
|
|
pushq %rbx                      /* rbx is used as scratch, so save it */
|
|
movq %rdi,%rax                  /* rax = a */
|
|
movq %rdx,%rbx                  /* keep c in rbx: mulq overwrites rdx */
|
|
mulq %rsi                       /* rdx:rax = a*b */
|
|
addq %rbx,%rax                  /* add c to the low word */
|
|
adcq $0,%rdx                    /* carry into the high word */
|
|
|
|
movq %rcx,%rbx                  /* rbx = rp */
|
|
movq %rax,(%rbx)                /* *rp = low word */
|
|
movq %rdx,%rax                  /* return the high word */
|
|
popq %rbx                       /* restore rbx */
|
|
|
|
ret
|
|
|
|
.globl muldvd2
|
|
/* muldvd2(a,b,c,rp): computes a*b + *c + *rp; low word goes to *rp, high word to *c */
/* On entry: rdi=a, rsi=b, rdx=c (pointer), rcx=rp (pointer) */
muldvd2:
|
|
|
|
pushq %rbx                      /* rbx is used as scratch, so save it */
|
|
movq %rdi,%rax                  /* rax = a */
|
|
movq %rdx,%rbx                  /* keep pointer c in rbx: mulq overwrites rdx */
|
|
mulq %rsi                       /* rdx:rax = a*b */
|
|
addq (%rbx),%rax                /* low word += *c */
|
|
adcq $0,%rdx                    /* carry into the high word */
|
|
addq (%rcx),%rax                /* low word += *rp */
|
|
adcq $0,%rdx                    /* carry into the high word */
|
|
|
|
movq %rax,(%rcx)                /* *rp = low word */
|
|
movq %rdx,(%rbx)                /* *c = high word */
|
|
popq %rbx                       /* restore rbx */
|
|
|
|
ret
|
|
|
|
; Written by Ed Runnion with full rights granted to Shamus Software.
|
|
;
|
|
; An implementation of mrmuldv routines for miracl
|
|
; for ml64 assembler used by Microsoft Visual Studio (VC8) and X64 processor (AMD 64)
|
|
; X64 arguments are passed in RCX, RDX, R8, R9, Stack...
|
|
|
|
;/*
|
|
; * MIRACL compiler/hardware definitions - mirdef.h
|
|
; * Copyright (c) 1988-2006 Shamus Software Ltd.
|
|
; */
|
|
;#define MR_LITTLE_ENDIAN
|
|
;#define MIRACL 64
|
|
;#define mr_utype __int64
|
|
;#define mr_unsign64 unsigned __int64
|
|
;#define MR_IBITS 32
|
|
;#define MR_LBITS 32
|
|
;#define mr_unsign32 unsigned int
|
|
;#define MR_FLASH 52
|
|
;#define MAXBASE ((mr_small)1<<(MIRACL-1))
|
|
;#define MR_BITSINCHAR 8
|
|
|
|
.code
|
|
|
|
ALIGN 16
|
|
PUBLIC muldiv
|
|
; muldiv(a,b,c,m,rp): returns (a*b+c)/m in rax, stores the remainder in *rp
; On entry: rcx=a, rdx=b, r8=c, r9=m, fifth argument rp on the stack
muldiv PROC
|
|
|
|
mov rax,rcx                     ; rax = a
|
|
mul rdx                         ; rdx:rax = a*b
|
|
add rax,r8                      ; add c to the low word
|
|
adc rdx,0                       ; carry into the high word
|
|
div r9                          ; divide by m: rax = quotient, rdx = remainder
|
|
mov r10, QWORD PTR [rsp+28h]    ; r10 = rp, read from the stack
|
|
mov QWORD PTR[r10],rdx          ; *rp = remainder
|
|
|
|
ret
|
|
muldiv ENDP
|
|
|
|
ALIGN 16
|
|
PUBLIC muldvm
|
|
; muldvm(a,c,m,rp): returns (a*base+c)/m in rax, stores the remainder in *rp
; On entry: rcx=a, rdx=c, r8=m, r9=rp
muldvm PROC
|
|
|
|
mov rax,rdx                     ; low word of dividend = c
|
|
mov rdx,rcx                     ; high word of dividend = a
|
|
div r8                          ; divide by m: rax = quotient, rdx = remainder
|
|
mov QWORD PTR[r9],rdx           ; *rp = remainder
|
|
|
|
ret
|
|
muldvm ENDP
|
|
|
|
|
|
ALIGN 16
|
|
PUBLIC muldvd
|
|
; muldvd(a,b,c,rp): returns the high word of a*b+c in rax, stores the low word in *rp
; On entry: rcx=a, rdx=b, r8=c, r9=rp
muldvd PROC
|
|
|
|
mov rax,rcx                     ; rax = a
|
|
mul rdx                         ; rdx:rax = a*b
|
|
add rax,r8                      ; add c to the low word
|
|
adc rdx,0                       ; carry into the high word
|
|
mov QWORD PTR[r9],rax           ; *rp = low word
|
|
mov rax,rdx                     ; return the high word
|
|
|
|
ret
|
|
muldvd ENDP
|
|
|
|
ALIGN 16
|
|
PUBLIC muldvd2
|
|
; muldvd2(a,b,c,rp): computes a*b + *c + *rp; low word goes to *rp, high word to *c
; On entry: rcx=a, rdx=b, r8=c (pointer), r9=rp (pointer)
muldvd2 PROC
|
|
|
|
mov rax,rcx                     ; rax = a
|
|
mul rdx                         ; rdx:rax = a*b
|
|
add rax,QWORD PTR[r8]           ; low word += *c
|
|
adc rdx,0                       ; carry into the high word
|
|
add rax,QWORD PTR[r9]           ; low word += *rp
|
|
adc rdx,0                       ; carry into the high word
|
|
mov QWORD PTR[r9],rax           ; *rp = low word
|
|
mov QWORD PTR[r8],rdx           ; *c = high word
|
|
|
|
ret
|
|
muldvd2 ENDP
|
|
|
|
end
|
|
|
|
|
|
/* Win64 C version of mrmuldv.c, for 64-bit Visual Studio apps */
|
|
|
|
#include "miracl.h"
|
|
|
|
mr_small muldiv(mr_small a,mr_small b,mr_small c,mr_small m,mr_small *rp)
{ /* Blakely-Sloan: returns (a*b+c)/m and leaves (a*b+c)%m in *rp,
     without ever forming the double-length product */
    mr_small quot=0,rem=0;
    mr_small m_less_a=m-a;
    int pass;

/* One bit of work. The top bit of c is shifted into rem; then, if the top
   bit of b is set, a is added in. (mr_utype)x<0 is the top-bit test, which
   assumes mr_utype is the signed type underlying mr_small. rem is kept
   below m and every m removed is counted in quot. */
#define MR_MULDIV_BIT \
    rem<<=1; \
    if ((mr_utype)c<0) rem++; \
    c<<=1; \
    quot<<=1; \
    if ((mr_utype)b<0) \
    { \
        if (rem>=m) { rem-=m_less_a; quot++; } \
        else rem+=a; \
    } \
    if (rem>=m) { rem-=m; quot++; } \
    b<<=1;

    for (pass=MIRACL/4;pass>0;pass--)
    { /* four bits per pass - kept unrolled for speed */
        MR_MULDIV_BIT
        MR_MULDIV_BIT
        MR_MULDIV_BIT
        MR_MULDIV_BIT
    }
#undef MR_MULDIV_BIT

    *rp=rem;
    return quot;
}
|
|
|
|
mr_small muldvm(mr_small a,mr_small c,mr_small m,mr_small *rp)
{ /* modified Blakely-Sloan: returns (a*base+c)/m and leaves the
     remainder in *rp, where base is the full word size */
    register int pass,overflow;
    register mr_small quot=0,rem=a;

/* One bit of work. overflow remembers the bit about to be shifted out of
   the top of rem; the top bit of c is shifted in at the bottom. If the
   shifted value (including the lost bit) is at least m, subtract m and
   set the new quotient bit. */
#define MR_MULDVM_BIT \
    overflow=0; \
    if ((mr_utype)rem<0) overflow=1; \
    rem<<=1; \
    if ((mr_utype)c<0) rem++; \
    c<<=1; \
    quot<<=1; \
    if (overflow || rem>=m) { rem-=m; quot++; }

    for (pass=MIRACL/4;pass>0;pass--)
    { /* four bits per pass - kept unrolled for speed */
        MR_MULDVM_BIT
        MR_MULDVM_BIT
        MR_MULDVM_BIT
        MR_MULDVM_BIT
    }
#undef MR_MULDVM_BIT

    *rp=rem;
    return quot;
}
|
|
|
|
#ifndef MR_NOFULLWIDTH
|
|
|
|
/* These are now in-lined - see miracl.h */
|
|
|
|
/*
|
|
|
|
mr_small muldvd(mr_small a,mr_small b,mr_small c,mr_small *rp)
|
|
{
|
|
mr_small q,r;
|
|
r=_umul128(a,b,&q);
|
|
r+=c;
|
|
q+=(r<c);
|
|
*rp=r;
|
|
return q;
|
|
}
|
|
|
|
void muldvd2(mr_small a,mr_small b,mr_small *c,mr_small *rp)
|
|
{
|
|
mr_small q,r;
|
|
r=_umul128(a,b,&q);
|
|
r+=*c;
|
|
q+=(r<*c);
|
|
r+=*rp;
|
|
q+=(r<*rp);
|
|
*rp=r;
|
|
*c=q;
|
|
}
|
|
*/
|
|
#endif
|
|
|
|
|