// Source: kickc/src/main/kc/lib/fastmultiply.c

// Library Implementation of the Seriously Fast Multiplication
// See http://codebase64.org/doku.php?id=base:seriously_fast_multiplication
// Utilizes the fact that a*b = ((a+b)/2)^2 - ((a-b)/2)^2
#include <fastmultiply.h>
// mulf_sqr tables will contain f(x)=int(x*x/4) and g(x) = f(x-255).
// All four tables are page-aligned (__align(0x100)), so their base addresses have a
// low byte of $00. The multiply routines exploit this with self-modifying code:
// patching only the LOW byte of an absolute "lda table,x" operand (see
// mulf8u_prepare/mulf16u) turns the instruction into a lookup of table[a+x].
// The tables must be filled by calling mulf_init() before any multiply routine is used.
// <f(x) = <(( x * x )/4)
char __align(0x100) mulf_sqr1_lo[512];
// >f(x) = >(( x * x )/4)
char __align(0x100) mulf_sqr1_hi[512];
// <g(x) = <((( x - 255) * ( x - 255 ))/4)
char __align(0x100) mulf_sqr2_lo[512];
// >g(x) = >((( x - 255) * ( x - 255 ))/4)
char __align(0x100) mulf_sqr2_hi[512];
// Initialize the mulf_sqr multiplication tables with f(x)=int(x*x/4)
// Must be called once before using any of the mulf* multiply routines.
// Entries 1..511 of mulf_sqr1 are written explicitly; entry 0 (f(0)=0) is never
// written and is assumed to be zero-initialized — TODO confirm KickC zero-fills globals.
void mulf_init() {
// Fill mulf_sqr1 = f(x) = int(x*x/4): If f(x) = x*x/4 then f(x+1) = f(x) + x/2 + 1/4
// so f can be built incrementally without any multiplication.
unsigned int sqr = 0; // sqr = (x*x)/4 - running value of f(x), updated incrementally
char x_2 = 0; // x/2 - incremented on every second iteration
char c = 0; // Counter used for determining x%2==0
char* sqr1_hi = mulf_sqr1_hi+1; // high-byte cursor, kept in lockstep with sqr1_lo
for(char* sqr1_lo = mulf_sqr1_lo+1; sqr1_lo!=mulf_sqr1_lo+512; sqr1_lo++) {
if((++c&1)==0) {
x_2++; // increase i/2 on even numbers
sqr++; // sqr++ on even numbers because 1 = 2*1/4 (from the two previous numbers) + 1/2 (half of the previous uneven number)
}
*sqr1_lo = BYTE0(sqr); // low byte of f(x)
*sqr1_hi++ = BYTE1(sqr); // high byte of f(x)
sqr = sqr + x_2; // sqr = sqr + i/2 (when uneven the 1/2 is not added here - see above)
}
// Fill mulf_sqr2 = g(x) = f(x-255) : If x-255<0 then g(x)=f(255-x) (because x*x = -x*-x)
// g(0) = f(255), g(1) = f(254), ..., g(254) = f(1), g(255) = f(0), g(256) = f(1), ..., g(510) = f(255), g(511) = f(256)
char x_255 = (char)-1; //Start with g(0)=f(255) - (char)-1 wraps to 255
char dir = 0xff; // Decrease or increase x_255 - initially we decrease (adding 0xff is -1 under char wraparound)
char* sqr2_hi = mulf_sqr2_hi;
for(char* sqr2_lo = mulf_sqr2_lo; sqr2_lo!=mulf_sqr2_lo+511; sqr2_lo++) {
*sqr2_lo = mulf_sqr1_lo[x_255];
*sqr2_hi++ = mulf_sqr1_hi[x_255];
x_255 = x_255 + dir;
if(x_255==0) {
dir = 1; // when x_255=0 then start counting up
}
}
// Set the very last value g(511) = f(256) by hand - the loop cannot reach it
// because its char index x_255 can never hold 256
*(mulf_sqr2_lo+511) = *(mulf_sqr1_lo+256);
*(mulf_sqr2_hi+511) = *(mulf_sqr1_hi+256);
}
// Prepare for fast multiply with an unsigned char to a unsigned int result
// Patches the operand of the self-modifying instructions in mulf8u_prepared:
// sm1/sm3 get a (selecting f(a+x) in the page-aligned f tables) and sm2/sm4 get
// a^$ff = 255-a (selecting g((255-a)+x) = f(x-a) in the g tables).
// Also stashes a in zeropage $fd, which mulf8s_prepared later reads for sign correction.
void mulf8u_prepare(char a) {
char* const memA = (char*)0xfd; // zeropage scratch holding a
*memA = a;
asm {
lda memA
sta mulf8u_prepared.sm1+1
sta mulf8u_prepared.sm3+1
eor #$ff
sta mulf8u_prepared.sm2+1
sta mulf8u_prepared.sm4+1
}
}
// Calculate fast multiply with a prepared unsigned char to a unsigned int result
// The prepared number is set by calling mulf8u_prepare(char a)
// Computes a*b = f(a+b) - f(b-a) where f(x)=int(x*x/4), using the table lookups
// whose operand low bytes were patched by mulf8u_prepare (labels sm1..sm4).
// The sm1/sm2 pair produces the low byte, sm3/sm4 the high byte (16-bit subtract
// via the shared carry). Clobbers zeropage $fe/$ff.
unsigned int mulf8u_prepared(char b) {
char* const resL = (char*)0xfe; // zeropage scratch: low byte of the product
char* const memB = (char*)0xff; // zeropage scratch: b on entry, product high byte on exit
*memB = b;
asm {
ldx memB
sec
sm1:
lda mulf_sqr1_lo,x
sm2:
sbc mulf_sqr2_lo,x
sta resL
sm3:
lda mulf_sqr1_hi,x
sm4:
sbc mulf_sqr2_hi,x
sta memB
}
return MAKEWORD( *memB, *resL ); // MAKEWORD(high byte, low byte)
}
// Fast multiply two unsigned chars to a unsigned int result
// Convenience wrapper: patches the SMC with a, then multiplies by b.
// Prefer calling mulf8u_prepare(a) once and mulf8u_prepared(b) repeatedly
// when multiplying the same a by many different b values.
unsigned int mulf8u(char a, char b) {
mulf8u_prepare(a);
return mulf8u_prepared(b);
}
// Prepare for fast multiply with a signed char to a signed int result
// Simply reinterprets the bits as unsigned; mulf8s_prepared performs the
// two's-complement sign correction afterwards.
inline void mulf8s_prepare(signed char a) {
mulf8u_prepare((char)a);
}
// Calculate fast multiply with a prepared signed char to a signed int result
// The prepared number is set by calling mulf8s_prepare(signed char a)
// Starts from the unsigned product and corrects it: a negative operand read as
// unsigned is too large by 256, inflating the product by 256*(other operand),
// so the other operand is subtracted from the product's high byte.
signed int mulf8s_prepared(signed char b) {
signed char* const memA = (signed char*)0xfd; // zeropage where mulf8u_prepare stashed a
unsigned int m = mulf8u_prepared((char) b);
if(*memA<0) {
BYTE1(m) = BYTE1(m)-(char)b; // a<0: unsigned product was a*b + 256*b
}
if(b<0) {
BYTE1(m) = BYTE1(m)-(char)*memA; // b<0: unsigned product was also + 256*a
}
return (signed int)m;
}
// Fast multiply two signed chars to a signed int result
// Convenience wrapper around mulf8s_prepare/mulf8s_prepared.
signed int mulf8s(signed char a, signed char b) {
mulf8s_prepare(a);
return mulf8s_prepared(b);
}
// Fast multiply two unsigned ints to a double unsigned int result
// Done in assembler to utilize fast addition A+X
// Splits a = aH:aL and b = bH:bL and combines four 8x8 table products:
//   a*b = aL*bL + ((aL*bH + aH*bL)<<8) + (aH*bH<<16)
// The prologue patches aL (and aL^$ff) into the sm?a operand low bytes and
// aH (and aH^$ff) into the sm?b operand low bytes; each product is then a
// f(x+y)-f(y-x) table subtract with X = bL or X = bH (same scheme as mulf8u).
// Partial product bytes are parked in the immediate operands of the _AA.._dd
// instructions and summed at the end, propagating carries into memR+3.
// Uses zeropage $f8-$ff as scratch; mulf_init() must have been called first.
unsigned long mulf16u(unsigned int a, unsigned int b) {
unsigned int* const memA = (unsigned int*)0xf8; // zeropage copy of a (aL at $f8, aH at $f9)
unsigned int* const memB = (unsigned int*)0xfa; // zeropage copy of b (bL at $fa, bH at $fb)
unsigned long* const memR = (unsigned long*)0xfc; // zeropage 32-bit result accumulator
*memA = a;
*memB = b;
asm {
lda memA
sta sm1a+1
sta sm3a+1
sta sm5a+1
sta sm7a+1
eor #$ff
sta sm2a+1
sta sm4a+1
sta sm6a+1
sta sm8a+1
lda memA+1
sta sm1b+1
sta sm3b+1
sta sm5b+1
sta sm7b+1
eor #$ff
sta sm2b+1
sta sm4b+1
sta sm6b+1
sta sm8b+1
ldx memB
sec
sm1a: lda mulf_sqr1_lo,x
sm2a: sbc mulf_sqr2_lo,x
sta memR+0
sm3a: lda mulf_sqr1_hi,x
sm4a: sbc mulf_sqr2_hi,x
sta _AA+1
sec
sm1b: lda mulf_sqr1_lo,x
sm2b: sbc mulf_sqr2_lo,x
sta _cc+1
sm3b: lda mulf_sqr1_hi,x
sm4b: sbc mulf_sqr2_hi,x
sta _CC+1
ldx memB+1
sec
sm5a: lda mulf_sqr1_lo,x
sm6a: sbc mulf_sqr2_lo,x
sta _bb+1
sm7a: lda mulf_sqr1_hi,x
sm8a: sbc mulf_sqr2_hi,x
sta _BB+1
sec
sm5b: lda mulf_sqr1_lo,x
sm6b: sbc mulf_sqr2_lo,x
sta _dd+1
sm7b: lda mulf_sqr1_hi,x
sm8b: sbc mulf_sqr2_hi,x
sta memR+3
clc
_AA: lda #0
_bb: adc #0
sta memR+1
_BB: lda #0
_CC: adc #0
sta memR+2
bcc !+
inc memR+3
clc
!:
_cc: lda #0
adc memR+1
sta memR+1
_dd: lda #0
adc memR+2
sta memR+2
bcc !+
inc memR+3
!:
}
return *memR;
}
// Fast multiply two signed ints to a signed double int result
// Fixes offsets introduced by using unsigned multiplication: each negative
// operand, reinterpreted as unsigned, is too large by 2^16, inflating the
// product by (other operand)<<16 - so the other operand is subtracted from
// the high word (same scheme as the 8-bit case in mulf8s_prepared).
signed long mulf16s(signed int a, signed int b) {
unsigned long m = mulf16u((unsigned int)a, (unsigned int)b);
if(a<0) {
WORD1(m) = WORD1(m)-(unsigned int)b; // remove the 2^16*b offset
}
if(b<0) {
WORD1(m) = WORD1(m)-(unsigned int)a; // remove the 2^16*a offset
}
return (signed long)m;
}