optimized byte multiplications

This commit is contained in:
Irmen de Jong 2019-01-11 02:35:57 +01:00
parent 487faf3a08
commit 2a08c22b0f
5 changed files with 424 additions and 37 deletions

View File

@ -1254,4 +1254,195 @@ _magiceors .word $3f1d, $3f81, $3fa5, $3fc5, $4075, $409d, $40cd, $4109
}}
}
%asm {{
; Multiplies the byte at c64.ESTACK_LO+1,x by 3, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator as working register. In the formula comment below,
; "X" denotes the operand value, not the x index register.
mul_byte_3 .proc
; X + X*2
lda c64.ESTACK_LO+1,x
asl a                   ; A = value*2
clc                     ; discard the bit shifted out so adc adds exactly the value
adc c64.ESTACK_LO+1,x
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 5, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator as working register. "X" in the formula comment
; denotes the operand value, not the x index register.
mul_byte_5 .proc
; X + X*4
lda c64.ESTACK_LO+1,x
asl a
asl a                   ; A = value*4
clc                     ; discard the bit shifted out so adc adds exactly the value
adc c64.ESTACK_LO+1,x
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 6, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator and overwrites c64.SCRATCH_ZPREG. "X" in the formula
; comment denotes the operand value, not the x index register.
mul_byte_6 .proc
; X*2 + X*4
lda c64.ESTACK_LO+1,x
asl a
sta c64.SCRATCH_ZPREG   ; keep value*2 for the final addition
asl a                   ; A = value*4
clc                     ; discard the bit shifted out before adding
adc c64.SCRATCH_ZPREG
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 7, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator as working register. "X" in the formula comment
; denotes the operand value, not the x index register.
mul_byte_7 .proc
; X*8 - X
lda c64.ESTACK_LO+1,x
asl a
asl a
asl a                   ; A = value*8
sec                     ; set carry = "no borrow" so sbc subtracts exactly the value
sbc c64.ESTACK_LO+1,x
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 9, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator as working register. "X" in the formula comment
; denotes the operand value, not the x index register.
mul_byte_9 .proc
; X + X*8
lda c64.ESTACK_LO+1,x
asl a
asl a
asl a                   ; A = value*8
clc                     ; discard the bit shifted out so adc adds exactly the value
adc c64.ESTACK_LO+1,x
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 10, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored
; (which is also the correct two's complement result for signed bytes).
; Uses the accumulator as working register. "X" in the formula comment
; denotes the operand value, not the x index register.
mul_byte_10 .proc
; (X*4 + X) * 2
; Only a single addition is performed: chaining two adc instructions would
; let the carry of the first one leak into the second and make the result
; one too high whenever the intermediate sum wraps (e.g. for negative values).
lda c64.ESTACK_LO+1,x
asl a
asl a                   ; A = value*4
clc                     ; discard the bit shifted out so adc adds exactly the value
adc c64.ESTACK_LO+1,x   ; A = value*5
asl a                   ; A = value*10
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 11, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored
; (which is also the correct two's complement result for signed bytes).
; Uses the accumulator and overwrites c64.SCRATCH_ZPREG. "X" in the formula
; comment denotes the operand value, not the x index register.
mul_byte_11 .proc
; X + X + X + X*8
lda c64.ESTACK_LO+1,x
sta c64.SCRATCH_ZPREG   ; keep the original value for the additions
asl a
asl a
asl a                   ; A = value*8
; The carry must be cleared before EVERY adc: an addition that wraps past 255
; sets the carry, which would otherwise be added into the next adc as an extra 1.
clc
adc c64.SCRATCH_ZPREG
clc
adc c64.SCRATCH_ZPREG
clc
adc c64.SCRATCH_ZPREG
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 12, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator and overwrites c64.SCRATCH_ZPREG. "X" in the formula
; comment denotes the operand value, not the x index register.
mul_byte_12 .proc
; X*4 + X*8
lda c64.ESTACK_LO+1,x
asl a
asl a
sta c64.SCRATCH_ZPREG   ; keep value*4 for the final addition
asl a                   ; A = value*8
clc                     ; discard the bit shifted out before adding
adc c64.SCRATCH_ZPREG
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 13, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored
; (which is also the correct two's complement result for signed bytes).
; Uses the accumulator and overwrites c64.SCRATCH_ZPREG. "X" in the formula
; comment denotes the operand value, not the x index register.
mul_byte_13 .proc
; X*16 - X -X -X
lda c64.ESTACK_LO+1,x
sta c64.SCRATCH_ZPREG   ; keep the original value for the subtractions
asl a
asl a
asl a
asl a                   ; A = value*16
; The carry must be set before EVERY sbc: a subtraction that wraps below 0
; clears the carry (borrow), which would otherwise make the next sbc subtract
; an extra 1.
sec
sbc c64.SCRATCH_ZPREG
sec
sbc c64.SCRATCH_ZPREG
sec
sbc c64.SCRATCH_ZPREG
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 14, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored
; (which is also the correct two's complement result for signed bytes).
; Uses the accumulator as working register. "X" in the formula comment
; denotes the operand value, not the x index register.
mul_byte_14 .proc
; (X*8 - X) * 2
; Only a single subtraction is performed: chaining two sbc instructions would
; let the borrow of the first one leak into the second and make the result
; one too low whenever the intermediate difference wraps.
lda c64.ESTACK_LO+1,x
asl a
asl a
asl a                   ; A = value*8
sec                     ; set carry = "no borrow" so sbc subtracts exactly the value
sbc c64.ESTACK_LO+1,x   ; A = value*7
asl a                   ; A = value*14
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 15, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator as working register; no scratch location is needed
; because the operand is re-read from the stack slot for the subtraction.
; "X" in the formula comment denotes the operand value, not the x index register.
mul_byte_15 .proc
; X*16 - X
lda c64.ESTACK_LO+1,x
asl a
asl a
asl a
asl a                   ; A = value*16
sec                     ; set carry = "no borrow" so sbc subtracts exactly the value
sbc c64.ESTACK_LO+1,x
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 20, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator and overwrites c64.SCRATCH_ZPREG. "X" in the formula
; comment denotes the operand value, not the x index register.
mul_byte_20 .proc
; X*4 + X*16
lda c64.ESTACK_LO+1,x
asl a
asl a
sta c64.SCRATCH_ZPREG   ; keep value*4 for the final addition
asl a
asl a                   ; A = value*16
clc                     ; discard the bit shifted out before adding
adc c64.SCRATCH_ZPREG
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 25, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored
; (which is also the correct two's complement result for signed bytes).
; Uses the accumulator and overwrites c64.SCRATCH_ZPREG. "X" in the formula
; comment denotes the operand value, not the x index register.
mul_byte_25 .proc
; X + X*8 + X*16
lda c64.ESTACK_LO+1,x
asl a
asl a
asl a
sta c64.SCRATCH_ZPREG   ; keep value*8
asl a                   ; A = value*16
clc                     ; discard the bit shifted out before adding
adc c64.SCRATCH_ZPREG   ; A = value*24
clc                     ; the previous adc may have wrapped and set the carry; clear it
                        ; so it is not added as an extra 1
adc c64.ESTACK_LO+1,x   ; A = value*25
sta c64.ESTACK_LO+1,x
rts
.pend
; Multiplies the byte at c64.ESTACK_LO+1,x by 40, in place.
; All arithmetic is 8-bit, so only the low byte of the product is stored.
; Uses the accumulator and overwrites c64.SCRATCH_ZPREG. "X" in the formula
; comment denotes the operand value, not the x index register.
mul_byte_40 .proc
; X*8 + X*32
lda c64.ESTACK_LO+1,x
asl a
asl a
asl a
sta c64.SCRATCH_ZPREG   ; keep value*8 for the final addition
asl a
asl a                   ; A = value*32
clc                     ; discard the bit shifted out before adding
adc c64.SCRATCH_ZPREG
sta c64.ESTACK_LO+1,x
rts
.pend
}}
}

View File

@ -807,9 +807,9 @@ class AsmGen(val options: CompilationOptions, val program: IntermediateProgram,
Opcode.SUB_W, Opcode.SUB_UW -> " jsr prog8_lib.sub_w"
Opcode.MUL_B, Opcode.MUL_UB -> " jsr prog8_lib.mul_byte"
Opcode.MUL_W, Opcode.MUL_UW -> " jsr prog8_lib.mul_word"
Opcode.MUL_F -> " jsr c64flt.mul_f"
Opcode.ADD_F -> " jsr c64flt.add_f"
Opcode.SUB_F -> " jsr c64flt.sub_f"
Opcode.MUL_F -> " jsr c64flt.mul_f"
Opcode.DIV_F -> " jsr c64flt.div_f"
Opcode.IDIV_UB -> " jsr prog8_lib.idiv_ub"
Opcode.IDIV_B -> " jsr prog8_lib.idiv_b"
@ -877,6 +877,40 @@ class AsmGen(val options: CompilationOptions, val program: IntermediateProgram,
}
}
/**
 * Selects a specialised multiplication routine for multiplying the value on top of
 * the evaluation stack by the constant [amount].
 *
 * Only byte (MUL_B / MUL_UB) and word (MUL_W / MUL_UW) multiply instructions are
 * considered. For the signed opcodes a negative [amount] is handled by negating the
 * value first and then multiplying by the positive factor.
 *
 * NOTE(review): the `math.mul_word_N` routines referenced here are not visible in this
 * part of the change - confirm they exist in the library.
 *
 * @param mulIns the multiply instruction; only its opcode is inspected
 * @param amount the constant factor
 * @return the assembly fragment to emit, or null when no specialised routine applies
 * @throws AssemblyError when [amount] is one of the factors that should already have
 *         been rewritten into a shift
 */
private fun optimizedIntMultiplicationsOnStack(mulIns: Instruction, amount: Int): String? {
    val shiftFactors = setOf(0, 1, 2, 4, 8, 16, 32, 64, 128, 256)
    val routineFactors = setOf(3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 20, 25, 40)

    val opcode = mulIns.opcode
    val onBytes = opcode == Opcode.MUL_B || opcode == Opcode.MUL_UB
    val onWords = opcode == Opcode.MUL_W || opcode == Opcode.MUL_UW
    if(!onBytes && !onWords)
        return null

    if(amount in shiftFactors)
        throw AssemblyError("multiplication by power of 2 should have been converted into a left shift instruction already")

    val mulRoutine = if(onBytes) "math.mul_byte_" else "math.mul_word_"
    if(amount in routineFactors)
        return " jsr $mulRoutine$amount"

    // negative factors can only be optimized for the signed variants
    val signed = opcode == Opcode.MUL_B || opcode == Opcode.MUL_W
    if(signed && amount < 0 && -amount in routineFactors) {
        val negRoutine = if(onBytes) "prog8_lib.neg_b" else "prog8_lib.neg_w"
        return " jsr $negRoutine | jsr $mulRoutine${-amount}"
    }
    return null
}
private fun findPatterns(segment: List<Instruction>): List<AsmFragment> {
val opcodes = segment.map { it.opcode }
val result = mutableListOf<AsmFragment>()
@ -3124,6 +3158,22 @@ class AsmGen(val options: CompilationOptions, val program: IntermediateProgram,
lda #0
+
"""
},
// various optimizable integer multiplications
// Byte case: a constant byte push followed by a signed or unsigned byte multiply.
// Uses the specialised routine picked by optimizedIntMultiplicationsOnStack when there is one;
// otherwise stores the constant at ESTACK_LO,x, decrements x and calls the generic prog8_lib.mul_byte.
AsmPattern(listOf(Opcode.PUSH_BYTE, Opcode.MUL_B), listOf(Opcode.PUSH_BYTE, Opcode.MUL_UB)) { segment ->
// segment[0] is taken to be the push (its argument is the constant factor), segment[1] the multiply
val amount=segment[0].arg!!.integerValue()
val result = optimizedIntMultiplicationsOnStack(segment[1], amount)
result ?: " lda #${hexVal(segment[0])} | sta ${ESTACK_LO.toHex()},x | dex | jsr prog8_lib.mul_byte"
},
// Word case: a constant word push followed by a signed or unsigned word multiply.
// Uses the specialised routine picked by optimizedIntMultiplicationsOnStack when there is one;
// otherwise stores the constant's low/high bytes at ESTACK_LO,x / ESTACK_HI,x, decrements x
// and calls the generic prog8_lib.mul_word.
AsmPattern(listOf(Opcode.PUSH_WORD, Opcode.MUL_W), listOf(Opcode.PUSH_WORD, Opcode.MUL_UW)) { segment ->
// segment[0] is taken to be the push (its argument is the constant factor), segment[1] the multiply
val amount=segment[0].arg!!.integerValue()
val result = optimizedIntMultiplicationsOnStack(segment[1], amount)
if (result != null) result else {
val value = hexVal(segment[0])
" lda #<$value | sta ${ESTACK_LO.toHex()},x | lda #>$value | sta ${ESTACK_HI.toHex()},x | dex | jsr prog8_lib.mul_word"
}
}
)

View File

@ -17,22 +17,6 @@ import kotlin.math.log2
X % 2 -> X and 1 (if X is byte/word)
todo often used multiplications to factors that are more efficiently calculated (via shifts)
X*3 -> X*2+X
X*5 -> X*4+X
X*6 -> X*2+X*2+X*2
X*7 -> X*4+X*2+X
X*9 -> X*8 + X
X*10 -> X*8 + X*2
X*11 -> X*8 + X*2 +X
X*12 -> X*8 + X*4
X*13 -> X*8 + X*4 +X
X*14 -> X*8 + X*4 + X*2
X*15 -> X*8 + X*4 + X*2 + X
(and negatives)
todo expression optimization: common (sub) expression elimination (turn common expressions into single subroutine call + introduce variable to hold it)
*/
@ -396,6 +380,7 @@ class SimplifyExpressions(private val namespace: INameScope, private val heap: H
// divided by a power of two => shift right
optimizationsDone++
val numshifts = log2(cv)
println("DIV: SHIFT RIGHT $cv -> $numshifts") // TODO
return BinaryExpression(expr.left, ">>", LiteralValue.optimalInteger(numshifts, expr.position), expr.position)
}
}
@ -404,6 +389,7 @@ class SimplifyExpressions(private val namespace: INameScope, private val heap: H
// divided by a negative power of two => negate, then shift right
optimizationsDone++
val numshifts = log2(-cv)
println("DIV: SHIFT RIGHT $cv -> $numshifts") // TODO
return BinaryExpression(PrefixExpression("-", expr.left, expr.position), ">>", LiteralValue.optimalInteger(numshifts, expr.position), expr.position)
}
}
@ -467,7 +453,7 @@ class SimplifyExpressions(private val namespace: INameScope, private val heap: H
if(leftValue.resultingDatatype(namespace, heap) in IntegerDatatypes) {
// times a power of two => shift left
optimizationsDone++
val numshifts = log2(cv)
val numshifts = log2(cv).toInt()
return BinaryExpression(expr.left, "<<", LiteralValue.optimalInteger(numshifts, expr.position), expr.position)
}
}
@ -475,7 +461,7 @@ class SimplifyExpressions(private val namespace: INameScope, private val heap: H
if(leftValue.resultingDatatype(namespace, heap) in IntegerDatatypes) {
// times a negative power of two => negate, then shift left
optimizationsDone++
val numshifts = log2(-cv)
val numshifts = log2(-cv).toInt()
return BinaryExpression(PrefixExpression("-", expr.left, expr.position), "<<", LiteralValue.optimalInteger(numshifts, expr.position), expr.position)
}
}

View File

@ -4,27 +4,186 @@
; Test program: prints the products of an unsigned byte (i) and a signed byte (j)
; with a series of constant factors (1 to 25, and 40), one result per line.
; NOTE(review): in this view the variables i, j, uw and w are declared twice (once
; without and once with an initial value) and there are for-loops over i and j in
; between the multiplications - this looks like removed and added diff lines shown
; together; confirm against the actual file.
sub start() {
ubyte i
byte j
uword uw
word w
ubyte i = 10
ubyte ub2
byte j = 5
byte b2
uword uw = 1000
uword uw2
word w = 1000
word w2
for i in 5 to 0 step -1 {
c64scr.print_ub(i)
c64.CHROUT('\n')
}
; unsigned byte: i times 1 up to 25
ub2=i*1
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*2
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*3
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*4
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*5
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*6
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*7
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*8
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*9
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*10
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*11
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*12
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*13
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*14
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*15
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*16
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*17
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*18
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*19
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*20
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*21
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*22
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*23
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*24
c64scr.print_ub(ub2)
c64.CHROUT('\n')
ub2=i*25
c64scr.print_ub(ub2)
c64.CHROUT('\n')
for j in 5 to 0 step -1 {
c64scr.print_b(j)
c64.CHROUT('\n')
}
; i is set to 5 here; 5*40 = 200, which still fits in an unsigned byte
i=5
ub2=i*40
c64scr.print_ub(ub2)
c64.CHROUT('\n')
for j in -5 to 0 {
c64scr.print_b(j)
c64.CHROUT('\n')
}
c64.CHROUT('\n')
; signed byte: j times 1 up to 25
b2=j*1
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*2
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*3
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*4
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*5
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*6
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*7
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*8
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*9
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*10
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*11
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*12
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*13
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*14
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*15
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*16
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*17
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*18
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*19
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*20
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*21
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*22
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*23
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*24
c64scr.print_b(b2)
c64.CHROUT('\n')
b2=j*25
c64scr.print_b(b2)
c64.CHROUT('\n')
; j is set to 3 here; 3*40 = 120, which still fits in a signed byte
j=3
b2=j*40
c64scr.print_b(b2)
c64.CHROUT('\n')
c64.CHROUT('\n')
;@todo multiplication by negative values
;@todo the same, for uword and word
}
}

View File

@ -37,16 +37,17 @@
c64.SPRPTR[i] = $0a00/64
}
c64.SPENA = 255 ; enable all sprites
c64utils.set_rasterirq(270) ; enable animation
c64utils.set_rasterirq(220) ; enable animation
}
}
~ irq {
ubyte angle=0
sub irq() {
ubyte angle ; no initialization value so it keeps the previous one.
c64.EXTCOL--
angle++