From 2a08c22b0f6dec70f8092291b889c88173ba5fcc Mon Sep 17 00:00:00 2001 From: Irmen de Jong Date: Fri, 11 Jan 2019 02:35:57 +0100 Subject: [PATCH] optimized byte multiplications --- compiler/res/prog8lib/prog8lib.p8 | 191 ++++++++++++++++++ .../src/prog8/compiler/target/c64/AsmGen.kt | 52 ++++- .../prog8/optimizing/SimplifyExpressions.kt | 22 +- examples/test.p8 | 191 ++++++++++++++++-- examples/wizzine.p8 | 5 +- 5 files changed, 424 insertions(+), 37 deletions(-) diff --git a/compiler/res/prog8lib/prog8lib.p8 b/compiler/res/prog8lib/prog8lib.p8 index ae4886373..4f1949ae1 100644 --- a/compiler/res/prog8lib/prog8lib.p8 +++ b/compiler/res/prog8lib/prog8lib.p8 @@ -1254,4 +1254,195 @@ _magiceors .word $3f1d, $3f81, $3fa5, $3fc5, $4075, $409d, $40cd, $4109 }} } + +%asm {{ + +mul_byte_3 .proc + ; X + X*2 + lda c64.ESTACK_LO+1,x + asl a + clc + adc c64.ESTACK_LO+1,x + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_5 .proc + ; X + X*4 + lda c64.ESTACK_LO+1,x + asl a + asl a + clc + adc c64.ESTACK_LO+1,x + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_6 .proc + ; X*2 + X*4 + lda c64.ESTACK_LO+1,x + asl a + sta c64.SCRATCH_ZPREG + asl a + clc + adc c64.SCRATCH_ZPREG + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_7 .proc + ; X*8 - X + lda c64.ESTACK_LO+1,x + asl a + asl a + asl a + sec + sbc c64.ESTACK_LO+1,x + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_9 .proc + ; X + X*8 + lda c64.ESTACK_LO+1,x + asl a + asl a + asl a + clc + adc c64.ESTACK_LO+1,x + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_10 .proc + ; (X + X*4)*2 + ; only one adc, so no carry can leak into a following addition + lda c64.ESTACK_LO+1,x + asl a + asl a + clc + adc c64.ESTACK_LO+1,x + asl a + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_11 .proc + ; X + (X + X*4)*2 + ; carry is cleared before every adc so an overflow cannot leak into the next addition + lda c64.ESTACK_LO+1,x + asl a + asl a + clc + adc c64.ESTACK_LO+1,x + asl a + clc + adc c64.ESTACK_LO+1,x + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_12 .proc + ; X*4 + X*8 + lda c64.ESTACK_LO+1,x + asl a + asl a + sta 
c64.SCRATCH_ZPREG + asl a + clc + adc c64.SCRATCH_ZPREG + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_13 .proc + ; X + (X + X*2)*4 + ; carry is cleared before every adc so an overflow + ; of one addition cannot leak into the next one + lda c64.ESTACK_LO+1,x + asl a + clc + adc c64.ESTACK_LO+1,x + asl a + asl a + clc + adc c64.ESTACK_LO+1,x + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_14 .proc + ; (X*8 - X)*2 + ; only one sbc, so no borrow can leak into a following subtraction + lda c64.ESTACK_LO+1,x + asl a + asl a + asl a + sec + sbc c64.ESTACK_LO+1,x + asl a + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_15 .proc + ; X*16 - X + ; the original value is still on the stack, so no scratch copy is needed + lda c64.ESTACK_LO+1,x + asl a + asl a + asl a + asl a + sec + sbc c64.ESTACK_LO+1,x + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_20 .proc + ; X*4 + X*16 + lda c64.ESTACK_LO+1,x + asl a + asl a + sta c64.SCRATCH_ZPREG + asl a + asl a + clc + adc c64.SCRATCH_ZPREG + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_25 .proc + ; X + (X + X*2)*8, carry cleared before every adc + lda c64.ESTACK_LO+1,x + asl a + clc + adc c64.ESTACK_LO+1,x + asl a + asl a + asl a + clc + adc c64.ESTACK_LO+1,x + sta c64.ESTACK_LO+1,x + rts + .pend + +mul_byte_40 .proc + ; X*8 + X*32 + lda c64.ESTACK_LO+1,x + asl a + asl a + asl a + sta c64.SCRATCH_ZPREG + asl a + asl a + clc + adc c64.SCRATCH_ZPREG + sta c64.ESTACK_LO+1,x + rts + .pend + +}} + } diff --git a/compiler/src/prog8/compiler/target/c64/AsmGen.kt b/compiler/src/prog8/compiler/target/c64/AsmGen.kt index ff35fe0f0..3fd200eb5 100644 --- a/compiler/src/prog8/compiler/target/c64/AsmGen.kt +++ b/compiler/src/prog8/compiler/target/c64/AsmGen.kt @@ -807,9 +807,9 @@ class AsmGen(val options: CompilationOptions, val program: IntermediateProgram, Opcode.SUB_W, Opcode.SUB_UW -> " jsr prog8_lib.sub_w" Opcode.MUL_B, Opcode.MUL_UB -> " jsr prog8_lib.mul_byte" Opcode.MUL_W, Opcode.MUL_UW -> " jsr prog8_lib.mul_word" + Opcode.MUL_F -> " jsr c64flt.mul_f" Opcode.ADD_F -> " jsr c64flt.add_f" Opcode.SUB_F -> " jsr c64flt.sub_f" - Opcode.MUL_F -> " jsr c64flt.mul_f" Opcode.DIV_F -> " jsr 
c64flt.div_f" Opcode.IDIV_UB -> " jsr prog8_lib.idiv_ub" Opcode.IDIV_B -> " jsr prog8_lib.idiv_b" @@ -877,6 +877,40 @@ class AsmGen(val options: CompilationOptions, val program: IntermediateProgram, } } + /** + * Selects the dedicated library routine for multiplying the value on the evaluation stack by the constant [amount]. + * + * Negative constants are only handled for the signed opcodes: the value is negated first and then multiplied by the absolute value of the constant. + * + * NOTE(review): this patch only adds mul_byte_N routines; confirm that the mul_word_N ones exist. + * + * @param mulIns the multiplication instruction; only its opcode is inspected + * @param amount the constant factor + * @return the assembly to emit, or null if no dedicated routine applies + * @throws AssemblyError for the factors 0, 1 and the powers of two up to 256 + */ + private fun optimizedIntMultiplicationsOnStack(mulIns: Instruction, amount: Int): String? { + val opcode = mulIns.opcode + val size = when(opcode) { + Opcode.MUL_B, Opcode.MUL_UB -> "byte" + Opcode.MUL_W, Opcode.MUL_UW -> "word" + else -> return null + } + // negative factors are only supported for the signed opcodes + val signed = opcode == Opcode.MUL_B || opcode == Opcode.MUL_W + val negate = if(size == "byte") "neg_b" else "neg_w" + + // the factor lists are identical for bytes and words; only the routine names differ + return when(amount) { + 0,1,2,4,8,16,32,64,128,256 -> throw AssemblyError("multiplication by power of 2 should have been converted into a left shift instruction already") + 3,5,6,7,9,10,11,12,13,14,15,20,25,40 -> " jsr math.mul_${size}_$amount" + -3,-5,-6,-7,-9,-10,-11,-12,-13,-14,-15,-20,-25,-40 -> { + if(signed) " jsr prog8_lib.$negate | jsr math.mul_${size}_${-amount}" else null + } + else -> null + } + } + private fun findPatterns(segment: List): List { val opcodes = segment.map { it.opcode } val result = mutableListOf() @@ -3124,6 +3158,22 @@ class AsmGen(val options: CompilationOptions, val program: IntermediateProgram, lda #0 + """ + }, + + + // various optimizable integer multiplications + AsmPattern(listOf(Opcode.PUSH_BYTE, Opcode.MUL_B), listOf(Opcode.PUSH_BYTE, Opcode.MUL_UB)) { segment -> + val amount=segment[0].arg!!.integerValue() + val result = optimizedIntMultiplicationsOnStack(segment[1], amount) + result ?: " lda #${hexVal(segment[0])} | sta ${ESTACK_LO.toHex()},x | dex | jsr 
prog8_lib.mul_byte" + }, + AsmPattern(listOf(Opcode.PUSH_WORD, Opcode.MUL_W), listOf(Opcode.PUSH_WORD, Opcode.MUL_UW)) { segment -> + val amount=segment[0].arg!!.integerValue() + val result = optimizedIntMultiplicationsOnStack(segment[1], amount) + result ?: run { + val value = hexVal(segment[0]) + " lda #<$value | sta ${ESTACK_LO.toHex()},x | lda #>$value | sta ${ESTACK_HI.toHex()},x | dex | jsr prog8_lib.mul_word" + } + } ) diff --git a/compiler/src/prog8/optimizing/SimplifyExpressions.kt b/compiler/src/prog8/optimizing/SimplifyExpressions.kt index 417c00f0f..6c7c8b005 100644 --- a/compiler/src/prog8/optimizing/SimplifyExpressions.kt +++ b/compiler/src/prog8/optimizing/SimplifyExpressions.kt @@ -17,22 +17,6 @@ import kotlin.math.log2 X % 2 -> X and 1 (if X is byte/word) - todo often used multiplications to factors that are more efficiently calculated (via shifts) - - X*3 -> X*2+X - X*5 -> X*4+X - X*6 -> X*2+X*2+X*2 - X*7 -> X*4+X*2+X - X*9 -> X*8 + X - X*10 -> X*8 + X*2 - X*11 -> X*8 + X*2 +X - X*12 -> X*8 + X*4 - X*13 -> X*8 + X*4 +X - X*14 -> X*8 + X*4 + X*2 - X*15 -> X*8 + X*4 + X*2 + X - (and negatives) - - todo expression optimization: common (sub) expression elimination (turn common expressions into single subroutine call + introduce variable to hold it) */ @@ -396,6 +380,7 @@ class SimplifyExpressions(private val namespace: INameScope, private val heap: H // divided by a power of two => shift right optimizationsDone++ val numshifts = log2(cv) + println("DIV: SHIFT RIGHT $cv -> $numshifts") // TODO return BinaryExpression(expr.left, ">>", LiteralValue.optimalInteger(numshifts, expr.position), expr.position) } } @@ -404,6 +389,7 @@ class SimplifyExpressions(private val namespace: INameScope, private val heap: H // divided by a negative power of two => negate, then shift right optimizationsDone++ val numshifts = log2(-cv) + println("DIV: SHIFT RIGHT $cv -> $numshifts") // TODO return BinaryExpression(PrefixExpression("-", expr.left, 
expr.position), ">>", LiteralValue.optimalInteger(numshifts, expr.position), expr.position) } } @@ -467,7 +453,7 @@ class SimplifyExpressions(private val namespace: INameScope, private val heap: H if(leftValue.resultingDatatype(namespace, heap) in IntegerDatatypes) { // times a power of two => shift left optimizationsDone++ - val numshifts = log2(cv) + val numshifts = log2(cv).toInt() return BinaryExpression(expr.left, "<<", LiteralValue.optimalInteger(numshifts, expr.position), expr.position) } } @@ -475,7 +461,7 @@ class SimplifyExpressions(private val namespace: INameScope, private val heap: H if(leftValue.resultingDatatype(namespace, heap) in IntegerDatatypes) { // times a negative power of two => negate, then shift left optimizationsDone++ - val numshifts = log2(-cv) + val numshifts = log2(-cv).toInt() return BinaryExpression(PrefixExpression("-", expr.left, expr.position), "<<", LiteralValue.optimalInteger(numshifts, expr.position), expr.position) } } diff --git a/examples/test.p8 b/examples/test.p8 index e01af000c..deb720cde 100644 --- a/examples/test.p8 +++ b/examples/test.p8 @@ -4,27 +4,186 @@ sub start() { - ubyte i - byte j - uword uw - word w + ubyte i = 10 + ubyte ub2 + byte j = 5 + byte b2 + uword uw = 1000 + uword uw2 + word w = 1000 + word w2 - for i in 5 to 0 step -1 { - c64scr.print_ub(i) - c64.CHROUT('\n') - } + ub2=i*1 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*2 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*3 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*4 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*5 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*6 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*7 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*8 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*9 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*10 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*11 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*12 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*13 + 
c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*14 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*15 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*16 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*17 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*18 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*19 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*20 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*21 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*22 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*23 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*24 + c64scr.print_ub(ub2) + c64.CHROUT('\n') + ub2=i*25 + c64scr.print_ub(ub2) c64.CHROUT('\n') - for j in 5 to 0 step -1 { - c64scr.print_b(j) - c64.CHROUT('\n') - } + i=5 + ub2=i*40 + c64scr.print_ub(ub2) c64.CHROUT('\n') - for j in -5 to 0 { - c64scr.print_b(j) - c64.CHROUT('\n') - } c64.CHROUT('\n') + + + b2=j*1 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*2 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*3 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*4 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*5 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*6 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*7 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*8 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*9 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*10 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*11 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*12 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*13 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*14 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*15 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*16 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*17 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*18 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*19 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*20 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*21 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*22 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*23 + 
c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*24 + c64scr.print_b(b2) + c64.CHROUT('\n') + b2=j*25 + c64scr.print_b(b2) + c64.CHROUT('\n') + + j=3 + b2=j*40 + c64scr.print_b(b2) + c64.CHROUT('\n') + + c64.CHROUT('\n') + + + ;@todo multiplication by negative values + + ;@todo the same, for uword and word + } } diff --git a/examples/wizzine.p8 b/examples/wizzine.p8 index 383698ff5..141fc1f26 100644 --- a/examples/wizzine.p8 +++ b/examples/wizzine.p8 @@ -37,16 +37,17 @@ c64.SPRPTR[i] = $0a00/64 } c64.SPENA = 255 ; enable all sprites - c64utils.set_rasterirq(270) ; enable animation + c64utils.set_rasterirq(220) ; enable animation } } ~ irq { - ubyte angle=0 sub irq() { + ubyte angle ; no initialization value so it keeps the previous one. + c64.EXTCOL-- angle++