From 431f2a2088d5262b4b0ed6ee89182a34294d4ed9 Mon Sep 17 00:00:00 2001
From: Irmen de Jong
Date: Sun, 1 Nov 2020 07:36:40 +0100
Subject: [PATCH] optimized memset and memcopy on CX16, memcopy can deal with any size now

---
Review notes (free text between the three-dash line and the diffstat):

* prog8_lib.asm: the existing func_memcopy is renamed to func_memcopy255.
  A new func_memcopy reads a 16-bit byte count from the evaluation stack
  and pushes its low and high byte on the CPU stack. The high byte is
  pulled first and drives a loop that copies one page per pass (Y runs
  0..255, then the high bytes of both pointers are incremented); the low
  byte is pulled afterwards and drives the loop for the remaining bytes.
  Either loop is skipped when its counter is zero.
  func_memsetw now keeps X on the CPU stack (txa/pha ... pla/tax) around
  the call to memsetw instead of storing it in P8ZP_SCRATCH_REG.
* BuiltinFunctionsAsmGen.kt: memcopy, memset and memsetw are routed to the
  new funcMemSetCopy. When the target is Cx16Target, memset loads cx16.r0,
  cx16.r1 and A and calls cx16.memory_fill; memcopy loads cx16.r0, cx16.r1
  and cx16.r2 and calls cx16.memory_copy; memsetw still calls
  prog8_lib.func_memsetw. On every target, memcopy calls
  prog8_lib.func_memcopy255 when the count is a constant <= 255 or has
  type UBYTE or BYTE. Everything else falls through to
  prog8_lib.func_<functionName>.
* BuiltinFunctions.kt: the numbytes parameter of memcopy accepts UBYTE or
  UWORD.
* NOTE(review): this copy of the patch had its line breaks collapsed. The
  line breaks, the leading indentation and the positions of blank context
  lines below were re-derived from the hunk line counts; the tokens on
  every line are unchanged. Confirm against the upstream commit, or apply
  with whitespace-tolerant options.

 compiler/res/prog8lib/prog8_lib.asm           | 54 ++++++++++++-
 .../c64/codegen/BuiltinFunctionsAsmGen.kt     | 76 ++++++++++++++++++-
 .../src/prog8/functions/BuiltinFunctions.kt   |  2 +-
 docs/source/programming.rst                   |  4 +-
 docs/source/todo.rst                          |  2 -
 examples/test.p8                              | 30 ++------
 6 files changed, 132 insertions(+), 36 deletions(-)

diff --git a/compiler/res/prog8lib/prog8_lib.asm b/compiler/res/prog8lib/prog8_lib.asm
index 603974c6b..e2efd4b7a 100644
--- a/compiler/res/prog8lib/prog8_lib.asm
+++ b/compiler/res/prog8lib/prog8_lib.asm
@@ -1388,8 +1388,8 @@ func_rndw .proc
         .pend


-func_memcopy .proc
-        ; note: clobbers A,Y
+func_memcopy255 .proc
+        ; fast memcopy of up to 255 bytes, note: clobbers A,Y
         inx
         stx P8ZP_SCRATCH_REG
         lda P8ESTACK_LO+2,x
@@ -1414,6 +1414,50 @@ func_memcopy .proc
         rts
         .pend

+func_memcopy .proc
+        ; memcopy of any number of bytes, note: clobbers A,Y
+        inx
+        stx P8ZP_SCRATCH_REG
+        lda P8ESTACK_LO+2,x
+        sta P8ZP_SCRATCH_W1
+        lda P8ESTACK_HI+2,x
+        sta P8ZP_SCRATCH_W1+1
+        lda P8ESTACK_LO+1,x
+        sta P8ZP_SCRATCH_W2
+        lda P8ESTACK_HI+1,x
+        sta P8ZP_SCRATCH_W2+1
+        lda P8ESTACK_LO,x
+        pha
+        lda P8ESTACK_HI,x
+        pha
+
+        ldy #0
+        pla
+        tax
+        beq _remain
+-       lda (P8ZP_SCRATCH_W1),y ; move a page at a time
+        sta (P8ZP_SCRATCH_W2),y
+        iny
+        bne -
+        inc P8ZP_SCRATCH_W1+1
+        inc P8ZP_SCRATCH_W2+1
+        dex
+        bne -
+_remain pla
+        tax
+        beq _done
+-       lda (P8ZP_SCRATCH_W1),y ; move the remaining bytes
+        sta (P8ZP_SCRATCH_W2),y
+        iny
+        dex
+        bne -
+
+_done   ldx P8ZP_SCRATCH_REG
+        inx
+        inx
+        rts
+        .pend
+
 func_memset .proc
         ; note: clobbers A,Y
         inx
@@ -1439,7 +1483,6 @@ func_memsetw .proc
         ; -- fill memory from (SCRATCH_ZPWORD1) number of words in SCRATCH_ZPWORD2, with word value in AY.

         inx
-        stx P8ZP_SCRATCH_REG
         lda P8ESTACK_LO+2,x
         sta P8ZP_SCRATCH_W1
         lda P8ESTACK_HI+2,x
@@ -1448,10 +1491,13 @@ func_memsetw .proc
         sta P8ZP_SCRATCH_W2
         lda P8ESTACK_HI+1,x
         sta P8ZP_SCRATCH_W2+1
+        txa
+        pha
         lda P8ESTACK_LO,x
         ldy P8ESTACK_HI,x
         jsr memsetw
-        ldx P8ZP_SCRATCH_REG
+        pla
+        tax
         inx
         inx
         rts
diff --git a/compiler/src/prog8/compiler/target/c64/codegen/BuiltinFunctionsAsmGen.kt b/compiler/src/prog8/compiler/target/c64/codegen/BuiltinFunctionsAsmGen.kt
index 98cfb7417..416ad40a8 100644
--- a/compiler/src/prog8/compiler/target/c64/codegen/BuiltinFunctionsAsmGen.kt
+++ b/compiler/src/prog8/compiler/target/c64/codegen/BuiltinFunctionsAsmGen.kt
@@ -7,6 +7,8 @@ import prog8.ast.expressions.*
 import prog8.ast.statements.DirectMemoryWrite
 import prog8.ast.statements.FunctionCallStatement
 import prog8.compiler.AssemblyError
+import prog8.compiler.target.CompilationTarget
+import prog8.compiler.target.Cx16Target
 import prog8.compiler.target.c64.codegen.assignment.AsmAssignSource
 import prog8.compiler.target.c64.codegen.assignment.AsmAssignTarget
 import prog8.compiler.target.c64.codegen.assignment.AsmAssignment
@@ -78,8 +80,8 @@ internal class BuiltinFunctionsAsmGen(private val program: Program, private val
             "set_irqd" -> asmgen.out(" sei")
             "strlen" -> funcStrlen(fcall, resultToStack)
             "strcmp" -> funcStrcmp(fcall, func, resultToStack)
-            "substr", "leftstr", "rightstr",
-            "memcopy", "memset", "memsetw" -> {
+            "memcopy", "memset", "memsetw" -> funcMemSetCopy(fcall, func, functionName)
+            "substr", "leftstr", "rightstr" -> {
                 translateArguments(fcall.args, func)
                 asmgen.out(" jsr prog8_lib.func_$functionName")
             }
@@ -88,6 +90,76 @@ internal class BuiltinFunctionsAsmGen(private val program: Program, private val
         }
     }

+    private fun funcMemSetCopy(fcall: IFunctionCall, func: FSignature, functionName: String) {
+        if(CompilationTarget.instance is Cx16Target) {
+            when(functionName) {
+                "memset" -> {
+                    // use the ROM function of the Cx16
+                    var src = AsmAssignSource.fromAstSource(fcall.args[0], program, asmgen)
+                    var tgt = AsmAssignTarget(TargetStorageKind.VARIABLE, program, asmgen, DataType.UWORD, null, variableAsmName = "cx16.r0")
+                    var assign = AsmAssignment(src, tgt, false, Position.DUMMY)
+                    asmgen.translateNormalAssignment(assign)
+                    src = AsmAssignSource.fromAstSource(fcall.args[1], program, asmgen)
+                    tgt = AsmAssignTarget(TargetStorageKind.VARIABLE, program, asmgen, DataType.UWORD, null, variableAsmName = "cx16.r1")
+                    assign = AsmAssignment(src, tgt, false, Position.DUMMY)
+                    asmgen.translateNormalAssignment(assign)
+                    src = AsmAssignSource.fromAstSource(fcall.args[2], program, asmgen)
+                    tgt = AsmAssignTarget(TargetStorageKind.REGISTER, program, asmgen, DataType.UBYTE, null, register = RegisterOrPair.A)
+                    assign = AsmAssignment(src, tgt, false, Position.DUMMY)
+                    asmgen.translateNormalAssignment(assign)
+                    val sub = (fcall as FunctionCallStatement).definingSubroutine()!!
+                    asmgen.saveRegister(CpuRegister.X, false, sub)
+                    asmgen.out(" jsr cx16.memory_fill")
+                    asmgen.restoreRegister(CpuRegister.X, false)
+                }
+                "memcopy" -> {
+                    val count = fcall.args[2].constValue(program)?.number?.toInt()
+                    val countDt = fcall.args[2].inferType(program)
+                    if((count!=null && count <= 255) || countDt.istype(DataType.UBYTE) || countDt.istype(DataType.BYTE)) {
+                        // fast memcopy of up to 255
+                        translateArguments(fcall.args, func)
+                        asmgen.out(" jsr prog8_lib.func_memcopy255")
+                        return
+                    }
+
+                    // use the ROM function of the Cx16
+                    var src = AsmAssignSource.fromAstSource(fcall.args[0], program, asmgen)
+                    var tgt = AsmAssignTarget(TargetStorageKind.VARIABLE, program, asmgen, DataType.UWORD, null, variableAsmName = "cx16.r0")
+                    var assign = AsmAssignment(src, tgt, false, Position.DUMMY)
+                    asmgen.translateNormalAssignment(assign)
+                    src = AsmAssignSource.fromAstSource(fcall.args[1], program, asmgen)
+                    tgt = AsmAssignTarget(TargetStorageKind.VARIABLE, program, asmgen, DataType.UWORD, null, variableAsmName = "cx16.r1")
+                    assign = AsmAssignment(src, tgt, false, Position.DUMMY)
+                    asmgen.translateNormalAssignment(assign)
+                    src = AsmAssignSource.fromAstSource(fcall.args[2], program, asmgen)
+                    tgt = AsmAssignTarget(TargetStorageKind.VARIABLE, program, asmgen, DataType.UWORD, null, variableAsmName = "cx16.r2")
+                    assign = AsmAssignment(src, tgt, false, Position.DUMMY)
+                    asmgen.translateNormalAssignment(assign)
+                    val sub = (fcall as FunctionCallStatement).definingSubroutine()!!
+                    asmgen.saveRegister(CpuRegister.X, false, sub)
+                    asmgen.out(" jsr cx16.memory_copy")
+                    asmgen.restoreRegister(CpuRegister.X, false)
+                }
+                "memsetw" -> {
+                    translateArguments(fcall.args, func)
+                    asmgen.out(" jsr prog8_lib.func_memsetw")
+                }
+            }
+        } else {
+            if(functionName=="memcopy") {
+                val count = fcall.args[2].constValue(program)?.number?.toInt()
+                val countDt = fcall.args[2].inferType(program)
+                if((count!=null && count <= 255) || countDt.istype(DataType.UBYTE) || countDt.istype(DataType.BYTE)) {
+                    translateArguments(fcall.args, func)
+                    asmgen.out(" jsr prog8_lib.func_memcopy255")
+                    return
+                }
+            }
+            translateArguments(fcall.args, func)
+            asmgen.out(" jsr prog8_lib.func_$functionName")
+        }
+    }
+
     private fun funcStrcmp(fcall: IFunctionCall, func: FSignature, resultToStack: Boolean) {
         translateArguments(fcall.args, func)
         if(resultToStack)
diff --git a/compiler/src/prog8/functions/BuiltinFunctions.kt b/compiler/src/prog8/functions/BuiltinFunctions.kt
index 8a85c03b3..1d3735fcf 100644
--- a/compiler/src/prog8/functions/BuiltinFunctions.kt
+++ b/compiler/src/prog8/functions/BuiltinFunctions.kt
@@ -79,7 +79,7 @@ val BuiltinFunctions = mapOf(
     "memcopy" to FSignature(false, listOf(
             FParam("from", IterableDatatypes + DataType.UWORD),
             FParam("to", IterableDatatypes + DataType.UWORD),
-            FParam("numbytes", setOf(DataType.UBYTE))), null),
+            FParam("numbytes", setOf(DataType.UBYTE, DataType.UWORD))), null),
     "memset" to FSignature(false, listOf(
             FParam("address", IterableDatatypes + DataType.UWORD),
             FParam("numbytes", setOf(DataType.UWORD)),
diff --git a/docs/source/programming.rst b/docs/source/programming.rst
index 21b376f26..b9e4c7a04 100644
--- a/docs/source/programming.rst
+++ b/docs/source/programming.rst
@@ -765,12 +765,12 @@ sort(array)
 Strings and memory blocks
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 memcopy(from, to, numbytes)
-    Efficiently copy a number of bytes (1 - 256) from a memory location to another.
+    Efficiently copy a number of bytes from a memory location to another.
     NOTE: 'to' must NOT overlap with 'from', unless it is *before* 'from'.
     Because this function imposes some overhead to handle the parameters,
     it is only faster if the number of bytes is larger than a certain threshold.
     Compare the generated code to see if it was beneficial or not.
-    The most efficient will always be to write a specialized copy routine in assembly yourself!
+    The most efficient will often be to write a specialized copy routine in assembly yourself!

 memset(address, numbytes, bytevalue)
     Efficiently set a part of memory to the given (u)byte value.
diff --git a/docs/source/todo.rst b/docs/source/todo.rst
index d6bd81ab2..1e1671ed8 100644
--- a/docs/source/todo.rst
+++ b/docs/source/todo.rst
@@ -2,8 +2,6 @@
 TODO
 ====

-- make memset(w) and memcopy able to work with >256 bytes
-- after that: make memset and memcopy use the ROM routines on the CX16
 - calling convention for builtin functions no longer via stack but via statically allocated vars inside the subroutine proc (just as normal subroutines)
 - make it possible to use cpu opcodes such as 'nop' as variable names by prefixing all asm vars with something such as '_'
 - option to load the built-in library files from a directory instead of the embedded ones (for easier library development/debugging)
diff --git a/examples/test.p8 b/examples/test.p8
index 7acce0678..e99b05bf2 100644
--- a/examples/test.p8
+++ b/examples/test.p8
@@ -9,36 +9,16 @@
 main {

     sub start() {
-        float fl
+        const uword ADDR = $0400

-        fl = getfloat()
-        floats.print_f(fl)
-        txt.chrout('\n')
+        memset(ADDR, 40*25, 100)
+        memsetw(ADDR, 20*10, $3031)
+        memcopy(ADDR, ADDR+40*12, 20*10*2)
+        ;memcopy(ADDR, ADDR+40*12, 255)

         testX()
     }

-    sub chrin() -> ubyte {
-        return 99
-    }
-
-    sub getstr() -> str {
-        @($d020)++
-        return "foobar"
-    }
-
-    sub getfloat() -> float {
-        float xx
-        xx = 123.456789
-        return xx
-    }
-
-    sub mcp(uword from, uword dest, ubyte length) {
-        txt.print_uw(from)
-        txt.print_uw(dest)
-        txt.print_ub(length)
-    }
-
     asmsub testX() {
         %asm {{
             stx _saveX