faster array copy and fix for length 256

This commit is contained in:
Irmen de Jong 2024-02-11 23:27:26 +01:00
parent a4f697bae1
commit 88458f5355
12 changed files with 124 additions and 91 deletions

View File

@ -96,26 +96,24 @@ internal class BuiltinFunctionsAsmGen(private val program: PtProgram,
asmgen.out("""
lda #<${sourceAsm}_lsb
ldy #>${sourceAsm}_lsb
sta cx16.r0L
sty cx16.r0H
sta P8ZP_SCRATCH_W1
sty P8ZP_SCRATCH_W1+1
lda #<${targetAsm}_lsb
ldy #>${targetAsm}_lsb
sta cx16.r1L
sty cx16.r1H
lda #<${numElements}
ldy #>${numElements}
jsr sys.memcopy
sta P8ZP_SCRATCH_W2
sty P8ZP_SCRATCH_W2+1
ldy #${numElements and 255}
jsr prog8_lib.memcopy_small
lda #<${sourceAsm}_msb
ldy #>${sourceAsm}_msb
sta cx16.r0L
sty cx16.r0H
sta P8ZP_SCRATCH_W1
sty P8ZP_SCRATCH_W1+1
lda #<${targetAsm}_msb
ldy #>${targetAsm}_msb
sta cx16.r1L
sty cx16.r1H
lda #<${numElements}
ldy #>${numElements}
jsr sys.memcopy""")
sta P8ZP_SCRATCH_W2
sty P8ZP_SCRATCH_W2+1
ldy #${numElements and 255}
jsr prog8_lib.memcopy_small""")
}
else if(source.type in SplitWordArrayTypes) {
// split word array to normal word array (copy lsb and msb arrays separately)
@ -158,15 +156,14 @@ internal class BuiltinFunctionsAsmGen(private val program: PtProgram,
asmgen.out("""
lda #<${sourceAsm}
ldy #>${sourceAsm}
sta cx16.r0L
sty cx16.r0H
sta P8ZP_SCRATCH_W1
sty P8ZP_SCRATCH_W1+1
lda #<${targetAsm}
ldy #>${targetAsm}
sta cx16.r1L
sty cx16.r1H
lda #<${numBytes}
ldy #>${numBytes}
jsr sys.memcopy""")
sta P8ZP_SCRATCH_W2
sty P8ZP_SCRATCH_W2+1
ldy #${numBytes and 255}
jsr prog8_lib.memcopy_small""")
}
}
@ -1408,8 +1405,7 @@ internal class BuiltinFunctionsAsmGen(private val program: PtProgram,
ldy #>$identifierName
sta P8ZP_SCRATCH_W1
sty P8ZP_SCRATCH_W1+1
lda #$numElements
""")
lda #${numElements and 255}""")
}
private fun translateArguments(call: PtBuiltinFunctionCall, scope: IPtSubroutine?) {

View File

@ -66,11 +66,11 @@ internal class BuiltinFuncGen(private val codeGen: IRCodeGen, private val exprGe
it += IRInstruction(Opcode.LOAD, IRDataType.WORD, reg1=fromReg, labelSymbol = source.name+"_lsb")
it += IRInstruction(Opcode.LOAD, IRDataType.WORD, reg1=toReg, labelSymbol = target.name+"_lsb")
it += IRInstruction(Opcode.LOAD, IRDataType.WORD, reg1=countReg, immediate = sourceLength)
it += codeGen.makeSyscall(IMSyscall.MEMCOPY, listOf(IRDataType.WORD to fromReg, IRDataType.WORD to toReg, IRDataType.WORD to countReg), returns = null)
it += codeGen.makeSyscall(IMSyscall.MEMCOPY_SMALL, listOf(IRDataType.WORD to fromReg, IRDataType.WORD to toReg, IRDataType.BYTE to (countReg and 255)), returns = null)
it += IRInstruction(Opcode.LOAD, IRDataType.WORD, reg1=fromReg, labelSymbol = source.name+"_msb")
it += IRInstruction(Opcode.LOAD, IRDataType.WORD, reg1=toReg, labelSymbol = target.name+"_msb")
it += IRInstruction(Opcode.LOAD, IRDataType.WORD, reg1=countReg, immediate = sourceLength)
it += codeGen.makeSyscall(IMSyscall.MEMCOPY, listOf(IRDataType.WORD to fromReg, IRDataType.WORD to toReg, IRDataType.WORD to countReg), returns = null)
it += codeGen.makeSyscall(IMSyscall.MEMCOPY_SMALL, listOf(IRDataType.WORD to fromReg, IRDataType.WORD to toReg, IRDataType.BYTE to (countReg and 255)), returns = null)
}
}
else if(source.type in SplitWordArrayTypes) {
@ -105,7 +105,7 @@ internal class BuiltinFuncGen(private val codeGen: IRCodeGen, private val exprGe
it += IRInstruction(Opcode.LOAD, IRDataType.WORD, reg1=toReg, labelSymbol = target.name)
it += IRInstruction(Opcode.LOAD, IRDataType.WORD, reg1=countReg, immediate = sourceLength * eltsize)
}
result += codeGen.makeSyscall(IMSyscall.MEMCOPY, listOf(IRDataType.WORD to fromReg, IRDataType.WORD to toReg, IRDataType.WORD to countReg), returns = null)
result += codeGen.makeSyscall(IMSyscall.MEMCOPY_SMALL, listOf(IRDataType.WORD to fromReg, IRDataType.WORD to toReg, IRDataType.BYTE to (countReg and 255)), returns = null)
}
return ExpressionCodeResult(result, IRDataType.BYTE, -1, -1)
@ -244,7 +244,7 @@ internal class BuiltinFuncGen(private val codeGen: IRCodeGen, private val exprGe
addInstr(result, IRInstruction(Opcode.PREPARECALL, immediate = 2), null)
val tr = exprGen.translateExpression(arrayName)
addToResult(result, tr, tr.resultReg, -1)
addInstr(result, IRInstruction(Opcode.LOAD, IRDataType.BYTE, reg1 = lengthReg, immediate = arrayLength), null)
addInstr(result, IRInstruction(Opcode.LOAD, IRDataType.BYTE, reg1 = lengthReg, immediate = arrayLength!! and 255), null)
result += codeGen.makeSyscall(syscall, listOf(IRDataType.WORD to tr.resultReg, IRDataType.BYTE to lengthReg), IRDataType.BYTE to tr.resultReg)
return ExpressionCodeResult(result, IRDataType.BYTE, tr.resultReg, -1)
}
@ -272,7 +272,7 @@ internal class BuiltinFuncGen(private val codeGen: IRCodeGen, private val exprGe
val tr = exprGen.translateExpression(arrayName)
addToResult(result, tr, tr.resultReg, -1)
val lengthReg = codeGen.registers.nextFree()
addInstr(result, IRInstruction(Opcode.LOAD, IRDataType.BYTE, reg1 = lengthReg, immediate = arrayLength), null)
addInstr(result, IRInstruction(Opcode.LOAD, IRDataType.BYTE, reg1 = lengthReg, immediate = arrayLength!! and 255), null)
result += codeGen.makeSyscall(syscall, listOf(IRDataType.WORD to tr.resultReg, IRDataType.BYTE to lengthReg), IRDataType.BYTE to tr.resultReg)
return ExpressionCodeResult(result, IRDataType.BYTE, tr.resultReg, -1)
}

View File

@ -99,20 +99,14 @@ sys {
rts ; nothing to copy
_copyshort
; decrease source and target pointers so we can simply index by Y
lda P8ZP_SCRATCH_W1
bne +
dec P8ZP_SCRATCH_W1+1
+ dec P8ZP_SCRATCH_W1
lda P8ZP_SCRATCH_W2
bne +
dec P8ZP_SCRATCH_W2+1
+ dec P8ZP_SCRATCH_W2
dey
beq +
- lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
dey
bne -
+ lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
rts
_longcopy

View File

@ -641,20 +641,14 @@ _loop lda P8ZP_SCRATCH_W1
rts ; nothing to copy
_copyshort
; decrease source and target pointers so we can simply index by Y
lda P8ZP_SCRATCH_W1
bne +
dec P8ZP_SCRATCH_W1+1
+ dec P8ZP_SCRATCH_W1
lda P8ZP_SCRATCH_W2
bne +
dec P8ZP_SCRATCH_W2+1
+ dec P8ZP_SCRATCH_W2
dey
beq +
- lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
dey
bne -
+ lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
rts
_longcopy

View File

@ -639,20 +639,14 @@ _loop lda P8ZP_SCRATCH_W1
rts ; nothing to copy
_copyshort
; decrease source and target pointers so we can simply index by Y
lda P8ZP_SCRATCH_W1
bne +
dec P8ZP_SCRATCH_W1+1
+ dec P8ZP_SCRATCH_W1
lda P8ZP_SCRATCH_W2
bne +
dec P8ZP_SCRATCH_W2+1
+ dec P8ZP_SCRATCH_W2
dey
beq +
- lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
dey
bne -
+ lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
rts
_longcopy

View File

@ -1492,20 +1492,14 @@ _loop lda P8ZP_SCRATCH_W1
rts ; nothing to copy
_copyshort
; decrease source and target pointers so we can simply index by Y
lda cx16.r0
bne +
dec cx16.r0+1
+ dec cx16.r0
lda cx16.r1
bne +
dec cx16.r1+1
+ dec cx16.r1
dey
beq +
- lda (cx16.r0),y
sta (cx16.r1),y
dey
bne -
+ lda (cx16.r0),y
sta (cx16.r1),y
rts
_longcopy

View File

@ -194,20 +194,14 @@ _loop lda P8ZP_SCRATCH_W1
rts ; nothing to copy
_copyshort
; decrease source and target pointers so we can simply index by Y
lda P8ZP_SCRATCH_W1
bne +
dec P8ZP_SCRATCH_W1+1
+ dec P8ZP_SCRATCH_W1
lda P8ZP_SCRATCH_W2
bne +
dec P8ZP_SCRATCH_W2+1
+ dec P8ZP_SCRATCH_W2
dey
beq +
- lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
dey
bne -
+ lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
rts
_longcopy

View File

@ -405,3 +405,27 @@ _modsrcmsb lda $ffff ; modified msb read
bne _modsrclsb
rts
.pend
memcopy_small .proc
; Copy up to a single page (256 bytes) of memory.
; Note: only works for NON-OVERLAPPING memory regions!
; Inputs:
;   P8ZP_SCRATCH_W1 = from address
;   P8ZP_SCRATCH_W2 = destination address
;   Y = number of bytes to copy (where 0 means 256)
; Bytes are copied from the highest offset down to offset 0, indexing both
; pointers by Y. The two pointers themselves are not modified.
; On return Y is 0 and A holds the last byte that was copied.
cpy #0
; a count of 0 means a full page of 256 bytes, handled separately below
beq _fullpage
; turn the count (1..255) into the offset of the last byte (0..254)
dey
; count was 1: only the byte at offset 0 remains to be copied
beq _lastbyte
; copy offsets count-1 down to 1; the loop ends as soon as Y reaches 0
_loop lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
dey
bne _loop
; Y is 0 here: copy the byte at offset 0, which the loop above never touches
_lastbyte lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
rts
; full page: entered with Y = 0, so offset 0 is copied first; dey then wraps
; Y around to 255 and the loop continues down to offset 1, stopping at Y = 0
_fullpage lda (P8ZP_SCRATCH_W1),y
sta (P8ZP_SCRATCH_W2),y
dey
bne _fullpage
rts
.pend

View File

@ -5,14 +5,37 @@
main {
sub start() {
ubyte[5] cave_times
ubyte[5] diamonds_needed
ubyte[256] @shared arr1 = 99
ubyte[256] @shared arr2 = 0
uword[128] @shared warr1 = 9999
uword[128] @shared warr2 = 0
cave_times = [1,2,3,4,5]
diamonds_needed = [1,2,3,4,5]
txt.print_ub(all(arr2))
txt.nl()
txt.print_ub(all(warr2))
txt.nl()
arr2 = arr1
warr2 = warr1
txt.print_ub(all(arr2))
txt.nl()
txt.print_ub(all(warr2))
txt.nl()
uword[] @split cave_times = [1111,2222,3333,4444]
cave_times = [9999,8888,7777,6666]
for cx16.r0L in 0 to len(cave_times)-1 {
txt.print_ub(cave_times[cx16.r0L])
txt.print_uw(cave_times[cx16.r0L])
txt.spc()
}
txt.nl()
ubyte[] cave_times2 = [11,22,33,44]
cave_times2 = [99,88,77,66]
for cx16.r0L in 0 to len(cave_times2)-1 {
txt.print_ub(cave_times2[cx16.r0L])
txt.spc()
}
txt.nl()

View File

@ -30,6 +30,7 @@ enum class IMSyscall(val number: Int) {
CLAMP_FLOAT(0x1016),
CALLFAR(0x1017),
MEMCOPY(0x1018),
ARRAYCOPY_SPLITW_TO_NORMAL(0x1019),
ARRAYCOPY_NORMAL_TO_SPLITW(0x101a),
MEMCOPY_SMALL(0x1019),
ARRAYCOPY_SPLITW_TO_NORMAL(0x101a),
ARRAYCOPY_NORMAL_TO_SPLITW(0x101b),
}

View File

@ -62,6 +62,7 @@ SYSCALLS:
52 = stringcopy
53 = ARRAYCOPY_SPLITW_TO_NORMAL
54 = ARRAYCOPY_NORMAL_TO_SPLITW
55 = memcopy_small
*/
enum class Syscall {
@ -120,6 +121,7 @@ enum class Syscall {
STRINGCOPY,
ARRAYCOPY_SPLITW_TO_NORMAL,
ARRAYCOPY_NORMAL_TO_SPLITW,
MEMCOPY_SMALL
;
companion object {
@ -306,7 +308,8 @@ object SysCalls {
val (addressV, lengthV) = getArgValues(callspec.arguments, vm)
val address = (addressV as UShort).toInt()
val length = (lengthV as UByte).toInt()
val addresses = IntProgression.fromClosedRange(address, address+length-1, 1)
val endAddressExcl = address + if(length==0) 256 else length
val addresses = IntProgression.fromClosedRange(address, endAddressExcl-1, 1)
if(addresses.any { vm.memory.getUB(it).toInt()!=0 })
returnValue(callspec.returns!!, 1, vm)
else
@ -316,7 +319,8 @@ object SysCalls {
val (addressV, lengthV) = getArgValues(callspec.arguments, vm)
val address = (addressV as UShort).toInt()
val length = (lengthV as UByte).toInt()
val addresses = IntProgression.fromClosedRange(address, address+length*2-2, 2)
val endAddressExcl = address + if(length==0) 256*2 else length*2
val addresses = IntProgression.fromClosedRange(address, endAddressExcl-2, 2)
if(addresses.any { vm.memory.getUW(it).toInt()!=0 })
returnValue(callspec.returns!!, 1, vm)
else
@ -326,7 +330,8 @@ object SysCalls {
val (addressV, lengthV) = getArgValues(callspec.arguments, vm)
val address = (addressV as UShort).toInt()
val length = (lengthV as UByte).toInt()
val addresses = IntProgression.fromClosedRange(address, address+length*4-2, 4)
val endAddressExcl = address + (if(length==0) 256*vm.machinedef.FLOAT_MEM_SIZE else length*vm.machinedef.FLOAT_MEM_SIZE)
val addresses = IntProgression.fromClosedRange(address, endAddressExcl-vm.machinedef.FLOAT_MEM_SIZE, 4)
if(addresses.any { vm.memory.getFloat(it).toInt()!=0 })
returnValue(callspec.returns!!, 1, vm)
else
@ -336,7 +341,8 @@ object SysCalls {
val (addressV, lengthV) = getArgValues(callspec.arguments, vm)
val address = (addressV as UShort).toInt()
val length = (lengthV as UByte).toInt()
val addresses = IntProgression.fromClosedRange(address, address+length-1, 1)
val endAddressExcl = address + if(length==0) 256 else length
val addresses = IntProgression.fromClosedRange(address, endAddressExcl-1, 1)
if(addresses.all { vm.memory.getUB(it).toInt()!=0 })
returnValue(callspec.returns!!, 1, vm)
else
@ -346,7 +352,8 @@ object SysCalls {
val (addressV, lengthV) = getArgValues(callspec.arguments, vm)
val address = (addressV as UShort).toInt()
val length = (lengthV as UByte).toInt()
val addresses = IntProgression.fromClosedRange(address, address+length*2-2, 2)
val endAddressExcl = address + if(length==0) 256*2 else length*2
val addresses = IntProgression.fromClosedRange(address, endAddressExcl-2, 2)
if(addresses.all { vm.memory.getUW(it).toInt()!=0 })
returnValue(callspec.returns!!, 1, vm)
else
@ -356,7 +363,8 @@ object SysCalls {
val (addressV, lengthV) = getArgValues(callspec.arguments, vm)
val address = (addressV as UShort).toInt()
val length = (lengthV as UByte).toInt()
val addresses = IntProgression.fromClosedRange(address, address+length*4-2, 4)
val endAddressExcl = address + (if(length==0) 256*vm.machinedef.FLOAT_MEM_SIZE else length*vm.machinedef.FLOAT_MEM_SIZE)
val addresses = IntProgression.fromClosedRange(address, endAddressExcl-vm.machinedef.FLOAT_MEM_SIZE, 4)
if(addresses.all { vm.memory.getFloat(it).toInt()!=0 })
returnValue(callspec.returns!!, 1, vm)
else
@ -553,6 +561,16 @@ object SysCalls {
vm.memory.setUB(to+offset, vm.memory.getUB(from+offset))
}
}
Syscall.MEMCOPY_SMALL -> {
val (fromA, toA, countA) = getArgValues(callspec.arguments, vm)
val from = (fromA as UShort).toInt()
val to = (toA as UShort).toInt()
val countV = (countA as UByte).toInt()
val count = if(countV==0) 256 else countV
for(offset in 0..<count) {
vm.memory.setUB(to+offset, vm.memory.getUB(from+offset))
}
}
Syscall.MEMSET -> {
val (memA, numbytesA, valueA) = getArgValues(callspec.arguments, vm)
val mem = (memA as UShort).toInt()

View File

@ -127,6 +127,7 @@ class VmProgramLoader {
IMSyscall.CLAMP_FLOAT.number -> Syscall.CLAMP_FLOAT
IMSyscall.CALLFAR.number -> throw IRParseException("vm doesn't support the callfar() syscall")
IMSyscall.MEMCOPY.number -> Syscall.MEMCOPY
IMSyscall.MEMCOPY_SMALL.number -> Syscall.MEMCOPY_SMALL
IMSyscall.ARRAYCOPY_SPLITW_TO_NORMAL.number -> Syscall.ARRAYCOPY_SPLITW_TO_NORMAL
IMSyscall.ARRAYCOPY_NORMAL_TO_SPLITW.number -> Syscall.ARRAYCOPY_NORMAL_TO_SPLITW
else -> null