From fa6c2bb25d1629f3f0b04c60a0fae75187f50ce8 Mon Sep 17 00:00:00 2001 From: kris Date: Thu, 28 Mar 2019 23:08:52 +0000 Subject: [PATCH] Minor cleanups: - Use a safe ZP address instead of $00 - Use dec instead of hex for IP address bytes - Remove some unused Uthernet/W5100 defines Optimize the socket buffer management - Since we're guaranteeing 2K frame padding, the low byte is always 0 - Remove some vestiges of the Uthernet TCP demo code - AFAICT there isn't a need to compare high and low bytes of the S0RXRSR, this was just being used as a (slightly risky) check that they were both not equal (presumably to 0) - (h/t Oliver Schmidt ) it turns out that the W5100 automatically wraps the address pointer at the end of the 8k RX/TX buffer space, so since we're using 8k buffers we don't need any of the pointer/mask arithmetic to make sure we don't stray outside this range - Instead, we can just save the W5100 address pointer before we start doing the stream buffer management and restore it when we're ready to read from the stream again. - Moreover, since we know the low-byte is 0 we don't even need to save it. This gives us enough free cycles to implement a keypress check. For now any key will pause the video and any other key resume it. We still have a whole 16 cycles left over while maintaining the 36/37 cycle tick cadence. We've saved 73 cycles of "dead time" though, i.e. the op_ack + CHECKRECV + op_nop "slow path" now takes 2*73 rather than 3*73 cycles. This should result in better audio quality. --- player/main.s | 184 +++++++++++++++++++++----------------------------- 1 file changed, 78 insertions(+), 106 deletions(-) diff --git a/player/main.s b/player/main.s index 6a98fb2..0b55cc9 100644 --- a/player/main.s +++ b/player/main.s @@ -33,8 +33,8 @@ .segment "HGR" ; TODO: make these configurable -SRCADDR: .byte $C0,$A8,$01,147 ; 192.168.1.147 W5100 IP -FADDR: .byte $C0,$A8,$01,15 ; 192.168.1.15 FOREIGN IP +SRCADDR: .byte 192,168,1,147 ; 192.168.1.147 W5100 IP +FADDR: .byte 192,168,1,15 ; 192.168.1.15 FOREIGN IP FPORT: .byte $07,$b9 ; 1977 FOREIGN PORT MAC: .byte $00,$08,$DC,$01,$02,$03 ; W5100 MAC ADDRESS @@ -52,7 +52,7 @@ WDATA = $C097 ;;; ; some dummy addresses in order to pad cycle counts -zpdummy = $00 +zpdummy = $08 dummy = $ffff hgr = $f3e2 @@ -70,16 +70,10 @@ RMSR = $001A ; RECEIVE BUFFER SIZE ; SOCKET 0 LOCATIONS S0MR = $0400 ; SOCKET 0 MODE REGISTER S0CR = $0401 ; COMMAND REGISTER -S0IR = $0402 ; INTERRUPT REGISTER S0SR = $0403 ; STATUS REGISTER S0LOCALPORT = $0404 ; LOCAL PORT S0FORADDR = $040C ; FOREIGN ADDRESS S0FORPORT = $0410 ; FOREIGN PORT -S0MSS = $0412 ; MAX SEGMENT SIZE -S0PROTO = $0414 ; IP PROTOCOL -S0TOS = $0415 ; DS/ECN (FORMER TOS) -S0TTL = $0416 ; IP TIME TO LIVE -S0TXFSR = $0420 ; TX FREE SIZE REGISTER S0TXRR = $0422 ; TX READ POINTER REGISTER S0TXWR = $0424 ; TX WRITE POINTER REGISTER S0RXRSR = $0426 ; RX RECEIVED SIZE REGISTER @@ -93,25 +87,14 @@ TXMASK = RXMASK ; SOCKET 0 TX MASK ; SOCKET COMMANDS SCOPEN = $01 ; OPEN -SCLISTEN = $02 ; LISTEN SCCONNECT = $04 ; CONNECT SCDISCON = $08 ; DISCONNECT -SCCLOSE = $10 ; CLOSE SCSEND = $20 ; SEND -SCSENDMAC = $21 ; SEND MAC -SCSENDKEEP = $22 ; SEND KEEP ALIVE SCRECV = $40 ; RECV ; SOCKET STATUS -STCLOSED = $00 STINIT = $13 -STLISTEN = $14 STESTABLISHED = $17 -STCLOSEWAIT = $1C -STUDP = $22 -STIPRAW = $32 -STMAXRAW = $42 -STPPOE = $5F ; MONITOR SUBROUTINES KBD = $C000 @@ -122,9 +105,6 @@ PRNTAX = $F941 ; ZERO-PAGE STORAGE PTR = $06 ; TODO: we only use this for connection retry count -GETSIZE = $08 ; 2 BYTES FOR RX_RSR -GETOFFSET = $0A ; 2 BYTES FOR OFFSET ADDR -GETSTARTADR = $0C ; 2 BYTES FOR PHYSICAL ADDR ; this is the main binary entrypoint (it will be linked at 0x800) .segment "LOWCODE" @@ -284,6 +264,9 @@ ERRMSG: .byte $d3,$cf,$c3,$cb,$c5,$d4,$a0,$c3,$cf,$d5,$cc,$c4,$a0,$ce,$cf,$d4,$a .byte $8D,$00 SETUP: + + ; TODO: Read header from video + JMP init_mainloop .segment "CODE" @@ -321,7 +304,8 @@ init_mainloop: STA $C054 ; MAIN memory active - ; establish invariant expected by decode loop + ; establish invariants expected by decode loop + LDY #>RXBASE ; High byte of socket 0 receive buffer LDX #$00 ; This is the main audio/video decode loop. @@ -348,96 +332,76 @@ init_mainloop: ; Somewhat magically, the cycle timings all align on multiples of 73 (with tick intervals ; alternating 36 and 37 cycles, as in the "neutral" (i.e. 50% speaker duty cycle) -; op_tick_36_* opcodes), without much work needed to optimize this. I'm pretty sure there's -; still "unnecessary" work being done (e.g. low address bytes that are always 0) but there's -; need to work harder since we'd end up having to pad them back anyway. +; op_tick_36_* opcodes), without much work needed to optimize this. ; ; With a 73 cycle fundamental opcode (speaker) period and 1MHz clock speed, this gives a ; 14364 Hz "carrier" for the audio DAC, which is slightly audible (at least to my ageing ; ears) but quite acceptable. ; -; i.e. we get about 14364 player opcodes/second, with the ACK "slow path" costing 6 opcodes. -; Each of the "fat" audio/video opcodes results in storing 4 video bytes, so we store -; about 56KB of video data per second. +; i.e. we get about 14364 player opcodes/second, with the op_ack + CHECKRECV + op_nop +; "slow path" costing 2 opcodes. Each of the "fat" audio/video opcodes results in storing +4 video bytes, so we store about 56KB of video data per second. ; ; With 192x40 = 7680 visible bytes on the hires screen, this means we can do about 7.5 full ; page redraws/sec; but the effective frame rate will usually be much higher than this ; since we only prioritize the parts of the screen that are changing between frames. -; Check for any received data +; Wait until we have enough received data, and +; +; Notice that this has the only conditional opcodes in the entire decode loop ;-) +; +; Calling invariants: +; X = 0 +; Y register has the high byte of the W5100 address pointer in the RX socket code, so we +; can't trash this until we are ready to point back there. CHECKRECV: BIT tick ; 4 - LDA #RXMASK ; 2 - STA GETOFFSET+1,X ; 5 - using X=0 to get an extra cycle before next tick - LDA WDATA ; 4 LOW BYTE +; pad cycles to keep ticking on 36/37 cycle cadence +; TODO: what can we do with the luxury of 16 unused cycles?! +@2: ; 30 so far + NOP ; 2 + BIT tick ; 4 ; 36 - BIT tick ; 4 (37 cycles) - AND #RXBASE ; 2 - STA GETSTARTADR+1 ; 4 - - ; SET BUFFER ADDRESS ON W5100 - LDA GETSTARTADR+1 ; 4 HIGH BYTE FIRST - - BIT tick ; 4 (36) - STA WADRH ;4 - - LDA GETSTARTADR ; 4 - STA WADRL ; 4 - - ; ensure invariant expected by inner loop - ; it's probably already fine, but we have 2 cycles to spare anyway ;) - LDX #$00 ; 2 - - ; fall through to op_nop + NOP ; 2 + STA dummy ; 4 + STA dummy ; 4 + STA dummy ; 4 op_nop: LDY WDATA ; 4 @@ -445,7 +409,7 @@ op_nop: LDY WDATA ; 4 STY @D+1 ; 4 @D: - JMP op_nop ; 3 ; 37 with following tick + JMP op_nop ; 3 ; 23 with following tick (37 in fallthrough case) ; Build macros for "fat" opcodes that do the following: ; - tick twice, N cycles apart (N = 4 .. 66 in steps of 2) @@ -1283,9 +1247,6 @@ op_terminate: ; In order to simplify the buffer management we expect this ACK opcode to consume ; the last 4 bytes in a 2K "TCP frame". i.e. we can assume that we need to consume ; exactly 2K from the W5100 socket buffer. -; -; TODO: actually we are underrunning by 2 bytes currently, we've only consumed 2 -; bytes of 4 by this point. op_ack: BIT tick ; 4 @@ -1294,33 +1255,44 @@ op_ack: LDA WDATA ; 4 STA @D+1 ; 4 @D: - STA $C054 ; 4 low-byte is modified + STA $C0FF ; 4 low-byte is modified LDA WDATA ; 4 dummy read of last byte in TCP frame - CLC ; 2 - LDA #>S0RXRD ; 2 NEED HIGH BYTE HERE + ; Save the W5100 address pointer so we can come back here later + ; We know the low-order byte is 0 because Socket RX memory is page-aligned and so is 2K frame. + ; IMPORTANT - from now on until we restore this in RECV, we can't trash the Y register! + LDY WADRH ; 4 + + ; Read Received Read pointer + LDA #>S0RXRD ; 2 STA WADRH ; 4 LDX #