Video() is now aware of target frame rate, and will continue to emit
opcodes until the cycle budget for the frame is exhausted.

Output stream is also now aware of TCP framing, and schedules an ACK
opcode every 2048 output bytes to instruct the client to perform
TCP ACK and buffer management.

Fixes several serious bugs in RLE encoding, including:

- we were emitting the RLE opcode with the next content byte after the
  run completed!
- we were looking at the wrong field for the start offset!
- handle the case where the entire page is a single run
- stop trying to allow error to accumulate when run-length encoding --
  this does not respect the Apple II colour encoding, i.e. it may
  introduce colour fringing.
- also because of this we're unlikely to actually be able to find
  many runs, because odd and even columns are encoded differently.  In
  a follow-up we should start encoding odd and even columns separately.

Optimize after profiling -- encoder is now about 2x faster

Add tests.
This commit is contained in:
kris 2019-02-23 23:52:25 +00:00
parent cc6c92335d
commit 1b54c9c864
2 changed files with 303 additions and 106 deletions

231
video.py
View File

@ -1,8 +1,5 @@
import functools import functools
from collections import defaultdict from typing import Iterator, Tuple, Iterable
from typing import Iterator, Set, Tuple
import numpy as np
import opcodes import opcodes
import scheduler import scheduler
@ -10,8 +7,7 @@ import memory_map
import screen import screen
@functools.lru_cache(None) def hamming_weight(n):
def hamming_weight(n: int) -> int:
"""Compute hamming weight of 8-bit int""" """Compute hamming weight of 8-bit int"""
n = (n & 0x55) + ((n & 0xAA) >> 1) n = (n & 0x55) + ((n & 0xAA) >> 1)
n = (n & 0x33) + ((n & 0xCC) >> 2) n = (n & 0x33) + ((n & 0xCC) >> 2)
@ -22,7 +18,9 @@ def hamming_weight(n: int) -> int:
class Video: class Video:
"""Apple II screen memory map encoding a bitmapped frame.""" """Apple II screen memory map encoding a bitmapped frame."""
def __init__(self, screen_page: int = 0, CLOCK_SPEED = 1024 * 1024
def __init__(self, frame_rate: int = 15, screen_page: int = 0,
opcode_scheduler: scheduler.OpcodeScheduler = None): opcode_scheduler: scheduler.OpcodeScheduler = None):
self.screen_page = screen_page self.screen_page = screen_page
@ -31,18 +29,29 @@ class Video:
self.memory_map = memory_map.MemoryMap(screen_page, self.screen) self.memory_map = memory_map.MemoryMap(screen_page, self.screen)
self.cycle_counter = opcodes.CycleCounter()
self.state = opcodes.State(self.cycle_counter, self.memory_map)
self.scheduler = ( self.scheduler = (
opcode_scheduler or scheduler.HeuristicPageFirstScheduler()) opcode_scheduler or scheduler.HeuristicPageFirstScheduler())
def update(self, frame: screen.Bitmap, self.cycle_counter = opcodes.CycleCounter()
cycle_budget: int, fullness: float) -> Iterator[int]:
self.state = opcodes.State(self.cycle_counter, self.memory_map)
self.frame_rate = frame_rate
self.stream_pos = 0
if self.frame_rate:
self.cycles_per_frame = self.CLOCK_SPEED // self.frame_rate
else:
self.cycles_per_frame = None
self._last_op = opcodes.Nop()
def encode_frame(self, frame: screen.Bitmap) -> Iterator[opcodes.Opcode]:
"""Update to match content of frame within provided budget. """Update to match content of frame within provided budget.
Emits encoded byte stream for rendering the image. Emits encoded byte stream for rendering the image.
XXX update
The byte stream consists of offsets against a selected page (e.g. $20xx) The byte stream consists of offsets against a selected page (e.g. $20xx)
at which to write a selected content byte. Those selections are at which to write a selected content byte. Those selections are
controlled by special opcodes emitted to the stream controlled by special opcodes emitted to the stream
@ -53,128 +62,148 @@ class Video:
TICK - tick the speaker TICK - tick the speaker
DONE - terminate the video decoding DONE - terminate the video decoding
In order to "make room" for these opcodes we make use of the fact that
each page has 2 sets of 8-byte "screen holes", at page offsets
0x78-0x7f and 0xf8-0xff. Currently we only use the latter range as
this allows for efficient matching in the critical path of the decoder.
We group by offsets from page boundary (cf some other more We group by offsets from page boundary (cf some other more
optimal starting point) because STA (..),y has 1 extra cycle if optimal starting point) because STA (..),y has 1 extra cycle if
crossing the page boundary. Though maybe this would be worthwhile if crossing the page boundary. Though maybe this would be worthwhile if
it optimizes the bytestream. it optimizes the bytestream.
""" """
self.cycle_counter.reset()
# Target screen memory map for new frame # Target screen memory map for new frame
target = frame.pack() target = frame.pack()
# Estimate number of opcodes that will end up fitting in the cycle
# budget.
byte_cycles = opcodes.Store(0).cycles
est_opcodes = int(cycle_budget / fullness / byte_cycles)
# Sort by highest xor weight and take the estimated number of change # Sort by highest xor weight and take the estimated number of change
# operations # operations
# TODO: changes should be a class # TODO: changes should be a class
changes = list( changes = sorted(list(self._index_changes(self.screen, target)),
sorted(self.index_changes(self.screen, target), reverse=True) reverse=True)
)[:est_opcodes]
for op in self.scheduler.schedule(changes): yield from self.scheduler.schedule(changes)
yield from self.state.emit(op)
def index_changes(self, source: screen.Bytemap, @functools.lru_cache()
target: screen.Bytemap) -> Set[ def _rle_cycles(self, run_length):
Tuple[int, int, int, int, int]]: return opcodes.RLE(0, run_length).cycles
"""Transform encoded screen to sequence of change tuples.
Change tuple is (xor_weight, page, offset, content)
"""
changes = set()
# TODO: don't use 256 bytes if XMAX is smaller, or we may compute RLE
# over the full page!
memmap = defaultdict(lambda: [(0, 0, 0)] * 256)
it = np.nditer(target.bytemap, flags=['multi_index'])
while not it.finished:
y, x_byte = it.multi_index
page, offset = self.memory_map.to_page_offset(x_byte, y)
src_content = source.bytemap[y][x_byte]
target_content = np.asscalar(it[0])
bits_different = hamming_weight(src_content ^ target_content)
memmap[page][offset] = (bits_different, src_content, target_content)
it.iternext()
def _index_page(self, bits_different, target_content):
byte_cycles = opcodes.Store(0).cycles byte_cycles = opcodes.Store(0).cycles
for page, offsets in memmap.items():
cur_content = None cur_content = None
run_length = 0 run_length = 0
maybe_run = [] run = []
for offset, data in enumerate(offsets):
bits_different, src_content, target_content = data
# TODO: allowing odd bit errors introduces colour error
if maybe_run and hamming_weight(
cur_content ^ target_content) > 2:
# End of run
# Decide if it's worth emitting as a run vs single stores
# Number of changes in run for which >0 bits differ # Number of changes in run for which >0 bits differ
num_changes = len([c for c in maybe_run if c[0]]) num_changes_in_run = 0
run_cost = opcodes.RLE(0, run_length).cycles
single_cost = byte_cycles * num_changes
# print("Run of %d cheaper than %d singles" % (
# run_length, num_changes))
# TODO: don't allow too much error to accumulate # Total weight of differences accumulated in run
total_xor_in_run = 0
def end_run():
# Decide if it's worth emitting as a run vs single stores
run_cost = self._rle_cycles(run_length)
single_cost = byte_cycles * num_changes_in_run
# print("Run of %d cheaper than %d singles" % (
# run_length, num_changes_in_run))
if run_cost < single_cost: if run_cost < single_cost:
# Compute median bit value over run start_offset = run[0][1]
median_bits = np.median(
np.vstack(
np.unpackbits(
np.array(r[3], dtype=np.uint8)
)
for r in maybe_run
), axis=0
) > 0.5
typical_content = np.asscalar(np.packbits(median_bits)) # print("Found run of %d * %2x at %2x" % (
# run_length, cur_content, offset - run_length)
total_xor = sum(ch[0] for ch in maybe_run)
start_offset = maybe_run[0][2]
change = (total_xor, page, start_offset,
typical_content, run_length)
# print("Found run of %d * %2x at %2x:%2x" % (
# run_length, cur_content, page, offset - run_length)
# ) # )
# print(maybe_run) # print(run)
# print("change =", change) yield (
changes.add(change) total_xor_in_run, start_offset, cur_content, run_length)
else: else:
changes.update(ch for ch in maybe_run if ch[0]) for ch in run:
maybe_run = [] if ch[0]:
yield ch
for offset in range(256):
bd = bits_different[offset]
tc = target_content[offset]
if run and cur_content != tc:
# End of run
yield from end_run()
run = []
run_length = 0 run_length = 0
cur_content = target_content num_changes_in_run = 0
total_xor_in_run = 0
cur_content = tc
if cur_content is None: if cur_content is None:
cur_content = target_content cur_content = tc
run_length += 1 run_length += 1
maybe_run.append( run.append((bd, offset, tc, 1))
(bits_different, page, offset, target_content, 1)) if bd:
num_changes_in_run += 1
total_xor_in_run += bd
return changes if run:
# End of run
yield from end_run()
def _index_changes(
self,
source: screen.Bytemap,
target: screen.Bytemap) -> Iterator[Tuple[int, int, int, int, int]]:
"""Transform encoded screen to sequence of change tuples.
Change tuple is (xor_weight, page, offset, content, run_length)
"""
# TODO: work with memory maps directly?
source_memmap = memory_map.MemoryMap.to_memory_map(source.bytemap)
target_memmap = memory_map.MemoryMap.to_memory_map(target.bytemap)
# TODO: don't use 256 bytes if XMAX is smaller, or we may compute RLE
# (with bit errors) over the full page!
diff_weights = hamming_weight(source_memmap ^ target_memmap)
for page in range(32):
for change in self._index_page(
diff_weights[page], target_memmap[page]):
total_xor_in_run, start_offset, target_content, run_length = \
change
# TODO: handle screen page
yield (
total_xor_in_run, page + 32, start_offset,
target_content, run_length
)
def _emit_bytes(self, _op):
# print("%04X:" % self.stream_pos)
for b in self.state.emit(self._last_op, _op):
yield b
self.stream_pos += 1
self._last_op = _op
def emit_stream(self, ops: Iterable[opcodes.Opcode]) -> Iterator[int]:
self.cycle_counter.reset()
for op in ops:
# Keep track of where we are in TCP client socket buffer
socket_pos = self.stream_pos % 2048
if socket_pos >= 2045:
# May be about to emit a 3-byte opcode, pad out to last byte
# in frame
nops = 2047 - socket_pos
# print("At position %04x, padding with %d nops" % (
# socket_pos, nops))
for _ in range(nops):
yield from self._emit_bytes(opcodes.Nop())
yield from self._emit_bytes(opcodes.Ack())
# Ack falls through to nop
self._last_op = opcodes.Nop()
yield from self._emit_bytes(op)
if self.cycles_per_frame and (
self.cycle_counter.cycles > self.cycles_per_frame):
print("Out of cycle budget")
return
# TODO: pad to cycles_per_frame with NOPs
def done(self) -> Iterator[int]: def done(self) -> Iterator[int]:
"""Terminate opcode stream.""" """Terminate opcode stream."""
yield from self._emit_bytes(opcodes.Terminate())
yield from self.state.emit(opcodes.Terminate())

168
video_test.py Normal file
View File

@ -0,0 +1,168 @@
import unittest
import numpy as np
import opcodes
import screen
import video
class TestHammingWeight(unittest.TestCase):
    """Checks for ``video.hamming_weight`` (bit count of an 8-bit int)."""

    def testHammingWeight(self):
        """Spot-check several byte values against their known bit counts."""
        # Each entry is (expected number of set bits, input value).
        cases = (
            (0, 0),
            (1, 0b1),
            (1, 0b100),
            (3, 0b101100),
            (7, 0b11111110),
        )
        for expected, value in cases:
            self.assertEqual(expected, video.hamming_weight(value))
class TestVideo(unittest.TestCase):
    """Tests for ``video.Video.encode_frame`` on trivial frames."""

    def testEncodeEmptyFrame(self):
        """A default-constructed bitmap yields no opcodes."""
        blank = screen.HGR140Bitmap()
        encoder = video.Video()
        emitted = list(encoder.encode_frame(blank))
        self.assertEqual([], emitted)

    def testEncodeOnePixel(self):
        """A single set pixel yields page, content and store opcodes."""
        # A default bitmap instance is used only to read the dimensions.
        blank = screen.HGR140Bitmap()
        pixels = np.zeros((blank.YMAX, blank.XMAX), dtype=bool)
        pixels[0, 0] = True
        frame = screen.HGR140Bitmap(pixels)

        encoder = video.Video()

        # Expected: select page 0x20, select content byte 0x03, then store
        # at offset 0x00.
        expected = [
            opcodes.SetPage(0x20),
            opcodes.SetContent(0x03),
            opcodes.Store(0x00),
        ]
        actual = list(encoder.encode_frame(frame))
        self.assertListEqual(expected, actual)
class TestIndexPage(unittest.TestCase):
    """Tests for ``Video._index_page`` on a single 256-byte page.

    Every expected change tuple has the layout
    (total_xor_difference, start_offset, content, run_length).
    """

    @staticmethod
    def _changes(weight_source, page):
        """Index ``page`` using the hamming weights of ``weight_source``."""
        encoder = video.Video()
        weights = video.hamming_weight(weight_source)
        return list(encoder._index_page(weights, page))

    def testFullPageSameValue(self):
        """Constant data with nonzero weights should return single run"""
        page = np.full(256, 1, dtype=np.uint8)
        expected = [(256, 0, 1, 256)]
        self.assertEqual(expected, self._changes(page, page))

    def testFullPageZeroValue(self):
        """Zero data with 0 weights should return nothing"""
        page = np.zeros(256, dtype=np.uint8)
        expected = []
        self.assertEqual(expected, self._changes(page, page))

    def testFullPageZeroValueWithDiff(self):
        """Zero data with nonzero weights should return single run"""
        previous = np.full(256, 1, dtype=np.uint8)
        page = np.zeros(256, dtype=np.uint8)
        expected = [(256, 0, 0, 256)]
        self.assertEqual(expected, self._changes(previous, page))

    def testSingleRun(self):
        """Single run of nonzero data"""
        page = np.zeros(256, dtype=np.uint8)
        page[:5] = 1
        expected = [(5, 0, 1, 5)]
        self.assertEqual(expected, self._changes(page, page))

    def testTwoRuns(self):
        """Two consecutive runs of nonzero data"""
        page = np.zeros(256, dtype=np.uint8)
        page[:5] = 1
        page[5:10] = 2
        expected = [(5, 0, 1, 5), (5, 5, 2, 5)]
        self.assertEqual(expected, self._changes(page, page))

    def testShortRun(self):
        """Run that is too short to encode as RLE opcode"""
        page = np.zeros(256, dtype=np.uint8)
        page[:2] = 1
        # The two bytes come back as individual single-byte changes.
        expected = [(1, 0, 1, 1), (1, 1, 1, 1)]
        self.assertEqual(expected, self._changes(page, page))
class TestEncodeDecode(unittest.TestCase):
    """Round-trip tests: encode random frames, then check the encoder screen.

    Each test encodes one or two random bitmaps and asserts that the
    encoder's ``screen`` decodes back to the most recently encoded bitmap.
    """

    @staticmethod
    def _random_bitmap(screen_cls):
        """Return a random boolean bitmap of shape (YMAX, XMAX)."""
        # Use the builtin bool: the np.bool alias was deprecated in NumPy
        # 1.20 and removed in 1.24, where it raises AttributeError.
        return np.random.randint(
            0, 2, (screen_cls.YMAX, screen_cls.XMAX), dtype=bool)

    @staticmethod
    def _encode(encoder, screen_cls, bitmap):
        """Encode ``bitmap`` with ``encoder`` and drain the byte stream."""
        frame = screen_cls(bitmap)
        # emit_stream is lazy, so it has to be consumed for the frame to be
        # processed; the emitted bytes themselves are not inspected here.
        bytes(encoder.emit_stream(encoder.encode_frame(frame)))

    def _assert_screen_matches(self, encoder, screen_cls, bitmap):
        """Assert that the encoder's screen decodes to ``bitmap``."""
        decoded = screen_cls.from_bytemap(encoder.screen).bitmap
        self.assertTrue(np.array_equal(decoded, bitmap))

    def testEncodeDecode(self):
        """A single random frame round-trips through the encoder."""
        for _ in range(10):
            # frame_rate=1 gives the largest per-frame cycle budget;
            # presumably enough for a full frame to fit -- TODO confirm.
            encoder = video.Video(frame_rate=1)
            screen_cls = screen.HGR140Bitmap

            bitmap = self._random_bitmap(screen_cls)
            self._encode(encoder, screen_cls, bitmap)

            self._assert_screen_matches(encoder, screen_cls, bitmap)

    def testEncodeDecodeTwoFrames(self):
        """After two random frames the screen matches the second one."""
        for _ in range(10):
            encoder = video.Video(frame_rate=1)
            screen_cls = screen.HGR140Bitmap

            first = self._random_bitmap(screen_cls)
            self._encode(encoder, screen_cls, first)

            second = self._random_bitmap(screen_cls)
            self._encode(encoder, screen_cls, second)

            self._assert_screen_matches(encoder, screen_cls, second)
# Run the unittest test runner when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()