apple1-worple/packer.py

103 lines
3.0 KiB
Python

# Read in file of five-letter words and encode them into two groups
# of three-byte packed binary.
#
# Each letter is converted to a five-bit value, 0-25.
# The bits are put together into a 25-bit value, which would normally take
# up four bytes of storage. Since only bit 0 of the most-significant byte
# (the left-most of our 25 bits) is ever used there, we'll just split the
# data into two groups by that bit and only store the least-significant
# 24 bits.
#
# In other words, if the first letter is A-P (0-15), we can store that
# first letter in only four bits. Remaining letters are five bits each
# regardless, for a total of 24 bits (three bytes).
# If the first letter is Q-Z (16-25), we'll still store it in four bits,
# but in a separate list that we'll keep in a specific section of
# memory. We'll just need to be aware that words from that part of the
# list will need to have that bit put back on (i.e., have that first
# letter shifted up the alphabet by 16 letters).
import random
# Text file of input words; the script below reads it line by line.
INFILE = 'words.txt'
# Binary output for packed words whose overflow flag is 0.
OUTFILE_0 = 'words_0.bin'
# Binary output for packed words whose overflow flag is 1.
OUTFILE_1 = 'words_1.bin'
def pack(word):
    """Pack a five-letter word into three bytes.

    Surrounding whitespace is stripped and the word is uppercased. Each
    letter becomes a five-bit value (A=0 ... Z=25) and the five values are
    concatenated, first letter most significant, into a 25-bit integer.
    Only the least-significant 24 bits are stored; the 25th bit (set when
    the first letter is Q-Z) is reported separately as the overflow flag.

    Returns a two-item list: [three big-endian bytes, overflow (0 or 1)].

    Raises ValueError if the cleaned word is not exactly five characters
    long or contains a character outside A-Z.
    """
    letters = word.strip().upper()
    if len(letters) != 5:
        raise ValueError(f'"{letters}" is not five letters long')

    value = 0
    for letter in letters:
        # Position in the alphabet, 0-25
        index = ord(letter) - ord('A')
        if not 0 <= index <= 25:
            raise ValueError('Character out of range A-Z in word ' + letters)
        # Make room for five more bits, then drop the new letter in
        value = (value << 5) | index

    # Bit 24 is the one that does not fit in three bytes
    overflow = 1 if value & 0x1000000 else 0
    # Keep only the least-significant three bytes
    value &= 0xFFFFFF
    return [value.to_bytes(3, 'big'), overflow]
def head_and_tail(list, size=6):
    """Print the first and last `size` items of a sequence.

    Two lines are written to stdout: 'Head: ...' with the leading items
    and 'Tail: ...' with the trailing items. If the sequence holds fewer
    than `size` items, both lines show the whole sequence.

    list -- the sequence to preview. The name shadows the builtin and is
            kept only so existing keyword callers keep working.
    size -- how many items to show at each end (default 6).

    Returns None.
    """
    head = list[0:size]
    # list[-0:] is the WHOLE sequence, so a size of zero needs an explicit
    # empty slice to show nothing at the tail.
    tail = list[-size:] if size != 0 else list[0:0]
    print('Head: ' + str(head))
    print('Tail: ' + str(tail))
##############################################################################
# Load the raw word list, one entry per line. Line endings are still
# attached at this point; pack() strips them later.
print(f'\nReading {INFILE}')
with open(INFILE, 'r') as source:
    words = list(source)
print(f'Words read: {len(words)}')
head_and_tail(words)
print()

# Randomise the order. The seed is fixed so the shuffle is repeatable.
print('Shuffling...')
random.seed(24601)
random.shuffle(words)
head_and_tail(words)
print()

# Pack every word and file it under bucket 0 or 1 by its overflow flag
# (packing also uppercases and strips whitespace/returns)
print('Packing words...')
packed_list = [[], []]
for entry in words:
    packed_bytes, overflow = pack(entry)
    packed_list[overflow].append(packed_bytes)
for flag, bucket in enumerate(packed_list):
    print(f'Words with first bit = {flag}: {len(bucket)}')
print()

# Write each bucket to its own binary file, three bytes per word
print('Saving word lists...')
for path, bucket in zip((OUTFILE_0, OUTFILE_1), packed_list):
    with open(path, 'wb') as sink:
        sink.writelines(bucket)
print('Done\n')