2021-08-25 12:33:05 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import base64
|
|
|
|
import hashlib
|
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
|
|
|
# Hashes of sentences already seen in extraction mode, used to mark dupes.
seenhashes = set()

# Number of "HASH<TAB>sentence" lines printed for the current <script> node;
# used by trav() to decide whether to print a separating blank line.
scriptlines = 0

# True when a corrections file was supplied on the command line: rewrite
# sentences from new_hashes instead of printing their hashes.
changing = False

# hashcode -> corrected sentence, loaded from the corrections file.
new_hashes = {}
|
2021-08-25 12:33:05 +00:00
|
|
|
|
2021-09-02 16:21:18 +00:00
|
|
|
###############################################################################
|
2021-08-25 12:33:05 +00:00
|
|
|
def hashstr(s):
    """Return a short, stable fingerprint of string *s*.

    The fingerprint is the base32 encoding of the first 6 bytes of the MD5
    digest, with the trailing '=' padding removed — always 10 characters
    from the [A-Z2-7] base32 alphabet.  (MD5 is used only as a fast
    non-cryptographic hash here.)
    """
    digest = hashlib.md5(s.encode()).digest()
    encoded = base64.b32encode(digest[:6]).decode()
    return encoded.rstrip('=')
|
|
|
|
|
2021-09-02 16:21:18 +00:00
|
|
|
###############################################################################
|
2021-08-25 12:33:05 +00:00
|
|
|
def processsentence(sentence):
    """Handle a single sentence in either mode.

    In changing mode, look the sentence up by hash in new_hashes and return
    the corrected text (falling back to the original if the hash is
    unknown).  In extraction mode, print "HASH<TAB>sentence" (or
    "HASH<TAB><dupe>" for repeats) and return the sentence unchanged.

    Returns the (possibly corrected) sentence text in BOTH modes.
    Side effects: prints to stdout; updates seenhashes and scriptlines.
    """
    global scriptlines

    h = hashstr(sentence)

    if changing:
        if h not in new_hashes:
            # The corrections file doesn't know this sentence (e.g. the XML
            # changed since extraction); warn and keep the original text.
            print(f"hmm, hash {h!r} is missing from new_hashes. Sentence: {sentence!r}")
            return sentence
        return new_hashes[h]
    else:
        if h in seenhashes:
            print(f"{h}\t<dupe>")
        else:
            print(f"{h}\t{sentence}")
            scriptlines += 1
        seenhashes.add(h)
        # Bug fix: this branch previously fell off the end and returned
        # None, which crashed processnode's spacing code (None.strip())
        # in extraction mode.
        return sentence
|
|
|
|
|
2021-09-02 16:21:18 +00:00
|
|
|
###############################################################################
|
2021-08-25 12:33:05 +00:00
|
|
|
def processnode(textfield):
    """Split an XML text field into sentences and process each one.

    The text is first split on literal "^m"/"^M" markers (line separators in
    this document format), then each piece is split into sentences on
    terminal punctuation (avoiding common abbreviations like "Mr.").  Each
    sentence goes through processsentence(); in changing mode the node's
    text is rebuilt from the corrected sentences, preserving each
    sentence's original leading/trailing whitespace.
    """
    text = textfield.text
    if text is None:
        return

    # Capturing split: even indices are text chunks, odd indices separators.
    lines = re.split(r'(\^[mM])', text)
    out = []
    for i in range(0, len(lines), 2):
        line = lines[i]
        sep = lines[i + 1] if i + 1 < len(lines) else None

        # Same even/odd layout: parts[j] is sentence body, parts[j+1] its
        # terminal punctuation (with optional closing quote/paren).
        parts = re.split(r'((?<!Mr)(?<!Mrs)(?<!Ms)(?<!Dr)[.!?…]+[)”"=]?\s*)', line)
        for j in range(0, len(parts), 2):
            sentence = parts[j] + (parts[j + 1] if j + 1 < len(parts) else '')
            if sentence == "":
                continue
            newsent = processsentence(sentence)
            if changing:
                # Retain exact spacing from the old sentence, since it is
                # hard to see in the correction doc.  (Only needed in
                # changing mode — extraction mode discards the rebuilt
                # text, and doing this unconditionally crashed when
                # processsentence returned None.)
                oldstartsp = re.match(r'^\s*', sentence)[0]
                oldendsp = re.search(r'\s*$', sentence)[0]
                out.append(oldstartsp + newsent.strip() + oldendsp)
        if changing and sep:
            out.append(sep)

    if changing:
        textfield.text = "".join(out)
|
2021-08-25 12:33:05 +00:00
|
|
|
|
2021-09-02 16:21:18 +00:00
|
|
|
###############################################################################
|
2021-08-25 12:33:05 +00:00
|
|
|
def trav(node, parentpath):
    """Depth-first walk of the XML tree, processing matching text fields.

    parentpath is the list of tag labels from the root down to (and
    excluding) this node; text fields are processed only when the joined
    path matches the block:text_print/block:text_story pattern.
    """
    global scriptlines

    # A fresh <script> starts a new group of output lines; separate groups
    # with a blank line if the previous one produced any output.
    if node.tag.endswith("script"):
        if scriptlines > 0:
            print()
        scriptlines = 0

    # Label is the tag (sans the {outlaw} namespace), plus ":type" if the
    # node carries a type attribute.
    label = node.tag.replace('{outlaw}', '')
    if 'type' in node.attrib:
        label += ':' + node.attrib['type']
    path = parentpath + [label]

    # "next" nodes chain siblings; drop the last two components so chained
    # blocks keep their logical parent's path.
    if node.tag.endswith("next"):
        path = path[:-2]

    if re.search(r'block:text_(print|story).*block:text.*field', ";".join(path)):
        processnode(node)

    for child in node:
        trav(child, path)
|
|
|
|
|
2021-09-02 16:21:18 +00:00
|
|
|
###############################################################################
|
|
|
|
def read_hashes(filename):
    """Parse a corrected-sentences file into a {hashcode: sentence} dict.

    Each non-blank line is "<10-char hash><8 spaces><sentence>".  Lines
    whose sentence is "<dupe>" are skipped.  A Word byte-order mark, if
    present, is stripped.  Raises AssertionError on unparseable lines.
    """
    out = {}
    with open(filename, "r") as io:
        # Iterate the file lazily instead of materializing readlines().
        for line in io:
            if line == '\n':
                continue
            line = line.replace("\ufeff", "")  # Get rid of byte order mark from Word
            # \n? makes the last line parse even without a trailing newline.
            m = re.match(r'^(?P<hashcode>[A-Z0-9]{10}) {8}(?P<sentence>.*)\n?$', line)
            assert m, f"Can't parse {line!r}"
            if m['sentence'] != "<dupe>":
                out[m['hashcode']] = m['sentence']
    return out
|
|
|
|
|
|
|
|
###############################################################################
|
|
|
|
# From https://stackoverflow.com/questions/54439309/how-to-preserve-namespaces-when-parsing-xml-via-elementtree-in-python
|
|
|
|
def register_all_namespaces(filename):
    """Globally register every namespace prefix declared in *filename*.

    This makes ElementTree re-serialize the document with its original
    prefixes instead of synthesizing ns0, ns1, ...
    From https://stackoverflow.com/questions/54439309/how-to-preserve-namespaces-when-parsing-xml-via-elementtree-in-python
    """
    declared = dict(ns for _, ns in ET.iterparse(filename, events=['start-ns']))
    for prefix, uri in declared.items():
        ET.register_namespace(prefix, uri)
|
|
|
|
|
|
|
|
###############################################################################
|
|
|
|
|
|
|
|
# Main script.
#
# Usage:
#   prog infile            -> extraction mode: print "HASH<TAB>sentence"
#                             lines for proofreading
#   prog infile newhashes  -> changing mode: replace sentences from the
#                             corrected hash file and write out.xml
if len(sys.argv) not in (2, 3):
    sys.exit(f"usage: {sys.argv[0]} infile [corrected-hashes-file]")

register_all_namespaces(sys.argv[1])

tree = ET.parse(sys.argv[1])

if len(sys.argv) == 3:  # progname infile newhashes
    new_hashes = read_hashes(sys.argv[2])
    changing = True
else:
    changing = False

trav(tree.getroot(), [])

if changing:
    print("Writing 'out.xml'.")
    with open('out.xml', 'wb') as io:
        # ElementTree omits the XML declaration for UTF-8 by default, so
        # write our own (with standalone="yes") to match the original
        # document's header — no double declaration results.
        io.write("""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n""".encode())
        tree.write(io, encoding='utf-8')
|