"""Construct JSON representing boot1 Levenshtein distance data from the DB.

The output is a graph of the Levenshtein distances between pairs of boot1
sectors, written as JSON with two top-level keys:

* ``nodes`` -- one entry per boot1 image, carrying its ``sha1``, ``name``,
  ``radius`` (number of disks using that image) and ``group``.
* ``links`` -- one entry per pair of included images, carrying the
  ``source`` and ``target`` node indexes and their ``distance``.

The accompanying commit notes describe the graph as input for a D3
force-directed visualisation (``render.html``) and list two thresholds that
control graph size and connectivity:

* only boot1 images represented in at least 10 disks are included (applied
  here, see ``min_disks``);
* only links with distance < 200 bits are rendered (this script does not
  filter on distance, so that threshold is applied elsewhere).

Improvements listed in the commit notes:

* use human-readable names for the boot1 images;
* potentially add controls for dynamically changing the thresholds and
  other D3 parameters;
* display the list of associated disks for each image.
"""

import contextlib
import json
import sqlite3

DB_PATH = '/tank/apple2/data/apple2.db'


def _load_nodes(cursor, min_disks):
    """Return ``(nodes, sha1_indexes)`` for boot1 images on >= min_disks disks."""
    rows = cursor.execute(
        """
        select boot1_sha1, boot1.name, count(*) as c from disks
        join
        (select sha1, name from boot1) as boot1
        on disks.boot1_sha1 = boot1.sha1 group by 1;
        """
    )

    nodes = []
    sha1_indexes = {}
    for sha1, name, count in rows:
        if count < min_disks:
            continue
        idx = len(nodes)
        sha1_indexes[sha1] = idx
        # "group" is deliberately kept one greater than the node's index,
        # matching the values this script has always emitted.
        nodes.append(
            {"sha1": sha1, "name": name, "radius": count, "group": idx + 1}
        )
    return nodes, sha1_indexes


def _load_links(cursor, sha1_indexes):
    """Return one link per stored pair whose endpoints are both in sha1_indexes."""
    rows = cursor.execute(
        """
        select source, target, distance from boot1distances;
        """
    )

    links = []
    for source, target, distance in rows:
        # Only rows with source > target are used, so a pair stored in both
        # directions yields a single link.
        if not source > target:
            continue
        try:
            source_idx = sha1_indexes[source]
            target_idx = sha1_indexes[target]
        except KeyError:
            # Source or target is not common enough to include
            continue
        links.append(
            {"source": source_idx, "target": target_idx, "distance": distance}
        )
    return links


def main(db_path=DB_PATH, out_path="levenshtein.json", min_disks=10):
    """Read boot1 data from the SQLite DB and write the graph as JSON.

    Args:
        db_path: Path of the SQLite database to read. It must provide the
            ``disks``, ``boot1`` and ``boot1distances`` tables queried by
            the helpers above.
        out_path: Path of the JSON file to (over)write.
        min_disks: Minimum number of disks a boot1 image must appear on to
            become a node; links touching excluded images are dropped.

    Calling ``main()`` with no arguments keeps the original behaviour:
    read ``DB_PATH`` and write ``levenshtein.json`` in the current directory.
    """
    # sqlite3's own context manager only scopes a transaction; closing()
    # is what actually releases the connection.
    with contextlib.closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()
        nodes, sha1_indexes = _load_nodes(cursor, min_disks)
        links = _load_links(cursor, sha1_indexes)

    graph = {
        "nodes": nodes,
        "links": links,
    }

    # open() works on both Python 2 and 3 (the old file() builtin is
    # Python 2 only), and the with-block guarantees the data is flushed.
    with open(out_path, "w") as out:
        json.dump(graph, out, indent=4, separators=(',', ': '))


if __name__ == "__main__":
    main()