From 65a118d1f386858f474c20b90516d3de81592ee2 Mon Sep 17 00:00:00 2001
From: Thomas Harte <thomas.harte@gmail.com>
Date: Wed, 4 Dec 2024 21:41:05 -0500
Subject: [PATCH] Attempt to locate and disassemble machine code.

---
 Analyser/Static/Commodore/File.cpp            |  47 -------
 Analyser/Static/Commodore/File.hpp            |   2 -
 Analyser/Static/Commodore/StaticAnalyser.cpp  | 125 ++++++++++++++++--
 Analyser/Static/StaticAnalyser.hpp            |   2 +-
 .../Clock Signal.xcodeproj/project.pbxproj    |   8 --
 Storage/Cartridge/Formats/PRG.cpp             |   4 +-
 6 files changed, 116 insertions(+), 72 deletions(-)
 delete mode 100644 Analyser/Static/Commodore/File.cpp

diff --git a/Analyser/Static/Commodore/File.cpp b/Analyser/Static/Commodore/File.cpp
deleted file mode 100644
index 6bcb9bfa6..000000000
--- a/Analyser/Static/Commodore/File.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//
-//  File.cpp
-//  Clock Signal
-//
-//  Created by Thomas Harte on 10/09/2016.
-//  Copyright 2016 Thomas Harte. All rights reserved.
-//
-
-#include "File.hpp"
-
-bool Analyser::Static::Commodore::File::is_basic() {
-	// BASIC files are always relocatable (?)
-	if(type != File::RelocatableProgram) return false;
-
-	uint16_t line_address = starting_address;
-	int line_number = -1;
-
-	// decide whether this is a BASIC file based on the proposition that:
-	//	(1) they're always relocatable; and
-	//	(2) they have a per-line structure of:
-	//		[4 bytes: address of start of next line]
-	//		[4 bytes: this line number]
-	//		... null-terminated code ...
-	//	(with a next line address of 0000 indicating end of program)
-	while(1) {
-		if(size_t(line_address - starting_address) + 1 >= data.size()) break;
-
-		uint16_t next_line_address = data[line_address - starting_address];
-		next_line_address |= data[line_address - starting_address + 1] << 8;
-
-		if(!next_line_address) {
-			return true;
-		}
-		if(next_line_address < line_address + 5) break;
-
-		if(size_t(line_address - starting_address) + 3 >= data.size()) break;
-		uint16_t next_line_number = data[line_address - starting_address + 2];
-		next_line_number |= data[line_address - starting_address + 3] << 8;
-
-		if(next_line_number <= line_number) break;
-
-		line_number = uint16_t(next_line_number);
-		line_address = next_line_address;
-	}
-
-	return false;
-}
diff --git a/Analyser/Static/Commodore/File.hpp b/Analyser/Static/Commodore/File.hpp
index e84b00d50..35394ce11 100644
--- a/Analyser/Static/Commodore/File.hpp
+++ b/Analyser/Static/Commodore/File.hpp
@@ -29,8 +29,6 @@ struct File {
 		Relative
 	} type;
 	std::vector<uint8_t> data;
-
-	bool is_basic();
 };
 
 }
diff --git a/Analyser/Static/Commodore/StaticAnalyser.cpp b/Analyser/Static/Commodore/StaticAnalyser.cpp
index dbe78bbc5..f4da89d1e 100644
--- a/Analyser/Static/Commodore/StaticAnalyser.cpp
+++ b/Analyser/Static/Commodore/StaticAnalyser.cpp
@@ -15,23 +15,28 @@
 #include "../../../Storage/Cartridge/Encodings/CommodoreROM.hpp"
 #include "../../../Outputs/Log.hpp"
 
+#include "../Disassembler/6502.hpp"
+#include "../Disassembler/AddressMapper.hpp"
+
 #include <algorithm>
 #include <cstring>
 #include <sstream>
 
 using namespace Analyser::Static::Commodore;
 
-static std::vector<std::shared_ptr<Storage::Cartridge::Cartridge>>
+namespace {
+
+std::vector<std::shared_ptr<Storage::Cartridge::Cartridge>>
 Vic20CartridgesFrom(const std::vector<std::shared_ptr<Storage::Cartridge::Cartridge>> &cartridges) {
 	std::vector<std::shared_ptr<Storage::Cartridge::Cartridge>> vic20_cartridges;
 
 	for(const auto &cartridge : cartridges) {
 		const auto &segments = cartridge->get_segments();
 
-		// only one mapped item is allowed
+		// Only one mapped item is allowed ...
 		if(segments.size() != 1) continue;
 
-		// which must be 16 kb in size
+		// ... which must be 16 kb in size.
 		Storage::Cartridge::Cartridge::Segment segment = segments.front();
 		if(segment.start_address != 0xa000) continue;
 		if(!Storage::Cartridge::Encodings::CommodoreROM::isROM(segment.data)) continue;
@@ -39,28 +44,111 @@ Vic20CartridgesFrom(const std::vector<std::shared_ptr<Storage::Cartridge::Cartri
 		vic20_cartridges.push_back(cartridge);
 	}
 
+	// TODO: other machines?
+
 	return vic20_cartridges;
 }
 
+struct BASICAnalysis {	// Result of scanning a file as a BASIC program; produced by analyse().
+	enum class Version {
+		BASIC2,
+		BASIC4,
+		BASIC3_5,
+	} minimum_version = Version::BASIC2;	// Defaults to BASIC2. NOTE(review): nothing visible here ever updates it.
+	std::vector<uint16_t> machine_code_addresses;	// Addresses parsed from the arguments of SYS statements.
+};
+
+// Scans @c file as a tokenised BASIC program, recording the argument of each SYS statement.
+// Returns std::nullopt if the file isn't a program type or a line header runs off the end of its data.
+std::optional<BASICAnalysis> analyse(const File &file) {
+	// Accept only 'program' types.
+	if(file.type != File::RelocatableProgram && file.type != File::NonRelocatableProgram) {
+		return std::nullopt;
+	}
+
+	const auto &data = file.data;
+	const auto word = [&](size_t offset) { return uint16_t(data[offset] | (data[offset + 1] << 8)); };
+
+	// BASIC programs have a per-line structure of:
+	//		[2 bytes: address of start of next line]
+	//		[2 bytes: this line number]
+	//		... null-terminated code ...
+	//	(with a next line address of 0000 indicating end of program)
+	//
+	// The decimal argument of each SYS encountered is treated as a machine code entry point.
+	BASICAnalysis analysis;
+	uint16_t line_address = file.starting_address;
+	while(true) {
+		// Links are only ever followed forwards (see below), so this offset cannot be negative.
+		const size_t line = size_t(line_address - file.starting_address);
+
+		// The link must be readable; a zero link ends the program, with no line following it.
+		if(line + 1 >= data.size()) {
+			return std::nullopt;
+		}
+		const auto next_line_address = word(line);
+		if(!next_line_address) {
+			break;
+		}
+
+		// Analysis has failed if there isn't at least one complete BASIC line from here.
+		if(line + 5 >= data.size()) {
+			return std::nullopt;
+		}
+
+		// The line's code follows its link and line number; reads past the end of data yield 0.
+		size_t code = line + 4;
+		const auto next = [&]() -> uint8_t {
+			return code < data.size() ? data[code++] : 0;
+		};
+
+		uint8_t token = next();
+		while(token) {
+			if(token != 0x9e) {
+				token = next();
+				continue;
+			}
+
+			// SYS; skip any spaces, then parse the following ASCII digits (none leaves 0).
+			do { token = next(); } while(token == ' ');
+			uint16_t address = 0;
+			for(; token >= '0' && token <= '9'; token = next()) {
+				address = uint16_t((address * 10) + (token - '0'));
+			}
+			// The byte that ended the number is still in token and is examined by the loop
+			// condition next, so a line terminator there ends this line rather than being skipped.
+			analysis.machine_code_addresses.push_back(address);
+		}
+
+		// Stop at a link that doesn't move forwards: following it could index before the start
+		// of the data or, if the chain is circular, loop forever.
+		if(next_line_address <= line_address) {
+			break;
+		}
+		line_address = next_line_address;
+	}
+
+	return analysis;
+}
+
+}
+
 Analyser::Static::TargetList Analyser::Static::Commodore::GetTargets(
 	const Media &media,
 	const std::string &file_name,
 	TargetPlatform::IntType
 ) {
 	TargetList destination;
-
 	auto target = std::make_unique<Target>();
-	target->machine = Machine::Vic20;	// TODO: machine estimation
-	target->confidence = 0.5; // TODO: a proper estimation
 
 	int device = 0;
 	std::vector<File> files;
 	bool is_disk = false;
 
-	// strip out inappropriate cartridges
+	// Strip out inappropriate cartridges.
 	target->media.cartridges = Vic20CartridgesFrom(media.cartridges);
 
-	// check disks
+	// Find all valid Commodore files on disks.
 	for(auto &disk : media.disks) {
 		std::vector<File> disk_files = GetFiles(disk);
 		if(!disk_files.empty()) {
@@ -71,7 +159,7 @@ Analyser::Static::TargetList Analyser::Static::Commodore::GetTargets(
 		}
 	}
 
-	// check tapes
+	// Find all valid Commodore files on tapes.
 	for(auto &tape : media.tapes) {
 		std::vector<File> tape_files = GetFiles(tape);
 		tape->reset();
@@ -82,15 +170,28 @@ Analyser::Static::TargetList Analyser::Static::Commodore::GetTargets(
 		}
 	}
 
+	// Inspect discovered files to try to divine machine and memory model.
 	if(!files.empty()) {
+		const auto &file = files.front();
+
 		auto memory_model = Target::MemoryModel::Unexpanded;
 		std::ostringstream string_stream;
 		string_stream << "LOAD\"" << (is_disk ? "*" : "") << "\"," << device << ",";
-		if(files.front().is_basic()) {
-			string_stream << "0";
-		} else {
+
+		// Only BASIC with no SYS gets a relocating load (",0"); anything else loads where it says (",1").
+		const auto analysis = analyse(file);
+		if(analysis && analysis->machine_code_addresses.empty()) {
+			string_stream << "0";
+		} else {
 			string_stream << "1";
+			if(analysis) {
+				[[maybe_unused]] const auto disassembly = Analyser::Static::MOS6502::Disassemble(
+					file.data,
+					Analyser::Static::Disassembler::OffsetMapper(file.starting_address),
+					analysis->machine_code_addresses
+				);	// TODO: inspect the disassembly.
+			}
 		}
 		string_stream << "\nRUN\n";
 		target->loading_command = string_stream.str();
 
diff --git a/Analyser/Static/StaticAnalyser.hpp b/Analyser/Static/StaticAnalyser.hpp
index 8e8a1648a..acfe1d9f9 100644
--- a/Analyser/Static/StaticAnalyser.hpp
+++ b/Analyser/Static/StaticAnalyser.hpp
@@ -64,7 +64,7 @@ struct Target {
 
 	Machine machine;
 	Media media;
-	float confidence = 0.0f;
+	float confidence = 0.5f;
 };
 typedef std::vector<std::unique_ptr<Target>> TargetList;
 
diff --git a/OSBindings/Mac/Clock Signal.xcodeproj/project.pbxproj b/OSBindings/Mac/Clock Signal.xcodeproj/project.pbxproj
index aae7c1a32..2eb27a5c2 100644
--- a/OSBindings/Mac/Clock Signal.xcodeproj/project.pbxproj	
+++ b/OSBindings/Mac/Clock Signal.xcodeproj/project.pbxproj	
@@ -565,7 +565,6 @@
 		4B778F5E23A5F3230000D260 /* Oric.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B8805F91DCFF807003085B1 /* Oric.cpp */; };
 		4B778F6023A5F3460000D260 /* Disk.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B8944EC201967B4007DE474 /* Disk.cpp */; };
 		4B778F6123A5F3560000D260 /* Disk.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B8944FC201967B4007DE474 /* Disk.cpp */; };
-		4B778F6223A5F35F0000D260 /* File.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B894500201967B4007DE474 /* File.cpp */; };
 		4B778F6323A5F3630000D260 /* Tape.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B894501201967B4007DE474 /* Tape.cpp */; };
 		4B7962A02819681F008130F9 /* Decoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B79629F2819681F008130F9 /* Decoder.cpp */; };
 		4B7962A12819681F008130F9 /* Decoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B79629F2819681F008130F9 /* Decoder.cpp */; };
@@ -639,8 +638,6 @@
 		4B894527201967B4007DE474 /* StaticAnalyser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B8944FA201967B4007DE474 /* StaticAnalyser.cpp */; };
 		4B894528201967B4007DE474 /* Disk.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B8944FC201967B4007DE474 /* Disk.cpp */; };
 		4B894529201967B4007DE474 /* Disk.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B8944FC201967B4007DE474 /* Disk.cpp */; };
-		4B89452A201967B4007DE474 /* File.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B894500201967B4007DE474 /* File.cpp */; };
-		4B89452B201967B4007DE474 /* File.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B894500201967B4007DE474 /* File.cpp */; };
 		4B89452C201967B4007DE474 /* Tape.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B894501201967B4007DE474 /* Tape.cpp */; };
 		4B89452D201967B4007DE474 /* Tape.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B894501201967B4007DE474 /* Tape.cpp */; };
 		4B89452E201967B4007DE474 /* StaticAnalyser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4B894503201967B4007DE474 /* StaticAnalyser.cpp */; };
@@ -1756,7 +1753,6 @@
 		4B8944FD201967B4007DE474 /* StaticAnalyser.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StaticAnalyser.hpp; sourceTree = "<group>"; };
 		4B8944FE201967B4007DE474 /* File.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = File.hpp; sourceTree = "<group>"; };
 		4B8944FF201967B4007DE474 /* Tape.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = Tape.hpp; sourceTree = "<group>"; };
-		4B894500201967B4007DE474 /* File.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = File.cpp; sourceTree = "<group>"; };
 		4B894501201967B4007DE474 /* Tape.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Tape.cpp; sourceTree = "<group>"; };
 		4B894502201967B4007DE474 /* Disk.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = Disk.hpp; sourceTree = "<group>"; };
 		4B894503201967B4007DE474 /* StaticAnalyser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StaticAnalyser.cpp; sourceTree = "<group>"; };
@@ -3810,7 +3806,6 @@
 			isa = PBXGroup;
 			children = (
 				4B8944FC201967B4007DE474 /* Disk.cpp */,
-				4B894500201967B4007DE474 /* File.cpp */,
 				4B894503201967B4007DE474 /* StaticAnalyser.cpp */,
 				4B894501201967B4007DE474 /* Tape.cpp */,
 				4B894502201967B4007DE474 /* Disk.hpp */,
@@ -6006,7 +6001,6 @@
 				4B055AA31FAE85DF0060FFFF /* ImplicitSectors.cpp in Sources */,
 				4B8318B322D3E540006DB630 /* Audio.cpp in Sources */,
 				4B055AAE1FAE85FD0060FFFF /* TrackSerialiser.cpp in Sources */,
-				4B89452B201967B4007DE474 /* File.cpp in Sources */,
 				4B6AAEAC230E40250078E864 /* SCSI.cpp in Sources */,
 				4B055A981FAE85C50060FFFF /* Drive.cpp in Sources */,
 				4BD424E62193B5830097291A /* Shader.cpp in Sources */,
@@ -6331,7 +6325,6 @@
 				4B228CD524D773B40077EF25 /* CSScanTarget.mm in Sources */,
 				4BCD634922D6756400F567F1 /* MacintoshDoubleDensityDrive.cpp in Sources */,
 				4B0F94FE208C1A1600FE41D9 /* NIB.cpp in Sources */,
-				4B89452A201967B4007DE474 /* File.cpp in Sources */,
 				4BC080D026A257A200D03FD8 /* StaticAnalyser.cpp in Sources */,
 				4B4DC8211D2C2425003C5BF8 /* Vic20.cpp in Sources */,
 				4B71368E1F788112008B8ED9 /* Parser.cpp in Sources */,
@@ -6581,7 +6574,6 @@
 				4BEDA3BB25B25563000C2DBD /* Decoder.cpp in Sources */,
 				4B778F2423A5EDEE0000D260 /* PRG.cpp in Sources */,
 				4B778F5A23A5F2D50000D260 /* 6502.cpp in Sources */,
-				4B778F6223A5F35F0000D260 /* File.cpp in Sources */,
 				4B06AB0F2C6461780034D014 /* MultiProducer.cpp in Sources */,
 				4B778F3523A5F1040000D260 /* SCSI.cpp in Sources */,
 				4BD388882239E198002D14B5 /* 68000Tests.mm in Sources */,
diff --git a/Storage/Cartridge/Formats/PRG.cpp b/Storage/Cartridge/Formats/PRG.cpp
index 0bcc4af28..de94446ac 100644
--- a/Storage/Cartridge/Formats/PRG.cpp
+++ b/Storage/Cartridge/Formats/PRG.cpp
@@ -29,11 +29,11 @@ PRG::PRG(const std::string &file_name) {
 	int loading_address = fgetc(file);
 	loading_address |= fgetc(file) << 8;
 
-	std::size_t data_length = size_t(file_stats.st_size) - 2;
+	const std::size_t data_length = size_t(file_stats.st_size) - 2;
 	std::size_t padded_data_length = 1;
 	while(padded_data_length < data_length) padded_data_length <<= 1;
 	std::vector<uint8_t> contents(padded_data_length);
-	std::size_t length = std::fread(contents.data(), 1, size_t(data_length), file);
+	const std::size_t length = std::fread(contents.data(), 1, size_t(data_length), file);
 	std::fclose(file);
 
 	// accept only files intended to load at 0xa000