refactored last commit

git-svn-id: https://svn.code.sf.net/p/acme-crossass/code-0/trunk@421 4df02467-bbd4-4a76-a152-e7ce94205b78
2026-04-26 14:17:43 +00:00 · 2024-09-05 11:48:06 +00:00
parent ad9ef576ec
commit df8e52b544
2 changed files with 45 additions and 33 deletions
@@ -21,7 +21,6 @@
 // values for current_input.state
 enum inputstate {
 	INPUTSTATE_SOF,		// start of file (check for BOM and hashbang)
-	INPUTSTATE_BOMKLUGE,	// send second byte of non-BOM and then the byte read from file
 	INPUTSTATE_NORMAL,	// everything's fine
 	INPUTSTATE_AGAIN,	// re-process last byte
 	INPUTSTATE_SKIPBLANKS,	// shrink multiple spaces
@@ -31,6 +30,8 @@ enum inputstate {
 	INPUTSTATE_COMMENT,	// skip characters until newline or EOF
 	INPUTSTATE_EOB,		// send end-of-block after end-of-statement
 	INPUTSTATE_EOF,		// send end-of-file after end-of-statement
+	INPUTSTATE_BOMCHECK,	// first byte in file looks like first byte of BOM
+	INPUTSTATE_BOMFAIL,	// send second byte of non-BOM and then the byte read from file
 };


@@ -162,6 +163,13 @@ static void report_srcchar(int new_char)
 }


+// the byte order mark (Unicode code point 0xfeff) becomes 0xef 0xbb 0xbf when
+// encoded in UTF-8. though UTF-8 files no not need any byte order mark, some
+// text editors add it anyway. these definitions are used to ignore it instead
+// of treating it as a label:
+#define UTF8BOM_1ST	0xef
+#define UTF8BOM_2ND	0xbb
+#define UTF8BOM_3RD	0xbf
 // deliver source code from current file (!) in shortened high-level format
 static char get_processed_from_file(void)
 {
@@ -172,33 +180,10 @@ static char get_processed_from_file(void)
 		case INPUTSTATE_SOF:
 			// fetch first byte from the current source file
 			from_file = getc(current_input.u.fd);
-			// check for bogus/malformed BOM (0xef 0xbb 0xbf as UTF-8-encoded 0xfeff)
-			if (from_file == 0xef) {
-				// first byte looks like BOM, so check second:
-				from_file = getc(current_input.u.fd);
-				if (from_file == 0xbb) {
-					// first two bytes look like BOM, so check third:
-					from_file = getc(current_input.u.fd);
-					if (from_file == 0xbf) {
-						// found BOM, so ignore
-						current_input.state = INPUTSTATE_NORMAL;
-						break;
-					} else {
-						// third byte does not match, so return first byte and make sure the others are delivered later
-						IF_WANTED_REPORT_SRCCHAR(0xef);
-						IF_WANTED_REPORT_SRCCHAR(0xbb);
-						IF_WANTED_REPORT_SRCCHAR(from_file);
-						current_input.state = INPUTSTATE_BOMKLUGE;
-						return 0xef;
-					}
-				} else {
-					// second byte does not match, so return first byte and remember to re-process second:
-					IF_WANTED_REPORT_SRCCHAR(0xef);
-					IF_WANTED_REPORT_SRCCHAR(from_file);
-					current_input.state = INPUTSTATE_AGAIN;
-					return 0xef;
-				}
-				BUG("InputBOM", 0);
+			// check for first byte of UTF-8-encoded BOM
+			if (from_file == UTF8BOM_1ST) {
+				current_input.state = INPUTSTATE_BOMCHECK;
+				break;
 			}
 			IF_WANTED_REPORT_SRCCHAR(from_file);
 			// check for hashbang line and ignore
@@ -209,10 +194,6 @@ static char get_processed_from_file(void)
 			}
 			current_input.state = INPUTSTATE_AGAIN;
 			break;
-		case INPUTSTATE_BOMKLUGE:
-			// send second byte of non-BOM and then the byte read from file
-			current_input.state = INPUTSTATE_AGAIN;
-			return 0xbb;
 		case INPUTSTATE_NORMAL:
 			// fetch a fresh byte from the current source file
 			from_file = getc(current_input.u.fd);
@@ -336,6 +317,37 @@ static char get_processed_from_file(void)
 			current_input.state = INPUTSTATE_NORMAL;
 			return CHAR_EOF;	// end of file

+// two cases for BOM:
+		case INPUTSTATE_BOMCHECK:
+			// first byte matches BOM, so check second:
+			from_file = getc(current_input.u.fd);
+			if (from_file != UTF8BOM_2ND) {
+				// second byte does not match, so return first byte and remember to re-process second:
+				IF_WANTED_REPORT_SRCCHAR(UTF8BOM_1ST);
+				IF_WANTED_REPORT_SRCCHAR(from_file);
+				current_input.state = INPUTSTATE_AGAIN;
+				return UTF8BOM_1ST;
+			}
+			// first two bytes match BOM, so check third:
+			from_file = getc(current_input.u.fd);
+			if (from_file != UTF8BOM_3RD) {
+				// third byte does not match, so return first byte and make sure the others are delivered later:
+				IF_WANTED_REPORT_SRCCHAR(UTF8BOM_1ST);
+				IF_WANTED_REPORT_SRCCHAR(UTF8BOM_2ND);
+				IF_WANTED_REPORT_SRCCHAR(from_file);
+				current_input.state = INPUTSTATE_BOMFAIL;	// next time, deliver second byte
+				return UTF8BOM_1ST;
+			}
+			// found three-byte BOM, so ignore by starting normally with next byte:
+			current_input.state = INPUTSTATE_NORMAL;
+			break;
+
+		case INPUTSTATE_BOMFAIL:
+			// third byte did not match BOM. first byte has already been delivered.
+			// deliver second byte and remember to re-process third:
+			current_input.state = INPUTSTATE_AGAIN;	// next time, deliver byte last read
+			return UTF8BOM_2ND;
+
 		default:
 			BUG("StrangeInputMode", current_input.state);
 		}
@@ -9,7 +9,7 @@

 #define RELEASE		"0.97"		// update before release	FIXME
 #define CODENAME	"Zem"		// update before release
-#define CHANGE_DATE	"23 Aug"	// update before release	FIXME
+#define CHANGE_DATE	"24 Aug"	// update before release	FIXME
 #define CHANGE_YEAR	"2024"		// update before release
 //#define HOME_PAGE	"http://home.pages.de/~mac_bacon/smorbrod/acme/"
 #define HOME_PAGE	"http://sourceforge.net/p/acme-crossass/"	// FIXME