Parse character map information

This adds associations between scripts, regions, and character maps, and parses them from a Go utility.
2024-11-24 17:31:40 +00:00 · 2022-03-15 13:38:45 -04:00 · 2022-03-15 13:38:45 -04:00 · 022d11fa14
commit 022d11fa14
parent 7bc44f4a5a
9 changed files with 270 additions and 284 deletions
--- a/gen/.gitignore
+++ b/gen/.gitignore
@ -1 +1 @@
-/macroman
+/macscript
--- a/gen/data.go
+++ b/gen/data.go
@ -0,0 +1,186 @@
+package main
+
+import (
+	"encoding/csv"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// isIdent matches names that start with an ASCII letter and continue with
+// ASCII letters, digits, or underscores.
+var isIdent = regexp.MustCompile("^[a-zA-Z][_a-zA-Z0-9]*$")
+
+// A dataError indicates an error in the contents of one of the data files.
+// It carries the location of the problem so the message can be printed as
+// "filename:line:column: message".
+type dataError struct {
+	filename string // path of the data file
+	line     int    // line of the problem, or 0 if unknown
+	column   int    // column of the problem, or 0 if unknown
+	err      error  // the underlying problem
+}
+
+// Error formats the error as "filename[:line[:column]]: message". The line is
+// omitted when it is zero, and the column is printed only when both the line
+// and the column are nonzero.
+func (e *dataError) Error() string {
+	var b strings.Builder
+	b.WriteString(e.filename)
+	if e.line != 0 {
+		b.WriteByte(':')
+		b.WriteString(strconv.Itoa(e.line))
+		if e.column != 0 {
+			b.WriteByte(':')
+			b.WriteString(strconv.Itoa(e.column))
+		}
+	}
+	b.WriteString(": ")
+	b.WriteString(e.err.Error())
+	return b.String()
+}
+
+// Unwrap returns the underlying error so that errors.Is and errors.As can
+// inspect the cause behind the location information.
+func (e *dataError) Unwrap() error {
+	return e.err
+}
+
+// readHeader reads the header row of a CSV file and checks that the leading
+// columns have the given names, compared case-insensitively. Columns after
+// the named ones are not checked. The filename is used only to give context
+// in error messages.
+//
+// An empty file is reported as a missing header row, and any other read
+// error is prefixed with the file name.
+func readHeader(filename string, r *csv.Reader, names ...string) error {
+	row, err := r.Read()
+	if err != nil {
+		if err == io.EOF {
+			// Without this an empty file surfaces as a bare "EOF".
+			return &dataError{filename, 0, 0, errors.New("missing header row")}
+		}
+		return fmt.Errorf("%s: %w", filename, err)
+	}
+	for i, name := range names {
+		if len(row) <= i {
+			// The column does not exist, so only the line can be reported.
+			line, _ := r.FieldPos(0)
+			return &dataError{filename, line, 0, fmt.Errorf("missing column: %q", name)}
+		}
+		cname := row[i]
+		if !strings.EqualFold(name, cname) {
+			line, col := r.FieldPos(i)
+			return &dataError{filename, line, col, fmt.Errorf("column name is %q, expected %q", cname, name)}
+		}
+	}
+	return nil
+}
+
+// A constmap is a map between names and integer values.
+type constmap struct {
+	names  map[string]int // value for each name
+	values map[int]string // a name for each value; readConsts keeps the first one read
+}
+
+// readConsts reads a CSV file containing a map between names and integer
+// values. The file must begin with a header row whose first two columns are
+// "name" and "value".
+//
+// Each name must match isIdent and may appear only once. Several names may
+// share a value; the reverse map keeps the first name read for that value.
+// On any error the returned constmap is the zero value.
+func readConsts(filename string) (m constmap, err error) {
+	fp, err := os.Open(filename)
+	if err != nil {
+		return constmap{}, err
+	}
+	defer fp.Close()
+	r := csv.NewReader(fp)
+	r.ReuseRecord = true
+	if err := readHeader(filename, r, "name", "value"); err != nil {
+		return constmap{}, err
+	}
+	names := make(map[string]int)
+	values := make(map[int]string)
+	for {
+		row, err := r.Read()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			// Errors from the CSV reader do not name the file.
+			return constmap{}, fmt.Errorf("%s: %w", filename, err)
+		}
+		if len(row) < 2 {
+			line, _ := r.FieldPos(0)
+			return constmap{}, &dataError{filename, line, 0, errors.New("expected at least two columns")}
+		}
+		name := row[0]
+		if !isIdent.MatchString(name) {
+			line, col := r.FieldPos(0)
+			return constmap{}, &dataError{filename, line, col, fmt.Errorf("invalid name: %q", name)}
+		}
+		if _, e := names[name]; e {
+			line, col := r.FieldPos(0)
+			return constmap{}, &dataError{filename, line, col, fmt.Errorf("duplicate name: %q", name)}
+		}
+		value, err := strconv.Atoi(row[1])
+		if err != nil {
+			line, col := r.FieldPos(1)
+			return constmap{}, &dataError{filename, line, col, fmt.Errorf("invalid value: %v", err)}
+		}
+		names[name] = value
+		// Keep the first name seen for a value; later names are aliases.
+		if _, e := values[value]; !e {
+			values[value] = name
+		}
+	}
+	return constmap{names: names, values: values}, nil
+}
+
+// A charmapinfo describes one row of the character map table read by
+// readCharmaps.
+type charmapinfo struct {
+	name    string   // first column of the row
+	file    string   // second column; readCharmaps does not validate it
+	script  string   // script name, checked against the known scripts
+	regions []string // region names split on ";"; nil when the column is empty
+}
+
+// readCharmaps reads and parses the charmaps.csv file.
+//
+// The file must begin with a header row naming the columns "name", "file",
+// "script", and "regions". The script of every row must be a key of scripts.
+// The regions column is a ";"-separated list whose entries must each be a key
+// of regions; an empty column means no regions. Only the keys of the two maps
+// are consulted.
+//
+// Errors from the CSV reader are prefixed with the file name.
+func readCharmaps(filename string, scripts, regions map[string]int) ([]charmapinfo, error) {
+	fp, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	defer fp.Close()
+	r := csv.NewReader(fp)
+	r.ReuseRecord = true
+	if err := readHeader(filename, r, "name", "file", "script", "regions"); err != nil {
+		return nil, err
+	}
+	var arr []charmapinfo
+	for {
+		row, err := r.Read()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			// Errors from the CSV reader do not name the file.
+			return nil, fmt.Errorf("%s: %w", filename, err)
+		}
+		if len(row) < 3 {
+			line, _ := r.FieldPos(0)
+			return nil, &dataError{filename, line, 0, errors.New("expected at least three columns")}
+		}
+		ifo := charmapinfo{
+			name:   row[0],
+			file:   row[1],
+			script: row[2],
+		}
+		if _, e := scripts[ifo.script]; !e {
+			line, col := r.FieldPos(2)
+			return nil, &dataError{filename, line, col, fmt.Errorf("unknown script: %q", ifo.script)}
+		}
+		if len(row) >= 4 && row[3] != "" {
+			ifo.regions = strings.Split(row[3], ";")
+			for _, region := range ifo.regions {
+				if _, e := regions[region]; !e {
+					line, col := r.FieldPos(3)
+					return nil, &dataError{filename, line, col, fmt.Errorf("unknown region: %q", region)}
+				}
+			}
+		}
+		arr = append(arr, ifo)
+	}
+	return arr, nil
+}
+
+// scriptdata holds everything readData loads from the scripts directory.
+type scriptdata struct {
+	scripts  constmap      // read from scripts/script.csv
+	regions  constmap      // read from scripts/region.csv
+	charmaps []charmapinfo // read from scripts/charmap.csv
+}
+
+// readData loads the script constants, the region constants, and the
+// character map table, in that order, using paths relative to the current
+// directory. It stops at the first failure and returns whatever was loaded
+// up to that point together with the error.
+func readData() (d scriptdata, err error) {
+	if d.scripts, err = readConsts("scripts/script.csv"); err != nil {
+		return d, err
+	}
+	if d.regions, err = readConsts("scripts/region.csv"); err != nil {
+		return d, err
+	}
+	// Character maps are validated against the names just loaded.
+	d.charmaps, err = readCharmaps("scripts/charmap.csv", d.scripts.names, d.regions.names)
+	return d, err
+}
--- a/gen/go.mod
+++ b/gen/go.mod
@ -1,8 +1,5 @@
-module moria.us/macroman
+module moria.us/macscript

 go 1.16

-require (
-	github.com/depp/packbits v1.0.0
-	golang.org/x/text v0.3.5
-)
+require golang.org/x/text v0.3.5
--- a/gen/go.sum
+++ b/gen/go.sum
@ -1,5 +1,3 @@
-github.com/depp/packbits v1.0.0 h1:KqnhCkzI5WbWuwKsAgPI/JLF11mBXBEvYADiwKZMXi0=
-github.com/depp/packbits v1.0.0/go.mod h1:wDV3NXiMB4a+KztSJ93UMH9cBKj5cEGooAbgRXTpQ78=
 golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
 golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/gen/macroman.go
+++ b/gen/macroman.go
@ -1,139 +0,0 @@
-package main
-
-import (
-	"bufio"
-	"flag"
-	"fmt"
-	"os"
-	"strconv"
-
-	"github.com/depp/packbits"
-	"golang.org/x/text/unicode/norm"
-)
-
-var (
-	flagDumpSequences   bool
-	flagDumpTransitions bool
-)
-
-func init() {
-	flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")
-	flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
-}
-
-var characters [256]rune
-
-func init() {
-	hichars := [128]uint16{
-		0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
-		0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
-		0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
-		0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
-		0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
-		0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
-		0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
-		0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
-		0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
-		0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
-		0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
-		0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
-		0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
-		0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
-		0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
-		0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
-	}
-	for i := 0; i < 128; i++ {
-		characters[i] = rune(i)
-	}
-	for i, c := range hichars {
-		characters[i+128] = rune(c)
-	}
-}
-
-var (
-	// lineBreaks is the set of all sequences recognized as line breaks.
-	lineBreaks = [][]byte{{'\n'}, {'\r'}, {'\r', '\n'}}
-	// normForms is the set of Unicode normalization forms recognized.
-	normForms = []norm.Form{norm.NFC, norm.NFD}
-)
-
-func makeConverter(cmap *[256]rune) (*node, error) {
-	var n node
-	// Special case for CR and LF.
-	for _, b := range lineBreaks {
-		if err := n.add('\r', b); err != nil {
-			return nil, err
-		}
-	}
-	for m, u := range *cmap {
-		if m == '\r' || m == '\n' {
-			continue
-		}
-		us := string(u)
-		for _, form := range normForms {
-			bytes := []byte(form.String(us))
-			fmt.Fprintf(os.Stderr, "%d -> %v\n", u, bytes)
-			if err := n.add(byte(m), bytes); err != nil {
-				return nil, err
-			}
-		}
-	}
-	return &n, nil
-}
-
-func printData(f *os.File, ulen int, data []byte) error {
-	if _, err := fmt.Fprint(f, "/* This file is automatically generated. */\n"+
-		"// clang-format off\n"); err != nil {
-		return err
-	}
-	if _, err := fmt.Fprintf(f, "#define FROM_UNIX_DATALEN %d\n", ulen); err != nil {
-		return err
-	}
-	if _, err := fmt.Fprintf(f, "static const unsigned char kFromUnixData[%d] = {\n", len(data)); err != nil {
-		return err
-	}
-	var line []byte
-	for _, n := range data {
-		sv := len(line)
-		line = strconv.AppendUint(line, uint64(n), 10)
-		line = append(line, ',')
-		if len(line) > 80 {
-			line = append(line[:sv], '\n')
-			if _, err := f.Write(line); err != nil {
-				return err
-			}
-			line = strconv.AppendUint(line[:0], uint64(n), 10)
-			line = append(line, ',')
-		}
-	}
-	line = append(line, '\n')
-	if _, err := f.Write(line); err != nil {
-		return err
-	}
-	_, err := fmt.Print("};\n")
-	return err
-}
-
-func mainE() error {
-	n, err := makeConverter(&characters)
-	if err != nil {
-		return err
-	}
-	table := n.genTable()
-	if flagDumpTransitions {
-		w := bufio.NewWriter(os.Stderr)
-		table.dumpTransitions(w)
-		w.Flush()
-	}
-	bytes := table.toBytes()
-	bits := packbits.Pack(bytes)
-	return printData(os.Stdout, len(bytes), bits)
-}
-
-func main() {
-	flag.Parse()
-	if err := mainE(); err != nil {
-		fmt.Fprintln(os.Stderr, "Error:", err)
-		os.Exit(1)
-	}
-}
--- a/gen/main.go
+++ b/gen/main.go
@ -0,0 +1,50 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// Command-line flags. The code shown in this file registers them but does
+// not read them.
+var (
+	flagDumpSequences   bool // set by -dump-sequences
+	flagDumpTransitions bool // set by -dump-transitions
+)
+
+// init registers the command-line flags; main calls flag.Parse.
+func init() {
+	flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")
+	flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
+}
+
+// getSrcdir returns the parent of the directory that contains the running
+// executable.
+// NOTE(review): presumably the binary is built into gen/, which would make
+// this the repository root — confirm against the build setup.
+func getSrcdir() (string, error) {
+	exePath, err := os.Executable()
+	if err != nil {
+		return "", err
+	}
+	exeDir := filepath.Dir(exePath)
+	return filepath.Dir(exeDir), nil
+}
+
+// mainE is the body of the program: it changes to the source directory and
+// loads the data files, returning the first error encountered.
+func mainE() error {
+	srcdir, err := getSrcdir()
+	if err != nil {
+		return fmt.Errorf("could not find source dir: %v", err)
+	}
+	// The data files are opened with paths relative to the source directory.
+	if err = os.Chdir(srcdir); err != nil {
+		return err
+	}
+	// The loaded data is discarded here; only the error is reported.
+	_, err = readData()
+	return err
+}
+
+// main parses the command line and runs mainE. On failure it prints the
+// error to standard error and exits with status 1.
+func main() {
+	flag.Parse()
+	err := mainE()
+	if err == nil {
+		return
+	}
+	fmt.Fprintln(os.Stderr, "Error:", err)
+	os.Exit(1)
+}
--- a/gen/table.go
+++ b/gen/table.go
@ -1,136 +0,0 @@
-package main
-
-import (
-	"bufio"
-	"errors"
-	"fmt"
-)
-
-var (
-	errEmptyString = errors.New("empty input")
-	errZeroInput   = errors.New("zero byte input")
-	errZeroOutput  = errors.New("zero byte output")
-)
-
-type inputConflictErr struct {
-	input []byte
-	out1  byte
-	out2  byte
-}
-
-func (e *inputConflictErr) Error() string {
-	return fmt.Sprintf("table conflict: %d maps to both %d and %d", e.input, e.out1, e.out2)
-}
-
-// A node is an element in a Unicode decoding graph.
-type node struct {
-	chars    [256]uint8
-	children [256]*node
-}
-
-// add adds the mapping from "in" to "out", creating additional nodes as
-// necessary.
-func (n *node) add(out byte, in []byte) error {
-	if len(in) == 0 {
-		return errEmptyString
-	}
-	if in[0] == 0 {
-		if out == 0 {
-			return nil
-		}
-	}
-	if out == 0 {
-		return errZeroOutput
-	}
-	for _, b := range in[:len(in)-1] {
-		old := n
-		n = n.children[b]
-		if n == nil {
-			n = new(node)
-			old.children[b] = n
-		}
-	}
-	b := in[len(in)-1]
-	x := n.chars[b]
-	if x == 0 {
-		n.chars[b] = out
-		return nil
-	}
-	if x == out {
-		return nil
-	}
-	return &inputConflictErr{
-		input: in,
-		out1:  x,
-		out2:  out,
-	}
-}
-
-func (n *node) size() int {
-	sz := 1
-	for _, c := range n.children {
-		if c != nil {
-			sz += c.size()
-		}
-	}
-	return sz
-}
-
-func (n *node) writeTable(table decoderTable, pos int) int {
-	data := table[pos*256 : pos*256+256 : pos*256+256]
-	pos++
-	for i, c := range n.chars {
-		data[i] = uint16(c)
-	}
-	for i, c := range n.children {
-		if c != nil {
-			data[i] |= uint16(pos << 8)
-			pos = c.writeTable(table, pos)
-		}
-	}
-	return pos
-}
-
-func (n *node) genTable() decoderTable {
-	sz := n.size()
-	table := make(decoderTable, 256*sz)
-	pos := n.writeTable(table, 0)
-	if pos != sz {
-		panic("bad table")
-	}
-	return table
-}
-
-type decoderTable []uint16
-
-func (t decoderTable) dumpTransitions(w *bufio.Writer) {
-	n := len(t) >> 8
-	for i := 0; i < n; i++ {
-		t := t[i<<8 : (i+1)<<8]
-		fmt.Fprintf(w, "State $%02x\n", i)
-		for m, v := range t {
-			if v != 0 {
-				fmt.Fprintf(w, "    $%02x ->", m)
-				st := v >> 8
-				chr := v & 255
-				if st != 0 {
-					fmt.Fprintf(w, " state $%02x", st)
-				}
-				if chr != 0 {
-					fmt.Fprintf(w, " char $%02x", chr)
-				}
-				w.WriteByte('\n')
-			}
-		}
-		w.WriteByte('\n')
-	}
-}
-
-func (t decoderTable) toBytes() []byte {
-	b := make([]byte, len(t)*2)
-	for i, x := range t {
-		b[i*2] = byte(x >> 8)
-		b[i*2+1] = byte(x)
-	}
-	return b
-}
--- a/scripts/README.md
+++ b/scripts/README.md
@ -2,4 +2,10 @@

 This folder contains the script and region definitions for the Mac OS toolbox.

-These constants are extracted from the `Script.h` file in Universal Interfaces.
+- `extract.py`: Generate `script.csv` and `region.csv` from the `Script.h` file in Mac OS Universal Interfaces. The output of this program is checked in, so it does not need to be run again unless the logic is changed.
+
+- `script.csv`: Constants identifying scripts.
+
+- `region.csv`: Constants identifying localization regions.
+
+- `charmap.csv`: Identifies character maps used by classic Mac OS. Each character map is given a name, a data file in the `../charmap` folder, and the script and, optionally, regions it corresponds to. This mapping is taken from the readme in the charmap folder. More specific mappings (which contain regions) in this file take precedence over less specific mappings (which do not contain regions). For example, Turkish is more specific than Roman.
--- a/scripts/charmap.csv
+++ b/scripts/charmap.csv
@ -0,0 +1,24 @@
+Name,File,Script,Regions
+Roman,ROMAN.TXT,smRoman,
+Turkish,TURKISH.TXT,smRoman,verTurkey
+Croatian,CROATIAN.TXT,smRoman,verCroatia;verSlovenian;verYugoCroatian
+Icelandic,ICELAND.TXT,smRoman,verIceland;verFaroeIsl
+Romanian,ROMANIAN.TXT,smRoman,verRomania
+Celtic,CELTIC.TXT,smRoman,verIreland;verScottishGaelic;verManxGaelic;verBreton;verWelsh
+Gaelic,GAELIC.TXT,smRoman,verIrishGaelicScript
+Greek,GREEK.TXT,smRoman,verGreece
+Japanese,JAPANESE.TXT,smJapanese,
+Chinese (Traditional),CHINTRAD.TXT,smTradChinese,
+Korean,KOREAN.TXT,smKorean,
+Arabic,ARABIC.TXT,smArabic,
+Farsi,FARSI.TXT,smArabic,verIran
+Hebrew,HEBREW.TXT,smHebrew,
+Cyrillic,CYRILLIC.TXT,smCyrillic,
+Devanagari,DEVANAGA.TXT,smDevanagari,
+Gurmukhi,GURMUKHI.TXT,smGurmukhi,
+Gujarati,GUJARATI.TXT,smGujarati,
+Thai,,smThai,
+Chinese (Simplified),CHINSIMP.TXT,smSimpChinese,
+Tibetan,,smTibetan,
+Inuit,INUIT.TXT,smEthiopic,verNunavut
+Central European,CENTEURO.TXT,smCentralEuroRoman,