diff --git a/gen/.gitignore b/gen/.gitignore index 28c5eee..8394e5f 100644 --- a/gen/.gitignore +++ b/gen/.gitignore @@ -1 +1 @@ -/macroman +/macscript diff --git a/gen/data.go b/gen/data.go new file mode 100644 index 0000000..a44bdbe --- /dev/null +++ b/gen/data.go @@ -0,0 +1,186 @@ +package main + +import ( + "encoding/csv" + "errors" + "fmt" + "io" + "os" + "regexp" + "strconv" + "strings" +) + +var isIdent = regexp.MustCompile("^[a-zA-Z][_a-zA-Z0-9]*$") + +// A dataError indicates an error in the contents of one of the data files. +type dataError struct { + filename string + line int + column int + err error +} + +func (e *dataError) Error() string { + var b strings.Builder + b.WriteString(e.filename) + if e.line != 0 { + b.WriteByte(':') + b.WriteString(strconv.Itoa(e.line)) + if e.column != 0 { + b.WriteByte(':') + b.WriteString(strconv.Itoa(e.column)) + } + } + b.WriteString(": ") + b.WriteString(e.err.Error()) + return b.String() +} + +// readHeader reads the header row of a CSV file and checks that columns exist with the given names. +func readHeader(filename string, r *csv.Reader, names ...string) error { + row, err := r.Read() + if err != nil { + return err + } + for i, name := range names { + if len(row) <= i { + line, _ := r.FieldPos(0) + return &dataError{filename, line, 0, fmt.Errorf("missing column: %q", name)} + } + cname := row[i] + if !strings.EqualFold(name, cname) { + line, col := r.FieldPos(i) + return &dataError{filename, line, col, fmt.Errorf("column name is %q, expected %q", cname, name)} + } + } + return nil +} + +// A constmap is a map between names and integer values. +type constmap struct { + names map[string]int + values map[int]string +} + +// readConsts reads a CSV file containing a map between names and integer values. 
+func readConsts(filename string) (m constmap, err error) { + fp, err := os.Open(filename) + if err != nil { + return m, err + } + defer fp.Close() + r := csv.NewReader(fp) + r.ReuseRecord = true + if err := readHeader(filename, r, "name", "value"); err != nil { + return m, err + } + m.names = make(map[string]int) + m.values = make(map[int]string) + for { + row, err := r.Read() + if err != nil { + if err == io.EOF { + break + } + return m, err + } + if len(row) < 2 { + line, _ := r.FieldPos(0) + return m, &dataError{filename, line, 0, errors.New("expected at least two columns")} + } + name := row[0] + if !isIdent.MatchString(name) { + line, col := r.FieldPos(0) + return m, &dataError{filename, line, col, fmt.Errorf("invalid name: %q", name)} + } + if _, e := m.names[name]; e { + line, col := r.FieldPos(0) + return m, &dataError{filename, line, col, fmt.Errorf("duplicate name: %q", name)} + } + value, err := strconv.Atoi(row[1]) + if err != nil { + line, col := r.FieldPos(1) + return m, &dataError{filename, line, col, fmt.Errorf("invalid value: %v", err)} + } + m.names[name] = value + if _, e := m.values[value]; !e { + m.values[value] = name + } + } + return m, nil +} + +type charmapinfo struct { + name string + file string + script string + regions []string +} + +// readCharmaps reads and parses the charmap.csv file. 
+func readCharmaps(filename string, scripts, regions map[string]int) ([]charmapinfo, error) { + fp, err := os.Open(filename) + if err != nil { + return nil, err + } + defer fp.Close() + r := csv.NewReader(fp) + r.ReuseRecord = true + if err := readHeader(filename, r, "name", "file", "script", "regions"); err != nil { + return nil, err + } + var arr []charmapinfo + for { + row, err := r.Read() + if err != nil { + if err == io.EOF { + break + } + return nil, err + } + if len(row) < 3 { + line, _ := r.FieldPos(0) + return nil, &dataError{filename, line, 0, errors.New("expected at least three columns")} + } + ifo := charmapinfo{ + name: row[0], + file: row[1], + script: row[2], + } + if _, e := scripts[ifo.script]; !e { + line, col := r.FieldPos(2) + return nil, &dataError{filename, line, col, fmt.Errorf("unknown script: %q", ifo.script)} + } + if len(row) >= 4 && row[3] != "" { + ifo.regions = strings.Split(row[3], ";") + for _, region := range ifo.regions { + if _, e := regions[region]; !e { + line, col := r.FieldPos(3) + return nil, &dataError{filename, line, col, fmt.Errorf("unknown region: %q", region)} + } + } + } + arr = append(arr, ifo) + } + return arr, nil +} + +type scriptdata struct { + scripts constmap + regions constmap + charmaps []charmapinfo +} + +func readData() (d scriptdata, err error) { + d.scripts, err = readConsts("scripts/script.csv") + if err != nil { + return d, err + } + d.regions, err = readConsts("scripts/region.csv") + if err != nil { + return d, err + } + d.charmaps, err = readCharmaps("scripts/charmap.csv", d.scripts.names, d.regions.names) + return +} diff --git a/gen/go.mod b/gen/go.mod index 92c0f19..fc2f957 100644 --- a/gen/go.mod +++ b/gen/go.mod @@ -1,8 +1,5 @@ -module moria.us/macroman +module moria.us/macscript go 1.16 -require ( - github.com/depp/packbits v1.0.0 - golang.org/x/text v0.3.5 -) +require golang.org/x/text v0.3.5 diff --git a/gen/go.sum b/gen/go.sum index 09e6a01..bbd33e8 100644 --- a/gen/go.sum +++ b/gen/go.sum @@ 
-1,5 +1,3 @@ -github.com/depp/packbits v1.0.0 h1:KqnhCkzI5WbWuwKsAgPI/JLF11mBXBEvYADiwKZMXi0= -github.com/depp/packbits v1.0.0/go.mod h1:wDV3NXiMB4a+KztSJ93UMH9cBKj5cEGooAbgRXTpQ78= golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/gen/macroman.go b/gen/macroman.go deleted file mode 100644 index 1f7343c..0000000 --- a/gen/macroman.go +++ /dev/null @@ -1,139 +0,0 @@ -package main - -import ( - "bufio" - "flag" - "fmt" - "os" - "strconv" - - "github.com/depp/packbits" - "golang.org/x/text/unicode/norm" -) - -var ( - flagDumpSequences bool - flagDumpTransitions bool -) - -func init() { - flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences") - flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables") -} - -var characters [256]rune - -func init() { - hichars := [128]uint16{ - 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, - 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, - 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, - 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, - 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, - 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, - 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, - 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, - 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, - 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, - 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, - 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, - 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, - 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 
0x00D3, 0x00D4, - 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, - 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7, - } - for i := 0; i < 128; i++ { - characters[i] = rune(i) - } - for i, c := range hichars { - characters[i+128] = rune(c) - } -} - -var ( - // lineBreaks is the set of all sequences recognized as line breaks. - lineBreaks = [][]byte{{'\n'}, {'\r'}, {'\r', '\n'}} - // normForms is the set of Unicode normalization forms recognized. - normForms = []norm.Form{norm.NFC, norm.NFD} -) - -func makeConverter(cmap *[256]rune) (*node, error) { - var n node - // Special case for CR and LF. - for _, b := range lineBreaks { - if err := n.add('\r', b); err != nil { - return nil, err - } - } - for m, u := range *cmap { - if m == '\r' || m == '\n' { - continue - } - us := string(u) - for _, form := range normForms { - bytes := []byte(form.String(us)) - fmt.Fprintf(os.Stderr, "%d -> %v\n", u, bytes) - if err := n.add(byte(m), bytes); err != nil { - return nil, err - } - } - } - return &n, nil -} - -func printData(f *os.File, ulen int, data []byte) error { - if _, err := fmt.Fprint(f, "/* This file is automatically generated. 
*/\n"+ - "// clang-format off\n"); err != nil { - return err - } - if _, err := fmt.Fprintf(f, "#define FROM_UNIX_DATALEN %d\n", ulen); err != nil { - return err - } - if _, err := fmt.Fprintf(f, "static const unsigned char kFromUnixData[%d] = {\n", len(data)); err != nil { - return err - } - var line []byte - for _, n := range data { - sv := len(line) - line = strconv.AppendUint(line, uint64(n), 10) - line = append(line, ',') - if len(line) > 80 { - line = append(line[:sv], '\n') - if _, err := f.Write(line); err != nil { - return err - } - line = strconv.AppendUint(line[:0], uint64(n), 10) - line = append(line, ',') - } - } - line = append(line, '\n') - if _, err := f.Write(line); err != nil { - return err - } - _, err := fmt.Print("};\n") - return err -} - -func mainE() error { - n, err := makeConverter(&characters) - if err != nil { - return err - } - table := n.genTable() - if flagDumpTransitions { - w := bufio.NewWriter(os.Stderr) - table.dumpTransitions(w) - w.Flush() - } - bytes := table.toBytes() - bits := packbits.Pack(bytes) - return printData(os.Stdout, len(bytes), bits) -} - -func main() { - flag.Parse() - if err := mainE(); err != nil { - fmt.Fprintln(os.Stderr, "Error:", err) - os.Exit(1) - } -} diff --git a/gen/main.go b/gen/main.go new file mode 100644 index 0000000..e06aded --- /dev/null +++ b/gen/main.go @@ -0,0 +1,50 @@ +package main + +import ( + "flag" + "fmt" + "os" + "path/filepath" +) + +var ( + flagDumpSequences bool + flagDumpTransitions bool +) + +func init() { + flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences") + flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables") +} + +func getSrcdir() (string, error) { + exe, err := os.Executable() + if err != nil { + return "", err + } + return filepath.Dir(filepath.Dir(exe)), nil +} + +func mainE() error { + srcdir, err := getSrcdir() + if err != nil { + return fmt.Errorf("could not find source dir: %v", err) + 
} + if err := os.Chdir(srcdir); err != nil { + return err + } + d, err := readData() + if err != nil { + return err + } + _ = d + return nil +} + +func main() { + flag.Parse() + if err := mainE(); err != nil { + fmt.Fprintln(os.Stderr, "Error:", err) + os.Exit(1) + } +} diff --git a/gen/table.go b/gen/table.go deleted file mode 100644 index 888db1d..0000000 --- a/gen/table.go +++ /dev/null @@ -1,136 +0,0 @@ -package main - -import ( - "bufio" - "errors" - "fmt" -) - -var ( - errEmptyString = errors.New("empty input") - errZeroInput = errors.New("zero byte input") - errZeroOutput = errors.New("zero byte output") -) - -type inputConflictErr struct { - input []byte - out1 byte - out2 byte -} - -func (e *inputConflictErr) Error() string { - return fmt.Sprintf("table conflict: %d maps to both %d and %d", e.input, e.out1, e.out2) -} - -// A node is an element in a Unicode decoding graph. -type node struct { - chars [256]uint8 - children [256]*node -} - -// add adds the mapping from "in" to "out", creating additional nodes as -// necessary. 
-func (n *node) add(out byte, in []byte) error { - if len(in) == 0 { - return errEmptyString - } - if in[0] == 0 { - if out == 0 { - return nil - } - } - if out == 0 { - return errZeroOutput - } - for _, b := range in[:len(in)-1] { - old := n - n = n.children[b] - if n == nil { - n = new(node) - old.children[b] = n - } - } - b := in[len(in)-1] - x := n.chars[b] - if x == 0 { - n.chars[b] = out - return nil - } - if x == out { - return nil - } - return &inputConflictErr{ - input: in, - out1: x, - out2: out, - } -} - -func (n *node) size() int { - sz := 1 - for _, c := range n.children { - if c != nil { - sz += c.size() - } - } - return sz -} - -func (n *node) writeTable(table decoderTable, pos int) int { - data := table[pos*256 : pos*256+256 : pos*256+256] - pos++ - for i, c := range n.chars { - data[i] = uint16(c) - } - for i, c := range n.children { - if c != nil { - data[i] |= uint16(pos << 8) - pos = c.writeTable(table, pos) - } - } - return pos -} - -func (n *node) genTable() decoderTable { - sz := n.size() - table := make(decoderTable, 256*sz) - pos := n.writeTable(table, 0) - if pos != sz { - panic("bad table") - } - return table -} - -type decoderTable []uint16 - -func (t decoderTable) dumpTransitions(w *bufio.Writer) { - n := len(t) >> 8 - for i := 0; i < n; i++ { - t := t[i<<8 : (i+1)<<8] - fmt.Fprintf(w, "State $%02x\n", i) - for m, v := range t { - if v != 0 { - fmt.Fprintf(w, " $%02x ->", m) - st := v >> 8 - chr := v & 255 - if st != 0 { - fmt.Fprintf(w, " state $%02x", st) - } - if chr != 0 { - fmt.Fprintf(w, " char $%02x", chr) - } - w.WriteByte('\n') - } - } - w.WriteByte('\n') - } -} - -func (t decoderTable) toBytes() []byte { - b := make([]byte, len(t)*2) - for i, x := range t { - b[i*2] = byte(x >> 8) - b[i*2+1] = byte(x) - } - return b -} diff --git a/scripts/README.md b/scripts/README.md index a17736d..ca7cb8a 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -2,4 +2,10 @@ This folder contains the script and region definitions for the 
Mac OS toolbox. -These constants are extracted from the `Script.h` file in Universal Interfaces. +- `extract.py`: Generate `script.csv` and `region.csv` from the `Script.h` file in Mac OS Universal Interfaces. The output of this program is checked in, so it does not need to be run again unless the logic is changed. + +- `script.csv`: Constants identifying scripts. + +- `region.csv`: Constants identifying localization regions. + +- `charmap.csv`: Identifies character maps used by classic Mac OS. Each character map is given a name, a data file in the `../charmap` folder, and the script and, optionally, regions it corresponds to. This mapping is taken from the readme in the charmap folder. More specific mappings (which contain regions) in this file take precedence over less specific mappings (which do not contain regions). For example, Turkish is more specific than Roman. diff --git a/scripts/charmap.csv b/scripts/charmap.csv new file mode 100644 index 0000000..d278b20 --- /dev/null +++ b/scripts/charmap.csv @@ -0,0 +1,24 @@ +Name,File,Script,Regions +Roman,ROMAN.TXT,smRoman, +Turkish,TURKISH.TXT,smRoman,verTurkey +Croatian,CROATIAN.TXT,smRoman,verCroatia;verSlovenian;verYugoCroatian +Icelandic,ICELAND.TXT,smRoman,verIceland;verFaroeIsl +Romanian,ROMANIAN.TXT,smRoman,verRomania +Celtic,CELTIC.TXT,smRoman,verIreland;verScottishGaelic;verManxGaelic;verBreton;verWelsh +Gaelic,GAELIC.TXT,smRoman,verIrishGaelicScript +Greek,GREEK.TXT,smRoman,verGreece +Japanese,JAPANESE.TXT,smJapanese, +Chinese (Traditional),CHINTRAD.TXT,smTradChinese, +Korean,KOREAN.TXT,smKorean, +Arabic,ARABIC.TXT,smArabic, +Farsi,FARSI.TXT,smArabic,verIran +Hebrew,HEBREW.TXT,smHebrew, +Cyrillic,CYRILLIC.TXT,smCyrillic, +Devanagari,DEVANAGA.TXT,smDevanagari, +Gurmukhi,GURMUKHI.TXT,smGurmukhi, +Gujarati,GUJARATI.TXT,smGujarati, +Thai,,smThai, +Chinese (Simplified),CHINSIMP.TXT,smSimpChinese, +Tibetan,,smTibetan, +Inuit,INUIT.TXT,smEthiopic,verNunavut +Central European,CENTEURO.TXT,smCentralEuroRoman,