Parse character map information

This adds associations between scripts, regions, and character maps, and
parses them from a Go utility.
This commit is contained in:
Dietrich Epp 2022-03-15 13:38:45 -04:00
parent 7bc44f4a5a
commit 022d11fa14
9 changed files with 270 additions and 284 deletions

2
gen/.gitignore vendored
View File

@ -1 +1 @@
/macroman
/macscript

186
gen/data.go Normal file
View File

@ -0,0 +1,186 @@
package main
import (
"encoding/csv"
"errors"
"fmt"
"io"
"os"
"regexp"
"strconv"
"strings"
)
// isIdent matches a whole string that is an identifier: one ASCII letter
// followed by any number of ASCII letters, digits, or underscores.
var isIdent = regexp.MustCompile(`^[a-zA-Z][_a-zA-Z0-9]*$`)
// A dataError indicates an error in the contents of one of the data files.
type dataError struct {
	filename string // path of the data file, always printed
	line     int    // line number; 0 omits both line and column from the message
	column   int    // column number; 0 omits the column from the message
	err      error  // the underlying problem
}

// Error formats the error as "filename[:line[:column]]: message". The column
// is only printed when a line is also present.
func (e *dataError) Error() string {
	var b strings.Builder
	b.WriteString(e.filename)
	if e.line != 0 {
		b.WriteByte(':')
		b.WriteString(strconv.Itoa(e.line))
		if e.column != 0 {
			b.WriteByte(':')
			b.WriteString(strconv.Itoa(e.column))
		}
	}
	b.WriteString(": ")
	b.WriteString(e.err.Error())
	return b.String()
}

// Unwrap returns the underlying error, so that errors.Is and errors.As can
// inspect the cause of a dataError.
func (e *dataError) Unwrap() error {
	return e.err
}
// readHeader reads the header row of a CSV file and checks that columns exist
// with the given names.
//
// The first len(names) columns must match names in order; the comparison
// ignores case, and extra trailing columns are allowed. filename is used only
// to label errors. A file with no header row is reported as such rather than
// as a bare io.EOF, and any other read failure is wrapped with the filename.
func readHeader(filename string, r *csv.Reader, names ...string) error {
	row, err := r.Read()
	if err != nil {
		if err == io.EOF {
			// An empty file would otherwise surface as just "EOF".
			return &dataError{filename, 0, 0, errors.New("missing header row")}
		}
		return fmt.Errorf("%s: %w", filename, err)
	}
	for i, name := range names {
		if i >= len(row) {
			line, _ := r.FieldPos(0)
			return &dataError{filename, line, 0, fmt.Errorf("missing column: %q", name)}
		}
		if cname := row[i]; !strings.EqualFold(name, cname) {
			line, col := r.FieldPos(i)
			return &dataError{filename, line, col, fmt.Errorf("column name is %q, expected %q", cname, name)}
		}
	}
	return nil
}
// A constmap is a map between names and integer values.
//
// readConsts fills both directions. Several names may share one value, so the
// two maps need not have the same number of entries.
type constmap struct {
// names maps each name to its integer value.
names map[string]int
// values maps each integer value back to a name. When several names share a
// value, readConsts keeps the first one it reads.
values map[int]string
}
// readConsts reads a CSV file containing a map between names and integer values.
//
// The file must begin with a header row whose first two columns are "name" and
// "value". Every following row supplies an identifier and a base-10 integer.
// Names must be unique. Values may repeat, in which case the reverse map keeps
// the first name listed for that value.
//
// On failure the returned constmap is the zero value. Errors found while
// reading or validating data rows identify the file; validation errors also
// carry the line and column.
func readConsts(filename string) (m constmap, err error) {
	fp, err := os.Open(filename)
	if err != nil {
		return constmap{}, err
	}
	defer fp.Close()

	r := csv.NewReader(fp)
	r.ReuseRecord = true
	if err := readHeader(filename, r, "name", "value"); err != nil {
		return constmap{}, err
	}

	names := make(map[string]int)
	values := make(map[int]string)
	for {
		row, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			// csv errors mention the line but not which file was being read.
			return constmap{}, fmt.Errorf("%s: %w", filename, err)
		}
		if len(row) < 2 {
			line, _ := r.FieldPos(0)
			return constmap{}, &dataError{filename, line, 0, errors.New("expected at least two columns")}
		}
		name := row[0]
		if !isIdent.MatchString(name) {
			line, col := r.FieldPos(0)
			return constmap{}, &dataError{filename, line, col, fmt.Errorf("invalid name: %q", name)}
		}
		if _, dup := names[name]; dup {
			line, col := r.FieldPos(0)
			return constmap{}, &dataError{filename, line, col, fmt.Errorf("duplicate name: %q", name)}
		}
		value, err := strconv.Atoi(row[1])
		if err != nil {
			line, col := r.FieldPos(1)
			return constmap{}, &dataError{filename, line, col, fmt.Errorf("invalid value: %v", err)}
		}
		names[name] = value
		// Several names may share a value; the first one listed wins.
		if _, seen := values[value]; !seen {
			values[value] = name
		}
	}
	return constmap{names: names, values: values}, nil
}
// A charmapinfo describes one character map, as parsed from one data row of
// the character map CSV file by readCharmaps.
type charmapinfo struct {
// name is the character map's name (first column).
name string
// file is the character map's data file (second column). readCharmaps does
// not validate it, so it may be empty.
file string
// script is the name of the script this map belongs to (third column).
// readCharmaps requires it to be a key of the scripts table.
script string
// regions lists the region names from the fourth column, which are separated
// by ";". It is nil when that column is empty or absent.
regions []string
}
// readCharmaps reads and parses the charmaps.csv file.
//
// The file must begin with a header row whose first four columns are "name",
// "file", "script" and "regions". In each data row the script must be a key of
// scripts. The regions column is optional; when non-empty it is a list of
// names separated by ";", each of which must be a key of regions.
//
// Rows are returned in file order. Errors found while reading or validating
// data rows identify the file; validation errors also carry line and column.
func readCharmaps(filename string, scripts, regions map[string]int) ([]charmapinfo, error) {
	fp, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer fp.Close()

	r := csv.NewReader(fp)
	r.ReuseRecord = true
	if err := readHeader(filename, r, "name", "file", "script", "regions"); err != nil {
		return nil, err
	}

	var arr []charmapinfo
	for {
		row, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			// csv errors mention the line but not which file was being read.
			return nil, fmt.Errorf("%s: %w", filename, err)
		}
		if len(row) < 3 {
			line, _ := r.FieldPos(0)
			return nil, &dataError{filename, line, 0, errors.New("expected at least three columns")}
		}
		ifo := charmapinfo{
			name:   row[0],
			file:   row[1],
			script: row[2],
		}
		if _, ok := scripts[ifo.script]; !ok {
			line, col := r.FieldPos(2)
			return nil, &dataError{filename, line, col, fmt.Errorf("unknown script: %q", ifo.script)}
		}
		if len(row) >= 4 && row[3] != "" {
			ifo.regions = strings.Split(row[3], ";")
			for _, region := range ifo.regions {
				if _, ok := regions[region]; !ok {
					line, col := r.FieldPos(3)
					return nil, &dataError{filename, line, col, fmt.Errorf("unknown region: %q", region)}
				}
			}
		}
		arr = append(arr, ifo)
	}
	return arr, nil
}
// A scriptdata holds everything readData loads from the data files.
type scriptdata struct {
// scripts holds the script constants read from scripts/script.csv.
scripts constmap
// regions holds the region constants read from scripts/region.csv.
regions constmap
// charmaps holds the character maps read from scripts/charmap.csv.
charmaps []charmapinfo
}
// readData loads the script and region constants and then the character map
// list, which is validated against those constants. The paths are relative to
// the current working directory. Loading stops at the first error.
func readData() (d scriptdata, err error) {
	if d.scripts, err = readConsts("scripts/script.csv"); err != nil {
		return d, err
	}
	if d.regions, err = readConsts("scripts/region.csv"); err != nil {
		return d, err
	}
	d.charmaps, err = readCharmaps("scripts/charmap.csv", d.scripts.names, d.regions.names)
	return d, err
}

View File

@ -1,8 +1,5 @@
module moria.us/macroman
module moria.us/macscript
go 1.16
require (
github.com/depp/packbits v1.0.0
golang.org/x/text v0.3.5
)
require golang.org/x/text v0.3.5

View File

@ -1,5 +1,3 @@
github.com/depp/packbits v1.0.0 h1:KqnhCkzI5WbWuwKsAgPI/JLF11mBXBEvYADiwKZMXi0=
github.com/depp/packbits v1.0.0/go.mod h1:wDV3NXiMB4a+KztSJ93UMH9cBKj5cEGooAbgRXTpQ78=
golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

View File

@ -1,139 +0,0 @@
package main
import (
"bufio"
"flag"
"fmt"
"os"
"strconv"
"github.com/depp/packbits"
"golang.org/x/text/unicode/norm"
)
// Command-line flags, registered in init and filled in by flag.Parse.
var (
	flagDumpSequences   bool
	flagDumpTransitions bool
)

// init registers the boolean command-line flags, all defaulting to false.
func init() {
	boolFlags := []struct {
		dst   *bool
		name  string
		usage string
	}{
		{&flagDumpSequences, "dump-sequences", "dump Unicode sequences"},
		{&flagDumpTransitions, "dump-transitions", "dump state machine state transition tables"},
	}
	for _, f := range boolFlags {
		flag.BoolVar(f.dst, f.name, false, f.usage)
	}
}
// characters maps each byte value to a Unicode code point. Bytes 0-127 map to
// the code point with the same value; bytes 128-255 are taken from the table
// in init.
var characters [256]rune

// init fills in characters.
func init() {
	// Code points for bytes 128 through 255, in order.
	upper := [128]uint16{
		0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
		0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
		0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
		0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
		0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
		0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
		0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
		0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
		0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
		0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
		0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
		0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
		0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
		0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
		0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
		0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
	}
	for i := range characters {
		if i < len(upper) {
			characters[i] = rune(i)
		} else {
			characters[i] = rune(upper[i-len(upper)])
		}
	}
}
var (
// lineBreaks is the set of all sequences recognized as line breaks.
// makeConverter maps every one of them to the single output byte '\r'.
lineBreaks = [][]byte{{'\n'}, {'\r'}, {'\r', '\n'}}
// normForms is the set of Unicode normalization forms recognized.
// makeConverter adds one input sequence per form for each character.
normForms = []norm.Form{norm.NFC, norm.NFD}
)
// makeConverter builds the decoding graph that maps UTF-8 byte sequences back
// to the bytes of the character map cmap.
//
// Every sequence in lineBreaks maps to '\r'. Every other byte m of cmap is
// reachable from the UTF-8 encoding of cmap[m] in each of the normalization
// forms in normForms. The sequences are written to standard error only when
// the -dump-sequences flag is set.
//
// It returns the root node, or the first error reported by node.add.
func makeConverter(cmap *[256]rune) (*node, error) {
	var n node
	// Special case for CR and LF.
	for _, b := range lineBreaks {
		if err := n.add('\r', b); err != nil {
			return nil, err
		}
	}
	for m, u := range *cmap {
		if m == '\r' || m == '\n' {
			// Already handled by the line break special case above.
			continue
		}
		us := string(u)
		for _, form := range normForms {
			seq := []byte(form.String(us))
			if flagDumpSequences {
				fmt.Fprintf(os.Stderr, "%d -> %v\n", u, seq)
			}
			if err := n.add(byte(m), seq); err != nil {
				return nil, err
			}
		}
	}
	return &n, nil
}
// printData writes data to f as a C source fragment.
//
// The output is a generated-file banner, a FROM_UNIX_DATALEN macro whose value
// is ulen, and a kFromUnixData array holding the bytes of data as
// comma-separated decimal numbers. Array lines are wrapped so that none is
// longer than 80 characters. The whole fragment, including the closing "};",
// goes to f. It returns the first write error.
func printData(f *os.File, ulen int, data []byte) error {
	if _, err := fmt.Fprint(f, "/* This file is automatically generated. */\n"+
		"// clang-format off\n"); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "#define FROM_UNIX_DATALEN %d\n", ulen); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "static const unsigned char kFromUnixData[%d] = {\n", len(data)); err != nil {
		return err
	}
	var line []byte
	for _, n := range data {
		sv := len(line)
		line = strconv.AppendUint(line, uint64(n), 10)
		line = append(line, ',')
		if len(line) > 80 {
			// The new item does not fit: flush the line without it, then
			// start the next line with that item.
			line = append(line[:sv], '\n')
			if _, err := f.Write(line); err != nil {
				return err
			}
			line = strconv.AppendUint(line[:0], uint64(n), 10)
			line = append(line, ',')
		}
	}
	line = append(line, '\n')
	if _, err := f.Write(line); err != nil {
		return err
	}
	// Close the array in f itself, not on standard output.
	_, err := fmt.Fprint(f, "};\n")
	return err
}
// mainE builds the converter for characters, optionally dumps its transition
// table to standard error, and prints the packed table to standard output.
func mainE() error {
	root, err := makeConverter(&characters)
	if err != nil {
		return err
	}
	table := root.genTable()
	if flagDumpTransitions {
		out := bufio.NewWriter(os.Stderr)
		table.dumpTransitions(out)
		out.Flush()
	}
	raw := table.toBytes()
	// The length passed is that of the table bytes before packing.
	return printData(os.Stdout, len(raw), packbits.Pack(raw))
}
// main parses the command-line flags and runs mainE. If mainE fails, the error
// is printed to standard error and the process exits with status 1.
func main() {
	flag.Parse()
	err := mainE()
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, "Error:", err)
	os.Exit(1)
}

50
gen/main.go Normal file
View File

@ -0,0 +1,50 @@
package main
import (
"flag"
"fmt"
"os"
"path/filepath"
)
// Command-line flags, registered in init and filled in by flag.Parse.
var (
	flagDumpSequences   bool
	flagDumpTransitions bool
)

// init registers the command-line flags on the default flag set.
func init() {
	fs := flag.CommandLine
	fs.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")
	fs.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
}
// getSrcdir returns the directory two levels above the running executable,
// that is, the parent of the directory containing it. The path comes from
// os.Executable and is used as reported.
func getSrcdir() (string, error) {
	exePath, err := os.Executable()
	if err != nil {
		return "", err
	}
	binDir := filepath.Dir(exePath)
	return filepath.Dir(binDir), nil
}
// mainE changes the working directory to the source directory and loads the
// data files. The loaded data is currently discarded; only the error matters.
func mainE() error {
	srcdir, err := getSrcdir()
	if err != nil {
		return fmt.Errorf("could not find source dir: %v", err)
	}
	// readData uses relative paths, so run from the source directory.
	if err = os.Chdir(srcdir); err != nil {
		return err
	}
	if _, err = readData(); err != nil {
		return err
	}
	return nil
}
// main parses the command-line flags and runs mainE. A failure is reported on
// standard error and turns into exit status 1.
func main() {
	flag.Parse()
	err := mainE()
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, "Error:", err)
	os.Exit(1)
}

View File

@ -1,136 +0,0 @@
package main
import (
"bufio"
"errors"
"fmt"
)
// Errors describing invalid mappings passed to node.add.
var (
// errEmptyString is returned by node.add when the input sequence has no bytes.
errEmptyString = errors.New("empty input")
// errZeroInput describes an input sequence that contains a zero byte.
errZeroInput = errors.New("zero byte input")
// errZeroOutput is returned by node.add when the output byte is zero. Zero in
// node.chars means "no mapping", so it cannot be stored as an output.
errZeroOutput = errors.New("zero byte output")
)
// An inputConflictErr reports that one input byte sequence was given two
// different output bytes.
type inputConflictErr struct {
	input []byte // the input sequence in question
	out1  byte   // the output already recorded for input
	out2  byte   // the output that was being added
}

// Error prints the input bytes and both outputs as decimal numbers.
func (e *inputConflictErr) Error() string {
	const format = "table conflict: %d maps to both %d and %d"
	return fmt.Sprintf(format, e.input, e.out1, e.out2)
}
// A node is an element in a Unicode decoding graph.
//
// Both arrays are indexed by the next input byte.
type node struct {
// chars[b] is the output byte for an input sequence that ends with b at this
// node, or 0 if no sequence ends here with b.
chars [256]uint8
// children[b] is the node for sequences that continue after b, or nil if
// there is none.
children [256]*node
}
// add adds the mapping from "in" to "out", creating additional nodes as
// necessary.
//
// It returns errEmptyString if in is empty, errZeroInput if in starts with a
// zero byte but out is not zero, errZeroOutput if out is zero, and an
// *inputConflictErr if in is already mapped to a different output. A zero
// input paired with a zero output is accepted without recording anything, and
// repeating an existing mapping is not an error.
func (n *node) add(out byte, in []byte) error {
	if len(in) == 0 {
		return errEmptyString
	}
	if in[0] == 0 {
		if out == 0 {
			// Zero to zero needs no table entry.
			return nil
		}
		return errZeroInput
	}
	if out == 0 {
		// Zero in chars means "no mapping", so it cannot be stored.
		return errZeroOutput
	}
	// Walk, and extend where needed, the path for every byte except the last.
	for _, b := range in[:len(in)-1] {
		child := n.children[b]
		if child == nil {
			child = new(node)
			n.children[b] = child
		}
		n = child
	}
	last := in[len(in)-1]
	cur := n.chars[last]
	if cur == 0 {
		n.chars[last] = out
		return nil
	}
	if cur == out {
		return nil
	}
	return &inputConflictErr{
		input: in,
		out1:  cur,
		out2:  out,
	}
}
// size returns the number of nodes in the graph rooted at n, counting n.
func (n *node) size() int {
	total := 1
	for i := range n.children {
		if child := n.children[i]; child != nil {
			total += child.size()
		}
	}
	return total
}
// writeTable writes n and its descendants into table, placing n in the
// 256-entry row at index pos, and returns the index of the next unused row.
//
// Each entry holds the output byte in its low 8 bits. When a child exists for
// that input byte, the child's row index is OR-ed in shifted left by 8; the
// result is converted to uint16, so row indices only fit if they are below 256.
func (n *node) writeTable(table decoderTable, pos int) int {
	base := pos * 256
	row := table[base : base+256 : base+256]
	next := pos + 1
	for i := range row {
		row[i] = uint16(n.chars[i])
	}
	for i, child := range n.children {
		if child == nil {
			continue
		}
		row[i] |= uint16(next << 8)
		next = child.writeTable(table, next)
	}
	return next
}
// genTable flattens the graph rooted at n into a decoderTable with one
// 256-entry row per node, the root being row 0. It panics if the number of
// rows written does not match the node count.
func (n *node) genTable() decoderTable {
	states := n.size()
	table := make(decoderTable, states*256)
	if end := n.writeTable(table, 0); end != states {
		panic("bad table")
	}
	return table
}
// A decoderTable is a flattened decoding graph: consecutive rows of 256
// entries, one row per state, indexed by input byte. The high 8 bits of an
// entry are a state number and the low 8 bits are an output character; a zero
// field means none.
type decoderTable []uint16

// dumpTransitions writes a human-readable listing of every non-zero entry,
// grouped by state, to w. It does not flush w.
func (t decoderTable) dumpTransitions(w *bufio.Writer) {
	states := len(t) >> 8
	for state := 0; state < states; state++ {
		fmt.Fprintf(w, "State $%02x\n", state)
		row := t[state<<8 : (state+1)<<8]
		for in, entry := range row {
			if entry == 0 {
				continue
			}
			fmt.Fprintf(w, " $%02x ->", in)
			if next := entry >> 8; next != 0 {
				fmt.Fprintf(w, " state $%02x", next)
			}
			if chr := entry & 255; chr != 0 {
				fmt.Fprintf(w, " char $%02x", chr)
			}
			w.WriteByte('\n')
		}
		w.WriteByte('\n')
	}
}

// toBytes serializes the table with each entry as two bytes, high byte first.
func (t decoderTable) toBytes() []byte {
	out := make([]byte, 0, len(t)*2)
	for _, x := range t {
		out = append(out, byte(x>>8), byte(x))
	}
	return out
}

View File

@ -2,4 +2,10 @@
This folder contains the script and region definitions for the Mac OS toolbox.
These constants are extracted from the `Script.h` file in Universal Interfaces.
- `extract.py`: Generate `script.csv` and `region.csv` from the `Script.h` file in Mac OS Universal Interfaces. The output of this program is checked in, so it does not need to be run again unless the logic is changed.
- `script.csv`: Constants identifying scripts.
- `region.csv`: Constants identifying localization regions.
- `charmap.csv`: Identifies character maps used by classic Mac OS. Each character map is given a name, a data file in the `../charmap` folder, and the script and, optionally, regions it corresponds to. This mapping is taken from the readme in the charmap folder. More specific mappings (which contain regions) in this file take precedence over less specific mappings (which do not contain regions). For example, Turkish is more specific than Roman.

24
scripts/charmap.csv Normal file
View File

@ -0,0 +1,24 @@
Name,File,Script,Regions
Roman,ROMAN.TXT,smRoman,
Turkish,TURKISH.TXT,smRoman,verTurkey
Croatian,CROATIAN.TXT,smRoman,verCroatia;verSlovenian;verYugoCroatian
Icelandic,ICELAND.TXT,smRoman,verIceland;verFaroeIsl
Romanian,ROMANIAN.TXT,smRoman,verRomania
Celtic,CELTIC.TXT,smRoman,verIreland;verScottishGaelic;verManxGaelic;verBreton;verWelsh
Gaelic,GAELIC.TXT,smRoman,verIrishGaelicScript
Greek,GREEK.TXT,smRoman,verGreece
Japanese,JAPANESE.TXT,smJapanese,
Chinese (Traditional),CHINTRAD.TXT,smTradChinese,
Korean,KOREAN.TXT,smKorean,
Arabic,ARABIC.TXT,smArabic,
Farsi,FARSI.TXT,smArabic,verIran
Hebrew,HEBREW.TXT,smHebrew,
Cyrillic,CYRILLIC.TXT,smCyrillic,
Devanagari,DEVANAGA.TXT,smDevanagari,
Gurmukhi,GURMUKHI.TXT,smGurmukhi,
Gujarati,GUJARATI.TXT,smGujarati,
Thai,,smThai,
Chinese (Simplified),CHINSIMP.TXT,smSimpChinese,
Tibetan,,smTibetan,
Inuit,INUIT.TXT,smEthiopic,verNunavut
Central European,CENTEURO.TXT,smCentralEuroRoman,
1 Name File Script Regions
2 Roman ROMAN.TXT smRoman
3 Turkish TURKISH.TXT smRoman verTurkey
4 Croatian CROATIAN.TXT smRoman verCroatia;verSlovenian;verYugoCroatian
5 Icelandic ICELAND.TXT smRoman verIceland;verFaroeIsl
6 Romanian ROMANIAN.TXT smRoman verRomania
7 Celtic CELTIC.TXT smRoman verIreland;verScottishGaelic;verManxGaelic;verBreton;verWelsh
8 Gaelic GAELIC.TXT smRoman verIrishGaelicScript
9 Greek GREEK.TXT smRoman verGreece
10 Japanese JAPANESE.TXT smJapanese
11 Chinese (Traditional) CHINTRAD.TXT smTradChinese
12 Korean KOREAN.TXT smKorean
13 Arabic ARABIC.TXT smArabic
14 Farsi FARSI.TXT smArabic verIran
15 Hebrew HEBREW.TXT smHebrew
16 Cyrillic CYRILLIC.TXT smCyrillic
17 Devanagari DEVANAGA.TXT smDevanagari
18 Gurmukhi GURMUKHI.TXT smGurmukhi
19 Gujarati GUJARATI.TXT smGujarati
20 Thai smThai
21 Chinese (Simplified) CHINSIMP.TXT smSimpChinese
22 Tibetan smTibetan
23 Inuit INUIT.TXT smEthiopic verNunavut
24 Central European CENTEURO.TXT smCentralEuroRoman