mirror of
https://github.com/depp/syncfiles.git
synced 2024-11-28 12:51:05 +00:00
Parse character map information
This adds associations between scripts, regions, and character maps, and parses them from a Go utility.
This commit is contained in:
parent
7bc44f4a5a
commit
022d11fa14
2
gen/.gitignore
vendored
2
gen/.gitignore
vendored
@ -1 +1 @@
|
||||
/macroman
|
||||
/macscript
|
||||
|
186
gen/data.go
Normal file
186
gen/data.go
Normal file
@ -0,0 +1,186 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// isIdent matches names that start with an ASCII letter and continue with
// ASCII letters, digits, or underscores.
var isIdent = regexp.MustCompile("^[a-zA-Z][_a-zA-Z0-9]*$")
|
||||
|
||||
// A dataError reports a problem with the contents of one of the data files,
// together with the location of the problem when it is known.
type dataError struct {
	filename string
	line     int // 0 if the line is unknown
	column   int // 0 if the column is unknown; ignored when line is 0
	err      error
}

// Error formats the error as "filename[:line[:column]]: message".
func (e *dataError) Error() string {
	loc := []string{e.filename}
	if e.line != 0 {
		loc = append(loc, strconv.Itoa(e.line))
		if e.column != 0 {
			loc = append(loc, strconv.Itoa(e.column))
		}
	}
	return strings.Join(loc, ":") + ": " + e.err.Error()
}
|
||||
|
||||
// readHeader reads the header row of a CSV file and checks that columns exist with the given names.
|
||||
func readHeader(filename string, r *csv.Reader, names ...string) error {
|
||||
row, err := r.Read()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for i, name := range names {
|
||||
if len(row) <= i {
|
||||
line, _ := r.FieldPos(0)
|
||||
return &dataError{filename, line, 0, fmt.Errorf("missing column: %q", name)}
|
||||
}
|
||||
cname := row[i]
|
||||
if !strings.EqualFold(name, cname) {
|
||||
line, col := r.FieldPos(i)
|
||||
return &dataError{filename, line, col, fmt.Errorf("column name is %q, expected %q", cname, name)}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// A constmap is a map between names and integer values, indexed in both
// directions.
type constmap struct {
	names  map[string]int // value of each name
	values map[int]string // a name for each value
}
|
||||
|
||||
// readConsts reads a CSV file containing a map between names and integer values.
|
||||
func readConsts(filename string) (m constmap, err error) {
|
||||
fp, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return m, err
|
||||
}
|
||||
defer fp.Close()
|
||||
r := csv.NewReader(fp)
|
||||
r.ReuseRecord = true
|
||||
if err := readHeader(filename, r, "name", "value"); err != nil {
|
||||
return m, err
|
||||
}
|
||||
m.names = make(map[string]int)
|
||||
m.values = make(map[int]string)
|
||||
for {
|
||||
row, err := r.Read()
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
return m, err
|
||||
}
|
||||
if len(row) < 2 {
|
||||
line, _ := r.FieldPos(0)
|
||||
return m, &dataError{filename, line, 0, errors.New("expected at least two columns")}
|
||||
}
|
||||
name := row[0]
|
||||
if !isIdent.MatchString(name) {
|
||||
line, col := r.FieldPos(0)
|
||||
return m, &dataError{filename, line, col, fmt.Errorf("invalid name: %q", name)}
|
||||
}
|
||||
if _, e := m.names[name]; e {
|
||||
line, col := r.FieldPos(0)
|
||||
return m, &dataError{filename, line, col, fmt.Errorf("duplicate name: %q", name)}
|
||||
}
|
||||
value, err := strconv.Atoi(row[1])
|
||||
if err != nil {
|
||||
line, col := r.FieldPos(1)
|
||||
return m, &dataError{filename, line, col, fmt.Errorf("invalid value: %v", err)}
|
||||
}
|
||||
m.names[name] = value
|
||||
if _, e := m.values[value]; !e {
|
||||
m.values[value] = name
|
||||
}
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// A charmapinfo describes one row of the character map table.
type charmapinfo struct {
	name    string   // first column: name of the character map
	file    string   // second column: data file, may be empty
	script  string   // third column: script constant name
	regions []string // fourth column split on ";", nil if the column is empty
}
|
||||
|
||||
// readCharmaps reads and parses the charmaps.csv file.
|
||||
func readCharmaps(filename string, scripts, regions map[string]int) ([]charmapinfo, error) {
|
||||
fp, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer fp.Close()
|
||||
r := csv.NewReader(fp)
|
||||
r.ReuseRecord = true
|
||||
if err := readHeader(filename, r, "name", "file", "script", "regions"); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var arr []charmapinfo
|
||||
for {
|
||||
row, err := r.Read()
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
if len(row) < 3 {
|
||||
line, _ := r.FieldPos(0)
|
||||
return nil, &dataError{filename, line, 0, errors.New("expected at least three columns")}
|
||||
}
|
||||
ifo := charmapinfo{
|
||||
name: row[0],
|
||||
file: row[1],
|
||||
script: row[2],
|
||||
}
|
||||
if _, e := scripts[ifo.script]; !e {
|
||||
line, col := r.FieldPos(2)
|
||||
return nil, &dataError{filename, line, col, fmt.Errorf("unknown script: %q", ifo.script)}
|
||||
}
|
||||
if len(row) >= 4 && row[3] != "" {
|
||||
ifo.regions = strings.Split(row[3], ";")
|
||||
for _, region := range ifo.regions {
|
||||
if _, e := regions[region]; !e {
|
||||
line, col := r.FieldPos(3)
|
||||
return nil, &dataError{filename, line, col, fmt.Errorf("unknown region: %q", region)}
|
||||
}
|
||||
}
|
||||
}
|
||||
arr = append(arr, ifo)
|
||||
}
|
||||
return arr, nil
|
||||
}
|
||||
|
||||
// A scriptdata holds everything read from the data files in the scripts
// directory.
type scriptdata struct {
	scripts  constmap      // contents of scripts/script.csv
	regions  constmap      // contents of scripts/region.csv
	charmaps []charmapinfo // contents of scripts/charmap.csv
}
|
||||
|
||||
func readData() (d scriptdata, err error) {
|
||||
d.scripts, err = readConsts("scripts/script.csv")
|
||||
if err != nil {
|
||||
return d, err
|
||||
}
|
||||
d.regions, err = readConsts("scripts/region.csv")
|
||||
if err != nil {
|
||||
return d, err
|
||||
}
|
||||
d.charmaps, err = readCharmaps("scripts/charmap.csv", d.scripts.names, d.regions.names)
|
||||
return
|
||||
}
|
@ -1,8 +1,5 @@
|
||||
module moria.us/macroman
|
||||
module moria.us/macscript
|
||||
|
||||
go 1.16
|
||||
|
||||
require (
|
||||
github.com/depp/packbits v1.0.0
|
||||
golang.org/x/text v0.3.5
|
||||
)
|
||||
require golang.org/x/text v0.3.5
|
||||
|
@ -1,5 +1,3 @@
|
||||
github.com/depp/packbits v1.0.0 h1:KqnhCkzI5WbWuwKsAgPI/JLF11mBXBEvYADiwKZMXi0=
|
||||
github.com/depp/packbits v1.0.0/go.mod h1:wDV3NXiMB4a+KztSJ93UMH9cBKj5cEGooAbgRXTpQ78=
|
||||
golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
|
||||
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
|
139
gen/macroman.go
139
gen/macroman.go
@ -1,139 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/depp/packbits"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// Command-line flags.
var (
	// flagDumpSequences holds the value of the -dump-sequences flag.
	flagDumpSequences bool
	// flagDumpTransitions holds the value of the -dump-transitions flag.
	flagDumpTransitions bool
)

// init registers the command-line flags on the default flag set. Both flags
// default to false.
func init() {
	flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")
	flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
}
|
||||
|
||||
// characters gives the Unicode code point for each of the 256 byte values of
// the character set. NOTE(review): presumably Mac OS Roman, going by the file
// name; confirm against the reference table.
var characters [256]rune

// init fills in the characters table. Bytes 0-127 map to the code point with
// the same value; bytes 128-255 are taken from the hichars table.
func init() {
	// hichars[i] is the code point for byte value i+128.
	hichars := [128]uint16{
		0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
		0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
		0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
		0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
		0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
		0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
		0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
		0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
		0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
		0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
		0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
		0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
		0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
		0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
		0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
		0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
	}
	for i := 0; i < 128; i++ {
		characters[i] = rune(i)
	}
	for i, c := range hichars {
		characters[i+128] = rune(c)
	}
}
|
||||
|
||||
var (
|
||||
// lineBreaks is the set of all sequences recognized as line breaks.
|
||||
lineBreaks = [][]byte{{'\n'}, {'\r'}, {'\r', '\n'}}
|
||||
// normForms is the set of Unicode normalization forms recognized.
|
||||
normForms = []norm.Form{norm.NFC, norm.NFD}
|
||||
)
|
||||
|
||||
func makeConverter(cmap *[256]rune) (*node, error) {
|
||||
var n node
|
||||
// Special case for CR and LF.
|
||||
for _, b := range lineBreaks {
|
||||
if err := n.add('\r', b); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
for m, u := range *cmap {
|
||||
if m == '\r' || m == '\n' {
|
||||
continue
|
||||
}
|
||||
us := string(u)
|
||||
for _, form := range normForms {
|
||||
bytes := []byte(form.String(us))
|
||||
fmt.Fprintf(os.Stderr, "%d -> %v\n", u, bytes)
|
||||
if err := n.add(byte(m), bytes); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
return &n, nil
|
||||
}
|
||||
|
||||
// printData writes data to f as a C source fragment: a generated-file banner,
// a FROM_UNIX_DATALEN definition with the value ulen, and an unsigned char
// array named kFromUnixData that holds data as decimal literals. The array
// body is wrapped so that no line is longer than 80 characters.
//
// It returns the first write error encountered.
func printData(f *os.File, ulen int, data []byte) error {
	if _, err := fmt.Fprint(f, "/* This file is automatically generated. */\n"+
		"// clang-format off\n"); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "#define FROM_UNIX_DATALEN %d\n", ulen); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "static const unsigned char kFromUnixData[%d] = {\n", len(data)); err != nil {
		return err
	}
	var line []byte
	for _, n := range data {
		// Remember where this entry starts so that it can be taken off again
		// if it pushes the line past 80 characters.
		sv := len(line)
		line = strconv.AppendUint(line, uint64(n), 10)
		line = append(line, ',')
		if len(line) > 80 {
			// Flush the line without this entry, then start the next line
			// with it.
			line = append(line[:sv], '\n')
			if _, err := f.Write(line); err != nil {
				return err
			}
			line = strconv.AppendUint(line[:0], uint64(n), 10)
			line = append(line, ',')
		}
	}
	line = append(line, '\n')
	if _, err := f.Write(line); err != nil {
		return err
	}
	// The closing brace belongs in f with the rest of the array, not on
	// standard output.
	_, err := fmt.Fprint(f, "};\n")
	return err
}
|
||||
|
||||
func mainE() error {
|
||||
n, err := makeConverter(&characters)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
table := n.genTable()
|
||||
if flagDumpTransitions {
|
||||
w := bufio.NewWriter(os.Stderr)
|
||||
table.dumpTransitions(w)
|
||||
w.Flush()
|
||||
}
|
||||
bytes := table.toBytes()
|
||||
bits := packbits.Pack(bytes)
|
||||
return printData(os.Stdout, len(bytes), bits)
|
||||
}
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
if err := mainE(); err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Error:", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
50
gen/main.go
Normal file
50
gen/main.go
Normal file
@ -0,0 +1,50 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// Command-line flags.
var (
	// flagDumpSequences holds the value of the -dump-sequences flag.
	flagDumpSequences bool
	// flagDumpTransitions holds the value of the -dump-transitions flag.
	flagDumpTransitions bool
)

// init registers the command-line flags on the default flag set. Both flags
// default to false.
func init() {
	flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")
	flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
}
|
||||
|
||||
// getSrcdir returns the parent of the directory that contains the running
// executable.
func getSrcdir() (string, error) {
	exe, err := os.Executable()
	if err != nil {
		return "", err
	}
	bindir := filepath.Dir(exe)
	return filepath.Dir(bindir), nil
}
|
||||
|
||||
func mainE() error {
|
||||
srcdir, err := getSrcdir()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not find source dir: %v", err)
|
||||
}
|
||||
if err := os.Chdir(srcdir); err != nil {
|
||||
return err
|
||||
}
|
||||
d, err := readData()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_ = d
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
if err := mainE(); err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Error:", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
136
gen/table.go
136
gen/table.go
@ -1,136 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// Errors for mappings that cannot be added to a decoding graph.
var (
	// errEmptyString reports an empty input sequence.
	errEmptyString = errors.New("empty input")
	// errZeroInput reports a zero byte in the input sequence.
	errZeroInput = errors.New("zero byte input")
	// errZeroOutput reports a zero output byte.
	errZeroOutput = errors.New("zero byte output")
)
|
||||
|
||||
// An inputConflictErr reports that one input sequence was given two different
// output bytes.
type inputConflictErr struct {
	input []byte // the input sequence
	out1  byte   // the output already recorded for input
	out2  byte   // the conflicting output
}

// Error describes the conflict, printing the input and both outputs as
// decimal byte values.
func (e *inputConflictErr) Error() string {
	const format = "table conflict: %d maps to both %d and %d"
	return fmt.Sprintf(format, e.input, e.out1, e.out2)
}
|
||||
|
||||
// A node is an element in a Unicode decoding graph. Both arrays are indexed
// by the next input byte.
type node struct {
	// chars holds the output byte for an input sequence that ends at this
	// node, or 0 if there is none.
	chars [256]uint8
	// children holds the node for input sequences that continue past this
	// byte, or nil if there is none.
	children [256]*node
}
|
||||
|
||||
// add adds the mapping from "in" to "out", creating additional nodes as
|
||||
// necessary.
|
||||
func (n *node) add(out byte, in []byte) error {
|
||||
if len(in) == 0 {
|
||||
return errEmptyString
|
||||
}
|
||||
if in[0] == 0 {
|
||||
if out == 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
if out == 0 {
|
||||
return errZeroOutput
|
||||
}
|
||||
for _, b := range in[:len(in)-1] {
|
||||
old := n
|
||||
n = n.children[b]
|
||||
if n == nil {
|
||||
n = new(node)
|
||||
old.children[b] = n
|
||||
}
|
||||
}
|
||||
b := in[len(in)-1]
|
||||
x := n.chars[b]
|
||||
if x == 0 {
|
||||
n.chars[b] = out
|
||||
return nil
|
||||
}
|
||||
if x == out {
|
||||
return nil
|
||||
}
|
||||
return &inputConflictErr{
|
||||
input: in,
|
||||
out1: x,
|
||||
out2: out,
|
||||
}
|
||||
}
|
||||
|
||||
func (n *node) size() int {
|
||||
sz := 1
|
||||
for _, c := range n.children {
|
||||
if c != nil {
|
||||
sz += c.size()
|
||||
}
|
||||
}
|
||||
return sz
|
||||
}
|
||||
|
||||
// writeTable fills in the 256-entry row at index pos of table from n, then
// writes the rows for the nodes below n into the rows that follow. It returns
// the index of the first row that was not written.
func (n *node) writeTable(table decoderTable, pos int) int {
	// The capacity is capped so that this row cannot extend into the next.
	data := table[pos*256 : pos*256+256 : pos*256+256]
	pos++
	// The low byte of each entry is the output character.
	for i, c := range n.chars {
		data[i] = uint16(c)
	}
	// The high byte of each entry is the row index of the child node, which
	// is written at the next free row.
	for i, c := range n.children {
		if c != nil {
			// NOTE(review): uint16(pos << 8) drops the high bits of pos once
			// pos reaches 256, and nothing here checks for that.
			data[i] |= uint16(pos << 8)
			pos = c.writeTable(table, pos)
		}
	}
	return pos
}
|
||||
|
||||
func (n *node) genTable() decoderTable {
|
||||
sz := n.size()
|
||||
table := make(decoderTable, 256*sz)
|
||||
pos := n.writeTable(table, 0)
|
||||
if pos != sz {
|
||||
panic("bad table")
|
||||
}
|
||||
return table
|
||||
}
|
||||
|
||||
// A decoderTable is a flattened decoding graph made of 256-entry rows, one
// per state, indexed by input byte. The high byte of an entry is the next
// state and the low byte is the output character; a zero entry is empty.
type decoderTable []uint16

// dumpTransitions writes a readable listing of every non-empty entry to w,
// grouped by state. Write errors are left to be reported by w.Flush.
func (t decoderTable) dumpTransitions(w *bufio.Writer) {
	for state := 0; state < len(t)/256; state++ {
		fmt.Fprintf(w, "State $%02x\n", state)
		row := t[state*256 : state*256+256]
		for in, entry := range row {
			if entry == 0 {
				continue
			}
			fmt.Fprintf(w, " $%02x ->", in)
			if next := entry >> 8; next != 0 {
				fmt.Fprintf(w, " state $%02x", next)
			}
			if chr := entry & 0xff; chr != 0 {
				fmt.Fprintf(w, " char $%02x", chr)
			}
			w.WriteByte('\n')
		}
		w.WriteByte('\n')
	}
}

// toBytes serializes the table with each entry as two bytes, high byte first.
func (t decoderTable) toBytes() []byte {
	out := make([]byte, 0, 2*len(t))
	for _, entry := range t {
		out = append(out, byte(entry>>8), byte(entry))
	}
	return out
}
|
@ -2,4 +2,10 @@
|
||||
|
||||
This folder contains the script and region definitions for the Mac OS toolbox.
|
||||
|
||||
These constants are extracted from the `Script.h` file in Universal Interfaces.
|
||||
- `extract.py`: Generate `script.csv` and `region.csv` from the `Script.h` file in Mac OS Universal Interfaces. The output of this program is checked in, so it does not need to be run again unless the logic is changed.
|
||||
|
||||
- `script.csv`: Constants identifying scripts.
|
||||
|
||||
- `region.csv`: Constants identifying localization regions.
|
||||
|
||||
- `charmap.csv`: Identifies character maps used by classic Mac OS. Each character map is given a name, a data file in the `../charmap` folder, and the script and, optionally, regions it corresponds to. This mapping is taken from the readme in the charmap folder. More specific mappings (which contain regions) in this file take precedence over less specific mappings (which do not contain regions). For example, Turkish is more specific than Roman.
|
||||
|
24
scripts/charmap.csv
Normal file
24
scripts/charmap.csv
Normal file
@ -0,0 +1,24 @@
|
||||
Name,File,Script,Regions
|
||||
Roman,ROMAN.TXT,smRoman,
|
||||
Turkish,TURKISH.TXT,smRoman,verTurkey
|
||||
Croatian,CROATIAN.TXT,smRoman,verCroatia;verSlovenian;verYugoCroatian
|
||||
Icelandic,ICELAND.TXT,smRoman,verIceland;verFaroeIsl
|
||||
Romanian,ROMANIAN.TXT,smRoman,verRomania
|
||||
Celtic,CELTIC.TXT,smRoman,verIreland;verScottishGaelic;verManxGaelic;verBreton;verWelsh
|
||||
Gaelic,GAELIC.TXT,smRoman,verIrishGaelicScript
|
||||
Greek,GREEK.TXT,smRoman,verGreece
|
||||
Japanese,JAPANESE.TXT,smJapanese,
|
||||
Chinese (Traditional),CHINTRAD.TXT,smTradChinese,
|
||||
Korean,KOREAN.TXT,smKorean,
|
||||
Arabic,ARABIC.TXT,smArabic,
|
||||
Farsi,FARSI.TXT,smArabic,verIran
|
||||
Hebrew,HEBREW.TXT,smHebrew,
|
||||
Cyrillic,CYRILLIC.TXT,smCyrillic,
|
||||
Devanagari,DEVANAGA.TXT,smDevanagari,
|
||||
Gurmukhi,GURMUKHI.TXT,smGurmukhi,
|
||||
Gujarati,GUJARATI.TXT,smGujarati,
|
||||
Thai,,smThai,
|
||||
Chinese (Simplified),CHINSIMP.TXT,smSimpChinese,
|
||||
Tibetan,,smTibetan,
|
||||
Inuit,INUIT.TXT,smEthiopic,verNunavut
|
||||
Central European,CENTEURO.TXT,smCentralEuroRoman,
|
|
Loading…
Reference in New Issue
Block a user