diff --git a/Formats.md b/Formats.md new file mode 100644 index 0000000..31bd95b --- /dev/null +++ b/Formats.md @@ -0,0 +1,16 @@ +# Table Formats + +Each character map table starts with a single nonzero byte, indicating the table's format. + +## Extended ASCII + +Format 1 is for "extended ASCII". Encoded values 0-127 are identical to ASCII, and encoded values 128-255 are mapped to single Unicode characters. + +The table contains 128 entries, for encoded values 128-255, with the following format: + + u8 Length of Unicode character + u8[] Unicode character, UTF-8 + u8 Length of normalized Unicode character, may be zero + u8[] Unicode character in NFD normal form, UTF-8 + +The second copy is only present if the character's NFD form differs from the character itself; otherwise its length is zero. Entries with no mapping store zero for both lengths. diff --git a/gen/charmap/charmap.go b/gen/charmap/charmap.go index f77250f..bec298b 100644 --- a/gen/charmap/charmap.go +++ b/gen/charmap/charmap.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "io" + "os" "strconv" "unicode" ) @@ -38,19 +39,7 @@ var ( errColumns = errors.New("expected 2 columns") errUnicode = errors.New("invalid unicode sequence") errCodePointRange = errors.New("code point out of range") -) - -// An EntryType is a type of entry in a character mapping. -type EntryType uint32 - -const ( - // EntryOneByte is an entry where a one-byte character is mapped to Unicode. - EntryOneByte EntryType = iota - // EntryTwoByte is an entry where a two-byte character is mapped to Unicode. - EntryTwoByte - // EntryDigraph is an entry where two one-byte characters are mapped to - // Unicode. - EntryDigraph + errDuplicate = errors.New("duplicate entry") ) // A Direction is a direction context for Unicode characters. @@ -68,34 +57,37 @@ const ( // An Entry is a single entry in a character map. type Entry struct { - Type EntryType - Encoded []byte Direction Direction - Unicode []byte + Unicode []rune } -// A Reader is a character map reader. 
-type Reader struct { - name string - lineno int - scanner *bufio.Scanner - encoded [2]byte - unicode bytes.Buffer +// A Charmap is a character map, mapping characters from a platform encoding to +// Unicode. +type Charmap struct { + // Pairs of valid one-byte characters that have an alternate Unicode + // representation. + Digraph map[[2]byte]Entry + + // Valid single-byte characters. + OneByte map[byte]Entry + + // Valid two-byte characters. + TwoByte map[[2]byte]Entry } -// NewReader creates a character map reader. -func NewReader(r io.Reader, name string) *Reader { - return &Reader{ - name: name, - scanner: bufio.NewScanner(r), +func (m *Charmap) parseLine(line []byte) error { + // Strip the trailing '#' comment, if any. + if i := bytes.IndexByte(line, '#'); i != -1 { + line = line[:i] + } + if len(line) == 0 { + return nil } -} -func (r *Reader) parseEntry(line []byte) (e Entry, err error) { // Split into two columns. i := bytes.IndexByte(line, '\t') if i == -1 || i == 0 { - return e, errColumns + return errColumns } c1 := line[:i] c2 := line[i+1:] @@ -103,48 +95,15 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) { c2 = c2[:i] } if len(c2) == 0 { - return e, errColumns + return errColumns } - // Parse columns. 
- switch len(c1) { - case 4: - if string(c1[0:2]) != "0x" { - return e, errBadType - } - if _, err := hex.Decode(r.encoded[:], c1[2:]); err != nil { - return e, err - } - e.Type = EntryOneByte - e.Encoded = r.encoded[0:1] - case 6: - if string(c1[0:2]) != "0x" { - return e, errBadType - } - if _, err := hex.Decode(r.encoded[:], c1[2:]); err != nil { - return e, err - } - e.Type = EntryTwoByte - e.Encoded = r.encoded[0:2] - case 9: - if string(c1[0:2]) != "0x" || string(c1[4:7]) != "+0x" { - return e, errBadType - } - if _, err := hex.Decode(r.encoded[0:1], c1[2:4]); err != nil { - return e, err - } - if _, err := hex.Decode(r.encoded[1:2], c1[7:9]); err != nil { - return e, err - } - e.Type = EntryDigraph - e.Encoded = r.encoded[0:2] - default: - return e, errBadType - } + // Parse the optional direction context, then the Unicode sequence. + var e Entry if c2[0] == '<' { i := bytes.IndexByte(c2, '+') if i == -1 { - return e, errors.New("invalid Unicode string") + return errors.New("invalid Unicode string") } ctx := c2[:i] c2 = c2[i+1:] @@ -154,10 +113,11 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) { case "": e.Direction = DirectionRL default: - return e, fmt.Errorf("unknown context: %q", ctx) + return fmt.Errorf("unknown context: %q", ctx) } } - r.unicode.Reset() + var ubuf [8]rune + var ulen int for c2 != nil { var cp []byte if i := bytes.IndexByte(c2, '+'); i != -1 { @@ -168,44 +128,104 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) { c2 = nil } if len(cp) < 2 || string(cp[0:2]) != "0x" { - return e, errUnicode + return errUnicode } cp = cp[2:] x, err := strconv.ParseUint(string(cp), 16, 32) if err != nil { - return e, err + return err } if x > unicode.MaxRune { - return e, errCodePointRange + return errCodePointRange } - r.unicode.WriteRune(rune(x)) + if ulen >= len(ubuf) { + return errors.New("Unicode sequence too long") + } + ubuf[ulen] = rune(x) + ulen++ } - e.Unicode = r.unicode.Bytes() - return e, nil + e.Unicode = make([]rune, ulen) + 
copy(e.Unicode, ubuf[:]) + + // Parse the platform-encoded key and store the entry under it. + switch len(c1) { + case 4: + if string(c1[0:2]) != "0x" { + return errBadType + } + var k [1]byte + if _, err := hex.Decode(k[:], c1[2:]); err != nil { + return err + } + ch := k[0] + if m.OneByte == nil { + m.OneByte = make(map[byte]Entry) + } + if _, ok := m.OneByte[ch]; ok { + return errDuplicate + } + m.OneByte[ch] = e + case 6: + if string(c1[0:2]) != "0x" { + return errBadType + } + var k [2]byte + if _, err := hex.Decode(k[:], c1[2:]); err != nil { + return err + } + if m.TwoByte == nil { + m.TwoByte = make(map[[2]byte]Entry) + } + if _, ok := m.TwoByte[k]; ok { + return errDuplicate + } + m.TwoByte[k] = e + case 9: + if string(c1[0:2]) != "0x" || string(c1[4:7]) != "+0x" { + return errBadType + } + var k [2]byte + if _, err := hex.Decode(k[0:1], c1[2:4]); err != nil { + return err + } + if _, err := hex.Decode(k[1:2], c1[7:9]); err != nil { + return err + } + if m.Digraph == nil { + m.Digraph = make(map[[2]byte]Entry) + } + if _, ok := m.Digraph[k]; ok { + return errDuplicate + } + m.Digraph[k] = e + default: + return errBadType + } + + return nil } -// Next returns the next entry in the character map. Returns io.EOF if there are -// no more entries. -func (r *Reader) Next() (e Entry, err error) { - for { - if !r.scanner.Scan() { - if err := r.scanner.Err(); err != nil { - return e, err - } - return e, io.EOF - } - r.lineno++ - line := r.scanner.Bytes() - // Remove comment. - if i := bytes.IndexByte(line, '#'); i != -1 { - line = line[:i] - } - if len(line) != 0 { - e, err = r.parseEntry(line) - if err != nil { - err = &Error{r.name, r.lineno, err} - } - return +// Read reads a charmap from a stream. 
+func Read(r io.Reader, name string) (*Charmap, error) { + sc := bufio.NewScanner(r) + var m Charmap + for lineno := 1; sc.Scan(); lineno++ { + if err := m.parseLine(sc.Bytes()); err != nil { + return nil, &Error{name, lineno, err} } } + if err := sc.Err(); err != nil { + return nil, err + } + return &m, nil +} + +// ReadFile reads a charmap from the named file; name is also recorded in parse errors. +func ReadFile(name string) (*Charmap, error) { + fp, err := os.Open(name) + if err != nil { + return nil, err + } + defer fp.Close() + return Read(fp, name) } diff --git a/gen/main.go b/gen/main.go index fa695bd..545d6c9 100644 --- a/gen/main.go +++ b/gen/main.go @@ -1,24 +1,20 @@ package main import ( + "errors" "flag" "fmt" + "io/ioutil" "os" "path/filepath" + "strings" + + "moria.us/macscript/charmap" + "moria.us/macscript/table" ) const header = "/* This file is automatically generated. */\n" -var ( - flagDumpSequences bool - flagDumpTransitions bool -) - -func init() { - flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences") - flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables") -} - func getSrcdir() (string, error) { exe, err := os.Executable() if err != nil { @@ -35,15 +31,53 @@ func mainE() error { if err := os.Chdir(srcdir); err != nil { return err } + + // Read metadata. d, err := readData() if err != nil { return err } + + // Compile and emit charmap data. 
+ cms := make([]string, len(d.charmaps)) + var hascmap bool + for i, c := range d.charmaps { + if c.file == "" { + continue + } + cm, err := charmap.ReadFile(filepath.Join("charmap", c.file)) + if err != nil { + return err + } + t, err := table.Create(cm) + if err != nil { + if e, ok := err.(*table.UnsupportedError); ok { + fmt.Fprintf(os.Stderr, "Warning: unsupported charmap %q: %s\n", c.file, e.Message) + continue + } + // Return the failure so that main prints it and sets the exit status. + return fmt.Errorf("%s: %v", c.file, err) + } + data := t.Data() + name := "charmap_" + strings.ToLower(strings.TrimSuffix(c.file, ".TXT")) + ".dat" + fpath := filepath.Join("src", name) + fmt.Fprintln(os.Stderr, "Writing:", fpath) + if err := ioutil.WriteFile(fpath, data, 0666); err != nil { + return err + } + cms[i] = name + hascmap = true + } + if !hascmap { + return errors.New("could not compile any character map") + } + + // Write generated output. m := genMap(&d) if err := writeMap(&d, m, "src/getcharmap.c"); err != nil { return err } - if err := writeRez(&d, "src/charmaps.r"); err != nil { + if err := writeRez(&d, cms, "src/charmaps.r"); err != nil { return err } return nil @@ -51,6 +85,10 @@ func mainE() error { func main() { flag.Parse() + if args := flag.Args(); len(args) != 0 { + fmt.Fprintf(os.Stderr, "Error: unexpected argument: %q\n", args[0]) + os.Exit(2) + } if err := mainE(); err != nil { fmt.Fprintln(os.Stderr, "Error:", err) os.Exit(1) diff --git a/gen/rez.go b/gen/rez.go index 0422dc9..3e3b514 100644 --- a/gen/rez.go +++ b/gen/rez.go @@ -41,7 +41,7 @@ func constStrings(c *constmap) []string { return r } -func writeRez(d *scriptdata, filename string) error { +func writeRez(d *scriptdata, charmaps []string, filename string) error { fmt.Fprintln(os.Stderr, "Writing:", filename) fp, err := os.Create(filename) if err != nil { @@ -52,9 +52,14 @@ func writeRez(d *scriptdata, filename string) error { w.WriteString(header) w.WriteString("#include \"resources.h\"\n") - writeStrings(w, "rSTRS_Charmaps", 
charmapNames(d)) - writeStrings(w, "rSTRS_Scripts", constStrings(&d.scripts)) - writeStrings(w, "rSTRS_Regions", constStrings(&d.regions)) + writeStrings(w, `rSTRS_Charmaps, "Character Maps"`, charmapNames(d)) + writeStrings(w, `rSTRS_Scripts, "Scripts"`, constStrings(&d.scripts)) + writeStrings(w, `rSTRS_Regions, "Regions"`, constStrings(&d.regions)) + for i, cm := range charmaps { + if cm != "" { + fmt.Fprintf(w, "read 'cmap' (%d, %q) %q;\n", 128+i, d.charmaps[i].name, cm) + } + } if err := w.Flush(); err != nil { return err diff --git a/gen/table/table.go b/gen/table/table.go new file mode 100644 index 0000000..1e481c5 --- /dev/null +++ b/gen/table/table.go @@ -0,0 +1,95 @@ +package table + +import ( + "bytes" + "errors" + "fmt" + "unicode/utf8" + + "golang.org/x/text/unicode/norm" + + "moria.us/macscript/charmap" +) + +// An UnsupportedError indicates that the charmap is not supported by the +// conversion routines. +type UnsupportedError struct { + Message string +} + +func (e *UnsupportedError) Error() string { + return "unsupported charmap: " + e.Message +} + +// Table format identifiers, emitted as the first byte of a table's data. 
+const ( + extendedASCIITable = iota + 1 +) + +type Table interface { + Data() []byte +} + +func Create(m *charmap.Charmap) (Table, error) { + if m.OneByte == nil { + return nil, errors.New("missing one-byte map") + } + if m.TwoByte != nil { + return nil, &UnsupportedError{"multibyte encoding"} + } + if m.Digraph != nil { + return nil, &UnsupportedError{"contains digraphs"} + } + var t ExtendedASCII + for c, e := range m.OneByte { + if e.Direction != charmap.DirectionAny { + return nil, &UnsupportedError{ + fmt.Sprintf("character has bidirectional context: 0x%02x", c)} + } + var u rune + switch len(e.Unicode) { + case 0: + case 1: + u = e.Unicode[0] + default: + return nil, &UnsupportedError{ + fmt.Sprintf("character maps to multiple code points: 0x%02x", c)} + } + if c < 128 { + if u != rune(c) { + return nil, &UnsupportedError{ + fmt.Sprintf("character is not equal to ASCII equivalent: 0x%02x", c)} + } + } else { + t.HighCharacters[c-128] = u + } + } + return &t, nil +} + +// An ExtendedASCII is a table mapping encoded values 128-255 to Unicode. +type ExtendedASCII struct { + HighCharacters [128]rune +} + +func (t *ExtendedASCII) Data() []byte { + var ubuf [4]byte + var nbuf [16]byte + d := []byte{extendedASCIITable} + for _, u := range t.HighCharacters { + var udata, ndata []byte + if u != 0 { + n := utf8.EncodeRune(ubuf[:], u) + udata = ubuf[:n] + ndata = norm.NFD.Append(nbuf[:0], udata...) + if bytes.Equal(udata, ndata) { + ndata = nil + } + } + d = append(d, byte(len(udata))) + d = append(d, udata...) + d = append(d, byte(len(ndata))) + d = append(d, ndata...) + } + return d +} diff --git a/src/.gitignore b/src/.gitignore index aa02f65..1282dca 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -1,2 +1,3 @@ /getcharmap.c /charmaps.r +/*.dat