Generate conversion data for extended ASCII

This creates conversion tables for 11 of the 21 character sets.
This commit is contained in:
Dietrich Epp 2022-03-15 16:19:56 -04:00
parent f49e81388c
commit f94a65d245
6 changed files with 288 additions and 113 deletions

16
Formats.md Normal file
View File

@ -0,0 +1,16 @@
# Table Formats
Each character map table starts with a single nonzero byte, indicating the table's format.
## Extended ASCII
Format 1 is for "extended ASCII". Encoded values 0-127 are identical to ASCII, and encoded values 128-255 are mapped to single Unicode characters.
The table contains 128 entries, one for each encoded value 128-255, in the following format:

- `u8`: length of the Unicode character, in bytes
- `u8[]`: the Unicode character, encoded as UTF-8
- `u8`: length of the normalized Unicode character, in bytes (may be zero)
- `u8[]`: the Unicode character in NFD normal form, encoded as UTF-8
The second copy of the character is present only when its NFD normal form differs from the original (that is, when the character decomposes); otherwise the second length byte is zero and no data follows.
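A hand-worked illustration of three entries (the characters shown are arbitrary examples chosen for this document, not part of the format):

```
value mapped to é (U+00E9):   02 C3 A9  03 65 CC 81   (NFD form is "e" + U+0301)
value mapped to Æ (U+00C6):   02 C3 86  00             (NFD form is identical, so omitted)
unmapped value:               00 00
```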

View File

@ -8,6 +8,7 @@ import (
"errors"
"fmt"
"io"
"os"
"strconv"
"unicode"
)
@ -38,19 +39,7 @@ var (
errColumns = errors.New("expected 2 columns")
errUnicode = errors.New("invalid unicode sequence")
errCodePointRange = errors.New("code point out of range")
)
// An EntryType is a type of entry in a character mapping.
type EntryType uint32
const (
// EntryOneByte is an entry where a one-byte character is mapped to Unicode.
EntryOneByte EntryType = iota
// EntryTwoByte is an entry where a two-byte character is mapped to Unicode.
EntryTwoByte
// EntryDigraph is an entry where two one-byte characters are mapped to
// Unicode.
EntryDigraph
errDuplicate = errors.New("duplicate entry")
)
// A Direction is a direction context for Unicode characters.
@ -68,34 +57,37 @@ const (
// An Entry is a single entry in a character map.
type Entry struct {
Type EntryType
Encoded []byte
Direction Direction
Unicode []byte
Unicode []rune
}
// A Reader is a character map reader.
type Reader struct {
name string
lineno int
scanner *bufio.Scanner
encoded [2]byte
unicode bytes.Buffer
// A Charmap is a character map, mapping characters from a platform encoding to
// Unicode.
type Charmap struct {
// Pairs of valid one-byte characters that have an alternate Unicode
// representation.
Digraph map[[2]byte]Entry
// Valid single-byte characters.
OneByte map[byte]Entry
// Valid two-byte characters.
TwoByte map[[2]byte]Entry
}
// NewReader creates a character map reader.
func NewReader(r io.Reader, name string) *Reader {
return &Reader{
name: name,
scanner: bufio.NewScanner(r),
func (m *Charmap) parseLine(line []byte) error {
// Remove comment.
if i := bytes.IndexByte(line, '#'); i != -1 {
line = line[:i]
}
if len(line) == 0 {
return nil
}
}
func (r *Reader) parseEntry(line []byte) (e Entry, err error) {
// Split into two columns.
i := bytes.IndexByte(line, '\t')
if i == -1 || i == 0 {
return e, errColumns
return errColumns
}
c1 := line[:i]
c2 := line[i+1:]
@ -103,48 +95,15 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) {
c2 = c2[:i]
}
if len(c2) == 0 {
return e, errColumns
return errColumns
}
// Parse columns.
switch len(c1) {
case 4:
if string(c1[0:2]) != "0x" {
return e, errBadType
}
if _, err := hex.Decode(r.encoded[:], c1[2:]); err != nil {
return e, err
}
e.Type = EntryOneByte
e.Encoded = r.encoded[0:1]
case 6:
if string(c1[0:2]) != "0x" {
return e, errBadType
}
if _, err := hex.Decode(r.encoded[:], c1[2:]); err != nil {
return e, err
}
e.Type = EntryTwoByte
e.Encoded = r.encoded[0:2]
case 9:
if string(c1[0:2]) != "0x" || string(c1[4:7]) != "+0x" {
return e, errBadType
}
if _, err := hex.Decode(r.encoded[0:1], c1[2:4]); err != nil {
return e, err
}
if _, err := hex.Decode(r.encoded[1:2], c1[7:9]); err != nil {
return e, err
}
e.Type = EntryDigraph
e.Encoded = r.encoded[0:2]
default:
return e, errBadType
}
// Parse Unicode sequence and context.
var e Entry
if c2[0] == '<' {
i := bytes.IndexByte(c2, '+')
if i == -1 {
return e, errors.New("invalid Unicode string")
return errors.New("invalid Unicode string")
}
ctx := c2[:i]
c2 = c2[i+1:]
@ -154,10 +113,11 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) {
case "<RL>":
e.Direction = DirectionRL
default:
return e, fmt.Errorf("unknown context: %q", ctx)
return fmt.Errorf("unknown context: %q", ctx)
}
}
r.unicode.Reset()
var ubuf [8]rune
var ulen int
for c2 != nil {
var cp []byte
if i := bytes.IndexByte(c2, '+'); i != -1 {
@ -168,44 +128,104 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) {
c2 = nil
}
if len(cp) < 2 || string(cp[0:2]) != "0x" {
return e, errUnicode
return errUnicode
}
cp = cp[2:]
x, err := strconv.ParseUint(string(cp), 16, 32)
if err != nil {
return e, err
return err
}
if x > unicode.MaxRune {
return e, errCodePointRange
return errCodePointRange
}
r.unicode.WriteRune(rune(x))
if ulen >= len(ubuf) {
return errors.New("Unicode sequence too long")
}
ubuf[ulen] = rune(x)
ulen++
}
e.Unicode = r.unicode.Bytes()
return e, nil
e.Unicode = make([]rune, ulen)
copy(e.Unicode, ubuf[:])
// Parse platform encoded value, store value there.
switch len(c1) {
case 4:
if string(c1[0:2]) != "0x" {
return errBadType
}
var k [1]byte
if _, err := hex.Decode(k[:], c1[2:]); err != nil {
return err
}
ch := k[0]
if m.OneByte == nil {
m.OneByte = make(map[byte]Entry)
}
if _, ok := m.OneByte[ch]; ok {
return errDuplicate
}
m.OneByte[ch] = e
case 6:
if string(c1[0:2]) != "0x" {
return errBadType
}
var k [2]byte
if _, err := hex.Decode(k[:], c1[2:]); err != nil {
return err
}
if m.TwoByte == nil {
m.TwoByte = make(map[[2]byte]Entry)
}
if _, ok := m.TwoByte[k]; ok {
return errDuplicate
}
m.TwoByte[k] = e
case 9:
if string(c1[0:2]) != "0x" || string(c1[4:7]) != "+0x" {
return errBadType
}
var k [2]byte
if _, err := hex.Decode(k[0:1], c1[2:4]); err != nil {
return err
}
if _, err := hex.Decode(k[1:2], c1[7:9]); err != nil {
return err
}
if m.Digraph == nil {
m.Digraph = make(map[[2]byte]Entry)
}
if _, ok := m.Digraph[k]; ok {
return errDuplicate
}
m.Digraph[k] = e
default:
return errBadType
}
return nil
}
// Next returns the next entry in the character map. Returns io.EOF if there are
// no more entries.
func (r *Reader) Next() (e Entry, err error) {
for {
if !r.scanner.Scan() {
if err := r.scanner.Err(); err != nil {
return e, err
}
return e, io.EOF
}
r.lineno++
line := r.scanner.Bytes()
// Remove comment.
if i := bytes.IndexByte(line, '#'); i != -1 {
line = line[:i]
}
if len(line) != 0 {
e, err = r.parseEntry(line)
if err != nil {
err = &Error{r.name, r.lineno, err}
}
return
// Read reads a charmap from a stream.
func Read(r io.Reader, name string) (*Charmap, error) {
sc := bufio.NewScanner(r)
var m Charmap
for lineno := 1; sc.Scan(); lineno++ {
if err := m.parseLine(sc.Bytes()); err != nil {
return nil, &Error{name, lineno, err}
}
}
if err := sc.Err(); err != nil {
return nil, err
}
return &m, nil
}
// ReadFile reads a charmap from a file on disk.
func ReadFile(name string) (*Charmap, error) {
fp, err := os.Open(name)
if err != nil {
return nil, err
}
defer fp.Close()
return Read(fp, name)
}
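For illustration, a minimal sketch of driving the new Read function. The two mapping lines are made up for this example; they only demonstrate the tab-separated `0x.. <TAB> 0x....` form that parseLine accepts.

```go
package main

import (
	"fmt"
	"strings"

	"moria.us/macscript/charmap"
)

func main() {
	// Two made-up mapping lines: one ASCII character, one high character.
	const src = "0x41\t0x0041\n" +
		"0x8E\t0x00E9\n"
	m, err := charmap.Read(strings.NewReader(src), "example.TXT")
	if err != nil {
		panic(err)
	}
	// Each one-byte encoded value now has an Entry holding its Unicode runes.
	e := m.OneByte[0x8E]
	fmt.Printf("0x8E -> %q\n", string(e.Unicode)) // prints: 0x8E -> "é"
}
```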

View File

@ -1,24 +1,20 @@
package main
import (
"errors"
"flag"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"moria.us/macscript/charmap"
"moria.us/macscript/table"
)
const header = "/* This file is automatically generated. */\n"
var (
flagDumpSequences bool
flagDumpTransitions bool
)
func init() {
flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")
flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
}
func getSrcdir() (string, error) {
exe, err := os.Executable()
if err != nil {
@ -35,15 +31,53 @@ func mainE() error {
if err := os.Chdir(srcdir); err != nil {
return err
}
// Read metadata.
d, err := readData()
if err != nil {
return err
}
// Compile and emit charmap data.
cms := make([]string, len(d.charmaps))
var hascmap bool
for i, c := range d.charmaps {
if c.file == "" {
continue
}
cm, err := charmap.ReadFile(filepath.Join("charmap", c.file))
if err != nil {
return err
}
t, err := table.Create(cm)
if err != nil {
if e, ok := err.(*table.UnsupportedError); ok {
fmt.Fprintf(os.Stderr, "Warning: unsupported charmap %q: %s\n", c.file, e.Message)
continue
}
fmt.Fprintf(os.Stderr, "Error: %s: %v\n", c.file, err)
os.Exit(1)
}
data := t.Data()
name := "charmap_" + strings.ToLower(strings.TrimSuffix(c.file, ".TXT")) + ".dat"
fpath := filepath.Join("src", name)
fmt.Fprintln(os.Stderr, "Writing:", fpath)
if err := ioutil.WriteFile(fpath, data, 0666); err != nil {
return err
}
cms[i] = name
hascmap = true
}
if !hascmap {
return errors.New("could not compile any character map")
}
// Write generated output.
m := genMap(&d)
if err := writeMap(&d, m, "src/getcharmap.c"); err != nil {
return err
}
if err := writeRez(&d, "src/charmaps.r"); err != nil {
if err := writeRez(&d, cms, "src/charmaps.r"); err != nil {
return err
}
return nil
@ -51,6 +85,10 @@ func mainE() error {
func main() {
flag.Parse()
if args := flag.Args(); len(args) != 0 {
fmt.Fprintf(os.Stderr, "Error: unexpected argument: %q\n", args[0])
os.Exit(2)
}
if err := mainE(); err != nil {
fmt.Fprintln(os.Stderr, "Error:", err)
os.Exit(1)

View File

@ -41,7 +41,7 @@ func constStrings(c *constmap) []string {
return r
}
func writeRez(d *scriptdata, filename string) error {
func writeRez(d *scriptdata, charmaps []string, filename string) error {
fmt.Fprintln(os.Stderr, "Writing:", filename)
fp, err := os.Create(filename)
if err != nil {
@ -52,9 +52,14 @@ func writeRez(d *scriptdata, filename string) error {
w.WriteString(header)
w.WriteString("#include \"resources.h\"\n")
writeStrings(w, "rSTRS_Charmaps", charmapNames(d))
writeStrings(w, "rSTRS_Scripts", constStrings(&d.scripts))
writeStrings(w, "rSTRS_Regions", constStrings(&d.regions))
writeStrings(w, `rSTRS_Charmaps, "Character Maps"`, charmapNames(d))
writeStrings(w, `rSTRS_Scripts, "Scripts"`, constStrings(&d.scripts))
writeStrings(w, `rSTRS_Regions, "Regions"`, constStrings(&d.regions))
for i, cm := range charmaps {
if cm != "" {
fmt.Fprintf(w, "read 'cmap' (%d, %q) %q;\n", 128+i, d.charmaps[i].name, cm)
}
}
if err := w.Flush(); err != nil {
return err
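With hypothetical inputs (a charmap named "Mac OS Roman" whose compiled table was written to charmap_roman.dat and assigned index 0), the loop above would emit a Rez line like:

```
read 'cmap' (128, "Mac OS Roman") "charmap_roman.dat";
```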

95
gen/table/table.go Normal file
View File

@ -0,0 +1,95 @@
package table
import (
"bytes"
"errors"
"fmt"
"unicode/utf8"
"golang.org/x/text/unicode/norm"
"moria.us/macscript/charmap"
)
// An UnsupportedError indicates that the charmap is not supported by the
// conversion routines.
type UnsupportedError struct {
Message string
}
func (e *UnsupportedError) Error() string {
return "unsupported charmap: " + e.Message
}
// Table type identifiers.
const (
extendedASCIITable = iota + 1
)
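// A Table is a compiled conversion table that can be serialized to the binary
// format described in Formats.md.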
type Table interface {
Data() []byte
}
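// Create builds a conversion table from a charmap. It returns an
// UnsupportedError if the charmap cannot be represented by any of the
// supported table formats.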
func Create(m *charmap.Charmap) (Table, error) {
if m.OneByte == nil {
return nil, errors.New("missing one-byte map")
}
if m.TwoByte != nil {
return nil, &UnsupportedError{"multibyte encoding"}
}
if m.Digraph != nil {
return nil, &UnsupportedError{"contains digraphs"}
}
var t ExtendedASCII
for c, e := range m.OneByte {
if e.Direction != charmap.DirectionAny {
return nil, &UnsupportedError{
fmt.Sprintf("character has bidirectional context: 0x%02x", c)}
}
var u rune
switch len(e.Unicode) {
case 0:
case 1:
u = e.Unicode[0]
default:
return nil, &UnsupportedError{
fmt.Sprintf("character maps to multiple code points: 0x%02x", c)}
}
if c < 128 {
if u != rune(c) {
return nil, &UnsupportedError{
fmt.Sprintf("character is not equal to ASCII equivalent: 0x%02x", c)}
}
} else {
t.HighCharacters[c-128] = u
}
}
return &t, nil
}
// An ExtendedASCII is a table for converting from extended ASCII.
type ExtendedASCII struct {
HighCharacters [128]rune
}
func (t *ExtendedASCII) Data() []byte {
var ubuf [4]byte
var nbuf [16]byte
d := []byte{extendedASCIITable}
for _, u := range t.HighCharacters {
var udata, ndata []byte
if u != 0 {
n := utf8.EncodeRune(ubuf[:], u)
udata = ubuf[:n]
ndata = norm.NFD.Append(nbuf[:0], udata...)
if bytes.Equal(udata, ndata) {
ndata = nil
}
}
d = append(d, byte(len(udata)))
d = append(d, udata...)
d = append(d, byte(len(ndata)))
d = append(d, ndata...)
}
return d
}
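As a quick sanity check of the encoding (a minimal sketch; the mapping of 0x8E to é is borrowed from Mac OS Roman and is only an example), the serialized bytes can be produced directly from an ExtendedASCII table:

```go
package main

import (
	"fmt"

	"moria.us/macscript/table"
)

func main() {
	// Map only encoded value 0x8E to U+00E9 (é); leave everything else unmapped.
	var t table.ExtendedASCII
	t.HighCharacters[0x8E-0x80] = 'é'
	data := t.Data()
	fmt.Printf("format byte: %d\n", data[0]) // 1 (extended ASCII)
	// The 14 entries for 0x80-0x8D are unmapped ("00 00" each); the entry
	// for 0x8E follows at offset 29: 02 c3 a9 03 65 cc 81.
	fmt.Printf("entry for 0x8E: % x\n", data[29:36])
}
```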

1
src/.gitignore vendored
View File

@ -1,2 +1,3 @@
/getcharmap.c
/charmaps.r
/*.dat