mirror of
https://github.com/depp/syncfiles.git
synced 2025-01-03 05:29:42 +00:00
Generate conversion data for extended ASCII
This creates conversion tables for 11 of the 21 character sets.
This commit is contained in:
parent
f49e81388c
commit
f94a65d245
16
Formats.md
Normal file
16
Formats.md
Normal file
@ -0,0 +1,16 @@
|
||||
# Table Formats
|
||||
|
||||
Each character map table starts with a single nonzero byte, indicating the table's format.
|
||||
|
||||
## Extended ASCII
|
||||
|
||||
Format 1 is for "extended ASCII". Encoded values 0-127 are identical to ASCII, and encoded values 128-255 are mapped to single Unicode characters.
|
||||
|
||||
The table contains 128 entries, for encoded values 128-255, with the following format:
|
||||
|
||||
u8 Length of Unicode character
|
||||
u8[] Unicode character, UTF-8
|
||||
u8 Length of normalized Unicode character, may be zero
|
||||
u8[] Unicode character in NFD normal form, UTF-8
|
||||
|
||||
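The second (NFD) copy of the character is only present if its NFD form differs from the character itself, for example when the character decomposes into multiple characters; otherwise the second length byte is zero and no bytes follow it. Encoded values with no Unicode mapping are written with both lengths set to zero.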
|
@ -8,6 +8,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strconv"
|
||||
"unicode"
|
||||
)
|
||||
@ -38,19 +39,7 @@ var (
|
||||
errColumns = errors.New("expected 2 columns")
|
||||
errUnicode = errors.New("invalid unicode sequence")
|
||||
errCodePointRange = errors.New("code point out of range")
|
||||
)
|
||||
|
||||
// An EntryType is a type of entry in a character mapping.
|
||||
type EntryType uint32
|
||||
|
||||
const (
|
||||
// EntryOneByte is an entry where a one-byte character is mapped to Unicode.
|
||||
EntryOneByte EntryType = iota
|
||||
// EntryTwoByte is an entry where a two-byte character is mapped to Unicode.
|
||||
EntryTwoByte
|
||||
// EntryDigraph is an entry where two one-byte characters are mapped to
|
||||
// Unicode.
|
||||
EntryDigraph
|
||||
errDuplicate = errors.New("duplicate entry")
|
||||
)
|
||||
|
||||
// A Direction is a direction context for Unicode characters.
|
||||
@ -68,34 +57,37 @@ const (
|
||||
|
||||
// An Entry is a single entry in a character map.
|
||||
type Entry struct {
|
||||
Type EntryType
|
||||
Encoded []byte
|
||||
Direction Direction
|
||||
Unicode []byte
|
||||
Unicode []rune
|
||||
}
|
||||
|
||||
// A Reader is a character map reader.
|
||||
type Reader struct {
|
||||
name string
|
||||
lineno int
|
||||
scanner *bufio.Scanner
|
||||
encoded [2]byte
|
||||
unicode bytes.Buffer
|
||||
// A Charmap is a character map, mapping characters from a platform encoding to
|
||||
// Unicode.
|
||||
type Charmap struct {
|
||||
// Pairs of valid one-byte characters that have an alternate Unicode
|
||||
// representation.
|
||||
Digraph map[[2]byte]Entry
|
||||
|
||||
// Valid single-byte characters.
|
||||
OneByte map[byte]Entry
|
||||
|
||||
// Valid two-byte characters.
|
||||
TwoByte map[[2]byte]Entry
|
||||
}
|
||||
|
||||
// NewReader creates a character map reader.
|
||||
func NewReader(r io.Reader, name string) *Reader {
|
||||
return &Reader{
|
||||
name: name,
|
||||
scanner: bufio.NewScanner(r),
|
||||
func (m *Charmap) parseLine(line []byte) error {
|
||||
// Remove comment.
|
||||
if i := bytes.IndexByte(line, '#'); i != -1 {
|
||||
line = line[:i]
|
||||
}
|
||||
if len(line) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Reader) parseEntry(line []byte) (e Entry, err error) {
|
||||
// Split into two columns.
|
||||
i := bytes.IndexByte(line, '\t')
|
||||
if i == -1 || i == 0 {
|
||||
return e, errColumns
|
||||
return errColumns
|
||||
}
|
||||
c1 := line[:i]
|
||||
c2 := line[i+1:]
|
||||
@ -103,48 +95,15 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) {
|
||||
c2 = c2[:i]
|
||||
}
|
||||
if len(c2) == 0 {
|
||||
return e, errColumns
|
||||
return errColumns
|
||||
}
|
||||
|
||||
// Parse columns.
|
||||
switch len(c1) {
|
||||
case 4:
|
||||
if string(c1[0:2]) != "0x" {
|
||||
return e, errBadType
|
||||
}
|
||||
if _, err := hex.Decode(r.encoded[:], c1[2:]); err != nil {
|
||||
return e, err
|
||||
}
|
||||
e.Type = EntryOneByte
|
||||
e.Encoded = r.encoded[0:1]
|
||||
case 6:
|
||||
if string(c1[0:2]) != "0x" {
|
||||
return e, errBadType
|
||||
}
|
||||
if _, err := hex.Decode(r.encoded[:], c1[2:]); err != nil {
|
||||
return e, err
|
||||
}
|
||||
e.Type = EntryTwoByte
|
||||
e.Encoded = r.encoded[0:2]
|
||||
case 9:
|
||||
if string(c1[0:2]) != "0x" || string(c1[4:7]) != "+0x" {
|
||||
return e, errBadType
|
||||
}
|
||||
if _, err := hex.Decode(r.encoded[0:1], c1[2:4]); err != nil {
|
||||
return e, err
|
||||
}
|
||||
if _, err := hex.Decode(r.encoded[1:2], c1[7:9]); err != nil {
|
||||
return e, err
|
||||
}
|
||||
e.Type = EntryDigraph
|
||||
e.Encoded = r.encoded[0:2]
|
||||
default:
|
||||
return e, errBadType
|
||||
}
|
||||
// Parse Unicode sequence and context.
|
||||
var e Entry
|
||||
if c2[0] == '<' {
|
||||
i := bytes.IndexByte(c2, '+')
|
||||
if i == -1 {
|
||||
return e, errors.New("invalid Unicode string")
|
||||
return errors.New("invalid Unicode string")
|
||||
}
|
||||
ctx := c2[:i]
|
||||
c2 = c2[i+1:]
|
||||
@ -154,10 +113,11 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) {
|
||||
case "<RL>":
|
||||
e.Direction = DirectionRL
|
||||
default:
|
||||
return e, fmt.Errorf("unknown context: %q", ctx)
|
||||
return fmt.Errorf("unknown context: %q", ctx)
|
||||
}
|
||||
}
|
||||
r.unicode.Reset()
|
||||
var ubuf [8]rune
|
||||
var ulen int
|
||||
for c2 != nil {
|
||||
var cp []byte
|
||||
if i := bytes.IndexByte(c2, '+'); i != -1 {
|
||||
@ -168,44 +128,104 @@ func (r *Reader) parseEntry(line []byte) (e Entry, err error) {
|
||||
c2 = nil
|
||||
}
|
||||
if len(cp) < 2 || string(cp[0:2]) != "0x" {
|
||||
return e, errUnicode
|
||||
return errUnicode
|
||||
}
|
||||
cp = cp[2:]
|
||||
x, err := strconv.ParseUint(string(cp), 16, 32)
|
||||
if err != nil {
|
||||
return e, err
|
||||
return err
|
||||
}
|
||||
if x > unicode.MaxRune {
|
||||
return e, errCodePointRange
|
||||
return errCodePointRange
|
||||
}
|
||||
r.unicode.WriteRune(rune(x))
|
||||
if ulen >= len(ubuf) {
|
||||
return errors.New("Unicode sequence too long")
|
||||
}
|
||||
e.Unicode = r.unicode.Bytes()
|
||||
return e, nil
|
||||
ubuf[ulen] = rune(x)
|
||||
ulen++
|
||||
}
|
||||
e.Unicode = make([]rune, ulen)
|
||||
copy(e.Unicode, ubuf[:])
|
||||
|
||||
// Parse platform encoded value, store value there.
|
||||
switch len(c1) {
|
||||
case 4:
|
||||
if string(c1[0:2]) != "0x" {
|
||||
return errBadType
|
||||
}
|
||||
var k [1]byte
|
||||
if _, err := hex.Decode(k[:], c1[2:]); err != nil {
|
||||
return err
|
||||
}
|
||||
ch := k[0]
|
||||
if m.OneByte == nil {
|
||||
m.OneByte = make(map[byte]Entry)
|
||||
}
|
||||
if _, ok := m.OneByte[ch]; ok {
|
||||
return errDuplicate
|
||||
}
|
||||
m.OneByte[ch] = e
|
||||
case 6:
|
||||
if string(c1[0:2]) != "0x" {
|
||||
return errBadType
|
||||
}
|
||||
var k [2]byte
|
||||
if _, err := hex.Decode(k[:], c1[2:]); err != nil {
|
||||
return err
|
||||
}
|
||||
if m.TwoByte == nil {
|
||||
m.TwoByte = make(map[[2]byte]Entry)
|
||||
}
|
||||
if _, ok := m.TwoByte[k]; ok {
|
||||
return errDuplicate
|
||||
}
|
||||
m.TwoByte[k] = e
|
||||
case 9:
|
||||
if string(c1[0:2]) != "0x" || string(c1[4:7]) != "+0x" {
|
||||
return errBadType
|
||||
}
|
||||
var k [2]byte
|
||||
if _, err := hex.Decode(k[0:1], c1[2:4]); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := hex.Decode(k[1:2], c1[7:9]); err != nil {
|
||||
return err
|
||||
}
|
||||
if m.Digraph == nil {
|
||||
m.Digraph = make(map[[2]byte]Entry)
|
||||
}
|
||||
if _, ok := m.Digraph[k]; ok {
|
||||
return errDuplicate
|
||||
}
|
||||
m.Digraph[k] = e
|
||||
default:
|
||||
return errBadType
|
||||
}
|
||||
|
||||
// Next returns the next entry in the character map. Returns io.EOF if there are
|
||||
// no more entries.
|
||||
func (r *Reader) Next() (e Entry, err error) {
|
||||
for {
|
||||
if !r.scanner.Scan() {
|
||||
if err := r.scanner.Err(); err != nil {
|
||||
return e, err
|
||||
return nil
|
||||
}
|
||||
return e, io.EOF
|
||||
|
||||
// Read reads a charmap from a stream.
|
||||
func Read(r io.Reader, name string) (*Charmap, error) {
|
||||
sc := bufio.NewScanner(r)
|
||||
var m Charmap
|
||||
for lineno := 1; sc.Scan(); lineno++ {
|
||||
if err := m.parseLine(sc.Bytes()); err != nil {
|
||||
return nil, &Error{name, lineno, err}
|
||||
}
|
||||
r.lineno++
|
||||
line := r.scanner.Bytes()
|
||||
// Remove comment.
|
||||
if i := bytes.IndexByte(line, '#'); i != -1 {
|
||||
line = line[:i]
|
||||
}
|
||||
if len(line) != 0 {
|
||||
e, err = r.parseEntry(line)
|
||||
if err := sc.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &m, nil
|
||||
}
|
||||
|
||||
// ReadFile reads a charmap from a file on disk.
|
||||
func ReadFile(name string) (*Charmap, error) {
|
||||
fp, err := os.Open(name)
|
||||
if err != nil {
|
||||
err = &Error{r.name, r.lineno, err}
|
||||
}
|
||||
return
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
defer fp.Close()
|
||||
return Read(fp, name)
|
||||
}
|
||||
|
60
gen/main.go
60
gen/main.go
@ -1,24 +1,20 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"moria.us/macscript/charmap"
|
||||
"moria.us/macscript/table"
|
||||
)
|
||||
|
||||
const header = "/* This file is automatically generated. */\n"
|
||||
|
||||
var (
|
||||
flagDumpSequences bool
|
||||
flagDumpTransitions bool
|
||||
)
|
||||
|
||||
func init() {
|
||||
flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")
|
||||
flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
|
||||
}
|
||||
|
||||
func getSrcdir() (string, error) {
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
@ -35,15 +31,53 @@ func mainE() error {
|
||||
if err := os.Chdir(srcdir); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Read metadata.
|
||||
d, err := readData()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Compile and emit charmap data.
|
||||
cms := make([]string, len(d.charmaps))
|
||||
var hascmap bool
|
||||
for i, c := range d.charmaps {
|
||||
if c.file == "" {
|
||||
continue
|
||||
}
|
||||
cm, err := charmap.ReadFile(filepath.Join("charmap", c.file))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
t, err := table.Create(cm)
|
||||
if err != nil {
|
||||
if e, ok := err.(*table.UnsupportedError); ok {
|
||||
fmt.Fprintf(os.Stderr, "Warning: unsupported charmap %q: %s\n", c.file, e.Message)
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "Error: %s: %v", c.file, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
data := t.Data()
|
||||
name := "charmap_" + strings.ToLower(strings.TrimSuffix(c.file, ".TXT")) + ".dat"
|
||||
fpath := filepath.Join("src", name)
|
||||
fmt.Fprintln(os.Stderr, "Writing:", fpath)
|
||||
if err := ioutil.WriteFile(fpath, data, 0666); err != nil {
|
||||
return err
|
||||
}
|
||||
cms[i] = name
|
||||
hascmap = true
|
||||
}
|
||||
if !hascmap {
|
||||
return errors.New("could not compile any character map")
|
||||
}
|
||||
|
||||
// Write generated output.
|
||||
m := genMap(&d)
|
||||
if err := writeMap(&d, m, "src/getcharmap.c"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeRez(&d, "src/charmaps.r"); err != nil {
|
||||
if err := writeRez(&d, cms, "src/charmaps.r"); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
@ -51,6 +85,10 @@ func mainE() error {
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
if args := flag.Args(); len(args) != 0 {
|
||||
fmt.Fprintf(os.Stderr, "Error: unexpected argument: %q\n", args[0])
|
||||
os.Exit(2)
|
||||
}
|
||||
if err := mainE(); err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Error:", err)
|
||||
os.Exit(1)
|
||||
|
13
gen/rez.go
13
gen/rez.go
@ -41,7 +41,7 @@ func constStrings(c *constmap) []string {
|
||||
return r
|
||||
}
|
||||
|
||||
func writeRez(d *scriptdata, filename string) error {
|
||||
func writeRez(d *scriptdata, charmaps []string, filename string) error {
|
||||
fmt.Fprintln(os.Stderr, "Writing:", filename)
|
||||
fp, err := os.Create(filename)
|
||||
if err != nil {
|
||||
@ -52,9 +52,14 @@ func writeRez(d *scriptdata, filename string) error {
|
||||
|
||||
w.WriteString(header)
|
||||
w.WriteString("#include \"resources.h\"\n")
|
||||
writeStrings(w, "rSTRS_Charmaps", charmapNames(d))
|
||||
writeStrings(w, "rSTRS_Scripts", constStrings(&d.scripts))
|
||||
writeStrings(w, "rSTRS_Regions", constStrings(&d.regions))
|
||||
writeStrings(w, `rSTRS_Charmaps, "Character Maps"`, charmapNames(d))
|
||||
writeStrings(w, `rSTRS_Scripts, "Scripts"`, constStrings(&d.scripts))
|
||||
writeStrings(w, `rSTRS_Regions, "Regions"`, constStrings(&d.regions))
|
||||
for i, cm := range charmaps {
|
||||
if cm != "" {
|
||||
fmt.Fprintf(w, "read 'cmap' (%d, %q) %q;\n", 128+i, d.charmaps[i].name, cm)
|
||||
}
|
||||
}
|
||||
|
||||
if err := w.Flush(); err != nil {
|
||||
return err
|
||||
|
95
gen/table/table.go
Normal file
95
gen/table/table.go
Normal file
@ -0,0 +1,95 @@
|
||||
package table
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/unicode/norm"
|
||||
|
||||
"moria.us/macscript/charmap"
|
||||
)
|
||||
|
||||
// An UnsupportedError reports that a charmap uses a feature which the
// conversion routines cannot handle.
type UnsupportedError struct {
	// Message describes the unsupported feature.
	Message string
}

// Error implements the error interface. The result is Message with a fixed
// "unsupported charmap: " prefix.
func (e *UnsupportedError) Error() string {
	const prefix = "unsupported charmap: "
	return prefix + e.Message
}
|
||||
|
||||
// Table format identifiers. An identifier is emitted as the first byte of a
// serialized table. Numbering starts at 1, which keeps the format byte
// nonzero.
const (
	// extendedASCIITable identifies the extended ASCII table format.
	extendedASCIITable = 1 + iota
)
|
||||
|
||||
// A Table is a compiled character map that can be serialized to bytes.
type Table interface {
	// Data returns the serialized form of the table.
	Data() []byte
}
|
||||
|
||||
func Create(m *charmap.Charmap) (Table, error) {
|
||||
if m.OneByte == nil {
|
||||
return nil, errors.New("missing one-byte map")
|
||||
}
|
||||
if m.TwoByte != nil {
|
||||
return nil, &UnsupportedError{"multibyte encoding"}
|
||||
}
|
||||
if m.Digraph != nil {
|
||||
return nil, &UnsupportedError{"contains digraphs"}
|
||||
}
|
||||
var t ExtendedASCII
|
||||
for c, e := range m.OneByte {
|
||||
if e.Direction != charmap.DirectionAny {
|
||||
return nil, &UnsupportedError{
|
||||
fmt.Sprintf("character has bidirectional context: 0x%02x", c)}
|
||||
}
|
||||
var u rune
|
||||
switch len(e.Unicode) {
|
||||
case 0:
|
||||
case 1:
|
||||
u = e.Unicode[0]
|
||||
default:
|
||||
return nil, &UnsupportedError{
|
||||
fmt.Sprintf("character maps to multiple code points: 0x%02x", c)}
|
||||
}
|
||||
if c < 128 {
|
||||
if u != rune(c) {
|
||||
return nil, &UnsupportedError{
|
||||
fmt.Sprintf("character is not equal to ASCII equivalent: 0x%02x", c)}
|
||||
}
|
||||
} else {
|
||||
t.HighCharacters[c-128] = u
|
||||
}
|
||||
}
|
||||
return &t, nil
|
||||
}
|
||||
|
||||
// An ExtendedASCII is a conversion table for an extended ASCII encoding, in
// which only encoded values 128-255 need table entries.
type ExtendedASCII struct {
	// HighCharacters[i] is the Unicode code point for encoded value 128+i.
	// A zero entry is serialized as an empty character.
	HighCharacters [128]rune
}
|
||||
|
||||
func (t *ExtendedASCII) Data() []byte {
|
||||
var ubuf [4]byte
|
||||
var nbuf [16]byte
|
||||
d := []byte{extendedASCIITable}
|
||||
for _, u := range t.HighCharacters {
|
||||
var udata, ndata []byte
|
||||
if u != 0 {
|
||||
n := utf8.EncodeRune(ubuf[:], u)
|
||||
udata = ubuf[:n]
|
||||
ndata = norm.NFD.Append(nbuf[:0], udata...)
|
||||
if bytes.Equal(udata, ndata) {
|
||||
ndata = nil
|
||||
}
|
||||
}
|
||||
d = append(d, byte(len(udata)))
|
||||
d = append(d, udata...)
|
||||
d = append(d, byte(len(ndata)))
|
||||
d = append(d, ndata...)
|
||||
}
|
||||
return d
|
||||
}
|
1
src/.gitignore
vendored
1
src/.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
/getcharmap.c
|
||||
/charmaps.r
|
||||
/*.dat
|
||||
|
Loading…
Reference in New Issue
Block a user