Add charmap parser

This commit is contained in:
Dietrich Epp 2021-12-14 23:48:47 -05:00
parent d2401b963a
commit faa2460468

211
gen/charmap/charmap.go Normal file
View File

@ -0,0 +1,211 @@
// Package charmap provides a way to read character maps.
package charmap
import (
"bufio"
"bytes"
"encoding/hex"
"errors"
"fmt"
"io"
"strconv"
"unicode"
)
// An Error indicates an error parsing a character mapping file.
//
// It records where the problem was found and wraps the underlying cause,
// which is available through Unwrap and therefore through errors.Is and
// errors.As.
type Error struct {
	// File is the name of the file being parsed. It is left out of the
	// message when empty.
	File string
	// Line is the line number at which the error occurred. It is left out of
	// the message when zero.
	Line int
	// Err is the underlying error. It must be non-nil.
	Err error
}

// Error formats the error as "file:line: message", dropping whichever of the
// file and line parts are unset. With neither set, the result is just the
// message of the underlying error.
func (e *Error) Error() string {
	var prefix string
	if e.File != "" {
		prefix += e.File + ":"
	}
	if e.Line != 0 {
		prefix += strconv.Itoa(e.Line) + ":"
	}
	msg := e.Err.Error()
	if prefix == "" {
		return msg
	}
	return prefix + " " + msg
}

// Unwrap returns the underlying error, so that errors.Is and errors.As can
// match the cause of a parse failure through the position information.
func (e *Error) Unwrap() error {
	return e.Err
}
// Sentinel errors produced while parsing a single line. parseEntry returns
// them as-is; Next wraps them in an *Error carrying the file name and line
// number.
var (
// errBadType is returned when the first column is not one of the recognized
// forms: "0xXX", "0xXXXX", or "0xXX+0xXX".
errBadType = errors.New("unknown entry type")
// errColumns is returned when a line has no tab separator or when either of
// the first two columns is empty.
errColumns = errors.New("expected 2 columns")
// errUnicode is returned when an element of the Unicode column does not start
// with "0x".
errUnicode = errors.New("invalid unicode sequence")
// errCodePointRange is returned when a parsed code point lies outside the
// range the parser accepts.
errCodePointRange = errors.New("code point out of range")
)
// An EntryType is a type of entry in a character mapping. It describes how the
// Encoded bytes of an Entry are to be interpreted.
type EntryType uint32
const (
// EntryOneByte is an entry where a one-byte character is mapped to Unicode.
// It is written "0xXX" in the first column and is the zero value of
// EntryType.
EntryOneByte EntryType = iota
// EntryTwoByte is an entry where a two-byte character is mapped to Unicode.
// It is written "0xXXXX" in the first column.
EntryTwoByte
// EntryDigraph is an entry where two one-byte characters are mapped to
// Unicode. It is written "0xXX+0xXX" in the first column.
EntryDigraph
)
// A Direction is a direction context for Unicode characters. It is taken from
// an optional "<LR>+" or "<RL>+" prefix on the Unicode column of an entry.
type Direction uint32
const (
// DirectionAny indicates that the entry places no requirement on the
// direction context. It is the zero value, which the parser leaves in place
// when the Unicode column has no direction prefix.
DirectionAny Direction = iota
// DirectionLR indicates that the character requires left-to-right context.
// It is written as the prefix "<LR>".
DirectionLR
// DirectionRL indicates that the character requires right-to-left context.
// It is written as the prefix "<RL>".
DirectionRL
)
// An Entry is a single entry in a character map.
//
// For entries produced by a Reader, the Encoded and Unicode slices point into
// storage owned by that Reader, which is reused when later lines are parsed.
// Copy them if they must outlive the next read.
type Entry struct {
// Type tells how Encoded is to be interpreted.
Type EntryType
// Encoded holds the encoded bytes: one byte for EntryOneByte, two bytes for
// EntryTwoByte and EntryDigraph.
Encoded []byte
// Direction is the direction context required by the entry, or DirectionAny
// if the entry specifies none.
Direction Direction
// Unicode is the UTF-8 encoding of the sequence of code points the entry
// maps to.
Unicode []byte
}
// A Reader is a character map reader. It reads its input line by line and
// yields one Entry per non-empty line through Next. Create one with NewReader.
type Reader struct {
// name is the file name reported in errors.
name string
// lineno is the number of lines scanned so far, so it is the 1-based number
// of the line currently being parsed.
lineno int
// scanner splits the input into lines.
scanner *bufio.Scanner
// encoded is scratch storage backing Entry.Encoded. It is overwritten each
// time an entry is parsed.
encoded [2]byte
// unicode is a scratch buffer backing Entry.Unicode. It is reset each time
// an entry is parsed.
unicode bytes.Buffer
}
// NewReader creates a character map reader that reads lines from r. The name
// is used only to label errors, typically with the name of the file being
// read, and may be empty.
func NewReader(r io.Reader, name string) *Reader {
	rd := new(Reader)
	rd.name = name
	rd.scanner = bufio.NewScanner(r)
	return rd
}
// parseEntry parses one non-empty line, with any comment already removed, into
// an Entry.
//
// The line must hold at least two tab-separated columns; columns after the
// second are ignored. The first column is the encoded form: "0xXX" (one
// byte), "0xXXXX" (two bytes), or "0xXX+0xXX" (digraph). The second column is
// an optional "<LR>+" or "<RL>+" direction context followed by one or more
// "0x"-prefixed hexadecimal code points joined with "+".
//
// The Encoded and Unicode slices of the returned Entry alias buffers owned by
// the Reader and are overwritten by the next call. On failure the zero Entry
// is returned together with an error that carries no position information.
func (r *Reader) parseEntry(line []byte) (Entry, error) {
	// Split into two columns; the first must be non-empty.
	tab := bytes.IndexByte(line, '\t')
	if tab <= 0 {
		return Entry{}, errColumns
	}
	encoded, mapped := line[:tab], line[tab+1:]
	// Drop any columns after the second.
	if end := bytes.IndexByte(mapped, '\t'); end != -1 {
		mapped = mapped[:end]
	}
	if len(mapped) == 0 {
		return Entry{}, errColumns
	}

	var (
		e   Entry
		err error
	)
	if e.Type, e.Encoded, err = r.parseEncoded(encoded); err != nil {
		return Entry{}, err
	}

	// An optional direction context precedes the code points.
	if mapped[0] == '<' {
		plus := bytes.IndexByte(mapped, '+')
		if plus == -1 {
			return Entry{}, errors.New("invalid Unicode string")
		}
		ctx := mapped[:plus]
		mapped = mapped[plus+1:]
		switch string(ctx) {
		case "<LR>":
			e.Direction = DirectionLR
		case "<RL>":
			e.Direction = DirectionRL
		default:
			return Entry{}, fmt.Errorf("unknown context: %q", ctx)
		}
	}

	if e.Unicode, err = r.parseUnicode(mapped); err != nil {
		return Entry{}, err
	}
	return e, nil
}

// parseEncoded decodes the first column of an entry into r.encoded and returns
// the entry type along with the slice of r.encoded that holds the result.
func (r *Reader) parseEncoded(col []byte) (EntryType, []byte, error) {
	switch len(col) {
	case 4, 6:
		// "0xXX" or "0xXXXX": one or two bytes written as a single number.
		if string(col[:2]) != "0x" {
			return 0, nil, errBadType
		}
		if _, err := hex.Decode(r.encoded[:], col[2:]); err != nil {
			return 0, nil, err
		}
		if len(col) == 4 {
			return EntryOneByte, r.encoded[:1], nil
		}
		return EntryTwoByte, r.encoded[:2], nil
	case 9:
		// "0xXX+0xXX": two separate one-byte characters.
		if string(col[:2]) != "0x" || string(col[4:7]) != "+0x" {
			return 0, nil, errBadType
		}
		if _, err := hex.Decode(r.encoded[0:1], col[2:4]); err != nil {
			return 0, nil, err
		}
		if _, err := hex.Decode(r.encoded[1:2], col[7:9]); err != nil {
			return 0, nil, err
		}
		return EntryDigraph, r.encoded[:2], nil
	}
	return 0, nil, errBadType
}

// parseUnicode converts a "+"-separated list of "0x"-prefixed hexadecimal code
// points to UTF-8 in r.unicode and returns the buffer's contents.
func (r *Reader) parseUnicode(seq []byte) ([]byte, error) {
	r.unicode.Reset()
	for {
		// Take the next element. An empty element, as produced by a trailing
		// "+", is rejected below because it has no "0x" prefix.
		cp := seq
		plus := bytes.IndexByte(seq, '+')
		if plus != -1 {
			cp = seq[:plus]
			seq = seq[plus+1:]
		}
		if len(cp) < 2 || string(cp[:2]) != "0x" {
			return nil, errUnicode
		}
		x, err := strconv.ParseUint(string(cp[2:]), 16, 32)
		if err != nil {
			return nil, err
		}
		// Surrogate halves are not valid code points. WriteRune would
		// silently replace them with U+FFFD, so reject them here.
		if x > unicode.MaxRune || (x >= 0xD800 && x <= 0xDFFF) {
			return nil, errCodePointRange
		}
		r.unicode.WriteRune(rune(x))
		if plus == -1 {
			return r.unicode.Bytes(), nil
		}
	}
}
// Next returns the next entry in the character map, skipping lines that are
// empty once any "#" comment has been removed. It returns io.EOF when the
// input is exhausted, or the scanner's error if reading failed. A line that
// cannot be parsed yields an *Error carrying the reader's name and the line
// number.
func (r *Reader) Next() (e Entry, err error) {
	for r.scanner.Scan() {
		r.lineno++
		text := r.scanner.Bytes()
		// Everything from the first '#' onward is a comment.
		if hash := bytes.IndexByte(text, '#'); hash >= 0 {
			text = text[:hash]
		}
		if len(text) == 0 {
			continue
		}
		e, err = r.parseEntry(text)
		if err != nil {
			return e, &Error{File: r.name, Line: r.lineno, Err: err}
		}
		return e, nil
	}
	// Scan stopped: tell a read failure apart from a clean end of input.
	if scanErr := r.scanner.Err(); scanErr != nil {
		return Entry{}, scanErr
	}
	return Entry{}, io.EOF
}