2021-12-15 04:48:47 +00:00
|
|
|
// Package charmap reads character map files, which map characters from a
// platform encoding to Unicode.
package charmap
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
|
|
|
"encoding/hex"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
2022-03-15 20:19:56 +00:00
|
|
|
"os"
|
2021-12-15 04:48:47 +00:00
|
|
|
"strconv"
|
|
|
|
"unicode"
|
|
|
|
)
|
|
|
|
|
|
|
|
// An Error indicates an error parsing a character mapping file.
type Error struct {
	// File is the name of the file being parsed. It is left out of the
	// message when empty.
	File string
	// Line is the number of the offending line. It is left out of the message
	// when zero.
	Line int
	// Err is the underlying cause. Error calls Err.Error unconditionally, so
	// it must be non-nil.
	Err error
}

// Error formats the error as "file:line: message", leaving out whichever of
// the file and line parts are unset. With neither set, it returns the
// underlying error's message unchanged.
func (e *Error) Error() string {
	var prefix string
	if e.File != "" {
		prefix += e.File + ":"
	}
	if e.Line != 0 {
		prefix += strconv.Itoa(e.Line) + ":"
	}
	msg := e.Err.Error()
	if prefix == "" {
		return msg
	}
	return prefix + " " + msg
}

// Unwrap returns the underlying error, so that errors.Is and errors.As can
// inspect the cause of a parse failure.
func (e *Error) Unwrap() error {
	return e.Err
}
|
|
|
|
|
|
|
|
// Errors reported while parsing a single line. Read wraps them in an *Error
// that adds the file name and line number.
var (
	// errBadType: the first column is not of the form 0xNN, 0xNNNN or
	// 0xNN+0xNN.
	errBadType = errors.New("unknown entry type")
	// errColumns: the line does not have a non-empty first and second
	// tab-separated column.
	errColumns = errors.New("expected 2 columns")
	// errUnicode: a code point in the second column does not start with "0x".
	errUnicode = errors.New("invalid unicode sequence")
	// errCodePointRange: a code point is greater than unicode.MaxRune.
	errCodePointRange = errors.New("code point out of range")
	// errDuplicate: the same platform key appears more than once in a map.
	errDuplicate = errors.New("duplicate entry")
)
|
|
|
|
|
|
|
|
// A Direction is a direction context for Unicode characters.
type Direction uint32

const (
	// DirectionAny places no requirement on the direction context. It is the
	// zero value, so an entry without a <LR> or <RL> context tag has it.
	DirectionAny Direction = iota
	// DirectionLR indicates that the character requires left-to-right context.
	DirectionLR
	// DirectionRL indicates that the character requires right-to-left context.
	DirectionRL
)
|
|
|
|
|
|
|
|
// An Entry is a single entry in a character map.
|
|
|
|
type Entry struct {
|
|
|
|
Direction Direction
|
2022-03-15 20:19:56 +00:00
|
|
|
Unicode []rune
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
|
2022-03-15 20:19:56 +00:00
|
|
|
// A Charmap is a character map, mapping characters from a platform encoding to
|
|
|
|
// Unicode.
|
|
|
|
type Charmap struct {
|
|
|
|
// Pairs of valid one-byte characters that have an alternate Unicode
|
|
|
|
// representation.
|
|
|
|
Digraph map[[2]byte]Entry
|
|
|
|
|
|
|
|
// Valid single-byte characters.
|
|
|
|
OneByte map[byte]Entry
|
|
|
|
|
|
|
|
// Valid two-byte characters.
|
|
|
|
TwoByte map[[2]byte]Entry
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
|
2022-03-15 20:19:56 +00:00
|
|
|
func (m *Charmap) parseLine(line []byte) error {
|
|
|
|
// Remove comment.
|
|
|
|
if i := bytes.IndexByte(line, '#'); i != -1 {
|
|
|
|
line = line[:i]
|
|
|
|
}
|
|
|
|
if len(line) == 0 {
|
|
|
|
return nil
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Split into two columns.
|
|
|
|
i := bytes.IndexByte(line, '\t')
|
|
|
|
if i == -1 || i == 0 {
|
2022-03-15 20:19:56 +00:00
|
|
|
return errColumns
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
c1 := line[:i]
|
|
|
|
c2 := line[i+1:]
|
|
|
|
if i := bytes.IndexByte(c2, '\t'); i != -1 {
|
|
|
|
c2 = c2[:i]
|
|
|
|
}
|
|
|
|
if len(c2) == 0 {
|
2022-03-15 20:19:56 +00:00
|
|
|
return errColumns
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
|
2022-03-15 20:19:56 +00:00
|
|
|
// Parse Unicode sequence and context.
|
|
|
|
var e Entry
|
2021-12-15 04:48:47 +00:00
|
|
|
if c2[0] == '<' {
|
|
|
|
i := bytes.IndexByte(c2, '+')
|
|
|
|
if i == -1 {
|
2022-03-15 20:19:56 +00:00
|
|
|
return errors.New("invalid Unicode string")
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
ctx := c2[:i]
|
|
|
|
c2 = c2[i+1:]
|
|
|
|
switch string(ctx) {
|
|
|
|
case "<LR>":
|
|
|
|
e.Direction = DirectionLR
|
|
|
|
case "<RL>":
|
|
|
|
e.Direction = DirectionRL
|
|
|
|
default:
|
2022-03-15 20:19:56 +00:00
|
|
|
return fmt.Errorf("unknown context: %q", ctx)
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
}
|
2022-03-15 20:19:56 +00:00
|
|
|
var ubuf [8]rune
|
|
|
|
var ulen int
|
2021-12-15 04:48:47 +00:00
|
|
|
for c2 != nil {
|
|
|
|
var cp []byte
|
|
|
|
if i := bytes.IndexByte(c2, '+'); i != -1 {
|
|
|
|
cp = c2[:i]
|
|
|
|
c2 = c2[i+1:]
|
|
|
|
} else {
|
|
|
|
cp = c2
|
|
|
|
c2 = nil
|
|
|
|
}
|
|
|
|
if len(cp) < 2 || string(cp[0:2]) != "0x" {
|
2022-03-15 20:19:56 +00:00
|
|
|
return errUnicode
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
cp = cp[2:]
|
|
|
|
x, err := strconv.ParseUint(string(cp), 16, 32)
|
|
|
|
if err != nil {
|
2022-03-15 20:19:56 +00:00
|
|
|
return err
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
if x > unicode.MaxRune {
|
2022-03-15 20:19:56 +00:00
|
|
|
return errCodePointRange
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
2022-03-15 20:19:56 +00:00
|
|
|
if ulen >= len(ubuf) {
|
|
|
|
return errors.New("Unicode sequence too long")
|
|
|
|
}
|
|
|
|
ubuf[ulen] = rune(x)
|
|
|
|
ulen++
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
2022-03-15 20:19:56 +00:00
|
|
|
e.Unicode = make([]rune, ulen)
|
|
|
|
copy(e.Unicode, ubuf[:])
|
|
|
|
|
|
|
|
// Parse platform encoded value, store value there.
|
|
|
|
switch len(c1) {
|
|
|
|
case 4:
|
|
|
|
if string(c1[0:2]) != "0x" {
|
|
|
|
return errBadType
|
|
|
|
}
|
|
|
|
var k [1]byte
|
|
|
|
if _, err := hex.Decode(k[:], c1[2:]); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
ch := k[0]
|
|
|
|
if m.OneByte == nil {
|
|
|
|
m.OneByte = make(map[byte]Entry)
|
|
|
|
}
|
|
|
|
if _, ok := m.OneByte[ch]; ok {
|
|
|
|
return errDuplicate
|
|
|
|
}
|
|
|
|
m.OneByte[ch] = e
|
|
|
|
case 6:
|
|
|
|
if string(c1[0:2]) != "0x" {
|
|
|
|
return errBadType
|
|
|
|
}
|
|
|
|
var k [2]byte
|
|
|
|
if _, err := hex.Decode(k[:], c1[2:]); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if m.TwoByte == nil {
|
|
|
|
m.TwoByte = make(map[[2]byte]Entry)
|
|
|
|
}
|
|
|
|
if _, ok := m.TwoByte[k]; ok {
|
|
|
|
return errDuplicate
|
|
|
|
}
|
|
|
|
m.TwoByte[k] = e
|
|
|
|
case 9:
|
|
|
|
if string(c1[0:2]) != "0x" || string(c1[4:7]) != "+0x" {
|
|
|
|
return errBadType
|
|
|
|
}
|
|
|
|
var k [2]byte
|
|
|
|
if _, err := hex.Decode(k[0:1], c1[2:4]); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if _, err := hex.Decode(k[1:2], c1[7:9]); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if m.Digraph == nil {
|
|
|
|
m.Digraph = make(map[[2]byte]Entry)
|
|
|
|
}
|
|
|
|
if _, ok := m.Digraph[k]; ok {
|
|
|
|
return errDuplicate
|
|
|
|
}
|
|
|
|
m.Digraph[k] = e
|
|
|
|
default:
|
|
|
|
return errBadType
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
|
2022-03-15 20:19:56 +00:00
|
|
|
// Read reads a charmap from a stream.
|
|
|
|
func Read(r io.Reader, name string) (*Charmap, error) {
|
|
|
|
sc := bufio.NewScanner(r)
|
|
|
|
var m Charmap
|
|
|
|
for lineno := 1; sc.Scan(); lineno++ {
|
|
|
|
if err := m.parseLine(sc.Bytes()); err != nil {
|
|
|
|
return nil, &Error{name, lineno, err}
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|
|
|
|
}
|
2022-03-15 20:19:56 +00:00
|
|
|
if err := sc.Err(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return &m, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// ReadFile reads a charmap from a file on disk.
|
|
|
|
func ReadFile(name string) (*Charmap, error) {
|
|
|
|
fp, err := os.Open(name)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
defer fp.Close()
|
|
|
|
return Read(fp, name)
|
2021-12-15 04:48:47 +00:00
|
|
|
}
|