Refactor table generator, handle line breaks

Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac character had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
This commit is contained in:
Dietrich Epp 2021-12-14 13:12:52 -05:00
parent d77ccf009e
commit db4187b65b
3 changed files with 231 additions and 175 deletions

View File

@ -1,6 +1,7 @@
package main package main
import ( import (
"bufio"
"flag" "flag"
"fmt" "fmt"
"os" "os"
@ -20,7 +21,7 @@ func init() {
flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables") flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
} }
var characters [256]uint16 var characters [256]rune
func init() { func init() {
hichars := [128]uint16{ hichars := [128]uint16{
@ -42,131 +43,42 @@ func init() {
0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7, 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
} }
for i := 0; i < 128; i++ { for i := 0; i < 128; i++ {
characters[i] = uint16(i) characters[i] = rune(i)
} }
for i, c := range hichars { for i, c := range hichars {
characters[i+128] = c characters[i+128] = rune(c)
}
characters['\n'] = '\r'
}
type state struct {
chars [256]uint8
states [256]*state
}
func genStates() *state {
root := new(state)
// Iterate over each Unicode normalization form.
// Omit norm.NFKC, norm.NFKD
for _, form := range []norm.Form{norm.NFC, norm.NFD} {
// Iterate over Macintosh, Unicode characters.
for m, u := range characters {
st := root
bytes := []byte(form.String(string(rune(u))))
for _, b := range bytes[:len(bytes)-1] {
ost := st
st = st.states[b]
if st == nil {
st = new(state)
ost.states[b] = st
}
}
b := bytes[len(bytes)-1]
if st.chars[b] == 0 {
st.chars[b] = uint8(m)
if flagDumpSequences {
fmt.Fprintf(os.Stderr, "%02x: %x\n", m, bytes)
}
}
}
}
return root
}
func (s *state) count() int {
n := 1
for _, s := range s.states {
if s != nil {
n += s.count()
}
}
return n
}
func (s *state) writeTable(table []uint16, pos int) int {
data := table[pos*256 : pos*256+256 : pos*256+256]
pos++
for i, c := range s.chars {
data[i] = uint16(c)
}
for i, c := range s.states {
if c != nil {
data[i] |= uint16(pos << 8)
pos = c.writeTable(table, pos)
}
}
return pos
}
func (s *state) genTable() []uint16 {
n := s.count()
table := make([]uint16, 256*n)
pos := s.writeTable(table, 0)
if pos != n {
panic("bad table")
}
return table
}
func dumpTransitions(table []uint16) {
n := len(table) >> 8
for i := 0; i < n; i++ {
t := table[i<<8 : (i+1)<<8]
fmt.Fprintf(os.Stderr, "State $%02x\n", i)
for m, v := range t {
if v != 0 {
fmt.Fprintf(os.Stderr, " $%02x ->", m)
st := v >> 8
chr := v & 255
if st != 0 {
fmt.Fprintf(os.Stderr, " state $%02x", st)
}
if chr != 0 {
fmt.Fprintf(os.Stderr, " char $%02x", chr)
}
fmt.Fprintln(os.Stderr)
}
}
fmt.Fprintln(os.Stderr)
} }
} }
func tableToBytes(t []uint16) []byte { var (
b := make([]byte, len(t)*2) // lineBreaks is the set of all sequences recognized as line breaks.
for i, x := range t { lineBreaks = [][]byte{{'\n'}, {'\r'}, {'\r', '\n'}}
b[i*2] = byte(x >> 8) // normForms is the set of Unicode normalization forms recognized.
b[i*2+1] = byte(x) normForms = []norm.Form{norm.NFC, norm.NFD}
} )
return b
}
func printTable(table []uint16) error { func makeConverter(cmap *[256]rune) (*node, error) {
if _, err := fmt.Print("static const unsigned short kFromUnixTable[] = {"); err != nil { var n node
return err // Special case for CR and LF.
} for _, b := range lineBreaks {
for i, n := range table { if err := n.add('\r', b); err != nil {
if i&15 == 0 { return nil, err
if _, err := fmt.Println(); err != nil {
return err
} }
} }
if _, err := fmt.Printf("%d,", n); err != nil { for m, u := range *cmap {
return err if m == '\r' || m == '\n' {
continue
}
us := string(u)
for _, form := range normForms {
bytes := []byte(form.String(us))
fmt.Fprintf(os.Stderr, "%d -> %v\n", u, bytes)
if err := n.add(byte(m), bytes); err != nil {
return nil, err
} }
} }
_, err := fmt.Print("\n};\n") }
return err return &n, nil
} }
func printData(f *os.File, ulen int, data []byte) error { func printData(f *os.File, ulen int, data []byte) error {
@ -202,17 +114,25 @@ func printData(f *os.File, ulen int, data []byte) error {
return err return err
} }
func mainE() error {
n, err := makeConverter(&characters)
if err != nil {
return err
}
table := n.genTable()
if flagDumpTransitions {
w := bufio.NewWriter(os.Stderr)
table.dumpTransitions(w)
w.Flush()
}
bytes := table.toBytes()
bits := packbits.Pack(bytes)
return printData(os.Stdout, len(bytes), bits)
}
func main() { func main() {
flag.Parse() flag.Parse()
if err := mainE(); err != nil {
root := genStates()
table := root.genTable()
if flagDumpTransitions {
dumpTransitions(table)
}
bytes := tableToBytes(table)
bits := packbits.Pack(bytes)
if err := printData(os.Stdout, len(bytes), bits); err != nil {
fmt.Fprintln(os.Stderr, "Error:", err) fmt.Fprintln(os.Stderr, "Error:", err)
os.Exit(1) os.Exit(1)
} }

136
gen/table.go Normal file
View File

@ -0,0 +1,136 @@
package main
import (
"bufio"
"errors"
"fmt"
)
// Sentinel errors describing mappings that cannot be stored in the table.
var (
// errEmptyString is returned by node.add when the input sequence is empty.
errEmptyString = errors.New("empty input")
// errZeroInput reports a zero byte in the input sequence.
// NOTE(review): confirm which call sites are expected to return it.
errZeroInput = errors.New("zero byte input")
// errZeroOutput is returned by node.add when the requested output byte is
// zero.
errZeroOutput = errors.New("zero byte output")
)
// inputConflictErr reports that one input byte sequence was mapped to two
// different output bytes.
type inputConflictErr struct {
	input []byte // the input sequence that was mapped twice
	out1  byte   // the output that was already recorded
	out2  byte   // the conflicting output that was rejected
}

// Error implements the error interface. The input is printed as a list of
// decimal byte values.
func (e *inputConflictErr) Error() string {
	const format = "table conflict: %d maps to both %d and %d"
	return fmt.Sprintf(format, e.input, e.out1, e.out2)
}
// A node is an element in a Unicode decoding graph. Both arrays are indexed
// by the next input byte.
type node struct {
// chars[b] is the output byte for an input sequence whose final byte is b
// at this node; zero means that no mapping has been recorded.
chars [256]uint8
// children[b] is the node reached after consuming byte b of a longer input
// sequence, or nil if no such sequence has been added.
children [256]*node
}
// add records the mapping from the input byte sequence "in" to the output
// byte "out", creating additional nodes as necessary.
//
// Zero is what an empty chars slot holds, so zero bytes get special
// treatment:
//   - an empty input returns errEmptyString;
//   - an input starting with a zero byte is silently skipped when out is also
//     zero, and returns errZeroInput otherwise;
//   - a zero output for any other input returns errZeroOutput.
//
// Adding a mapping that is already present is a no-op. Adding a different
// output for an input that is already mapped returns an *inputConflictErr and
// keeps the existing mapping.
func (n *node) add(out byte, in []byte) error {
	if len(in) == 0 {
		return errEmptyString
	}
	if in[0] == 0 {
		if out == 0 {
			return nil
		}
		// A nonzero output for a zero-byte input must be rejected rather
		// than written into the table.
		return errZeroInput
	}
	if out == 0 {
		return errZeroOutput
	}

	// Walk (and extend) the graph along every byte except the last one.
	cur := n
	last := len(in) - 1
	for _, b := range in[:last] {
		next := cur.children[b]
		if next == nil {
			next = new(node)
			cur.children[b] = next
		}
		cur = next
	}

	// The final byte selects the output slot in the node that was reached.
	b := in[last]
	switch prev := cur.chars[b]; prev {
	case 0:
		cur.chars[b] = out
		return nil
	case out:
		return nil
	default:
		return &inputConflictErr{
			input: in,
			out1:  prev,
			out2:  out,
		}
	}
}
// size returns the number of nodes in the graph rooted at n, counting n
// itself.
func (n *node) size() int {
	total := 1
	for i := range n.children {
		child := n.children[i]
		if child == nil {
			continue
		}
		total += child.size()
	}
	return total
}
// writeTable serializes n into the 256-entry row of table at index pos, then
// serializes each child into the rows that follow. The low byte of an entry
// is the output character and the high byte is the row index of the child
// for that input byte. It returns the index of the first row not yet
// written.
func (n *node) writeTable(table decoderTable, pos int) int {
	base := pos * 256
	row := table[base : base+256 : base+256]
	for b := range row {
		row[b] = uint16(n.chars[b])
	}
	next := pos + 1
	for b, child := range n.children {
		if child == nil {
			continue
		}
		// The child occupies the next free row; record its index before
		// descending, because the recursion advances next.
		row[b] |= uint16(next << 8)
		next = child.writeTable(table, next)
	}
	return next
}
// genTable flattens the graph rooted at n into a decoderTable with one
// 256-entry row per node. It panics if the number of rows written does not
// match the number of nodes counted.
func (n *node) genTable() decoderTable {
	count := n.size()
	out := make(decoderTable, count*256)
	if end := n.writeTable(out, 0); end != count {
		panic("bad table")
	}
	return out
}
// decoderTable is the flattened form of a node graph: 256 entries per state,
// indexed by input byte. The low 8 bits of an entry hold the output character
// and the high 8 bits hold the index of the next state; a zero field means
// there is none.
type decoderTable []uint16
// dumpTransitions writes a human-readable listing of every state in t to w.
// Within each state, only input bytes with a nonzero entry are listed, with
// the next state and the output character shown when they are nonzero. The
// caller is responsible for flushing w.
func (t decoderTable) dumpTransitions(w *bufio.Writer) {
	states := len(t) >> 8
	for s := 0; s < states; s++ {
		fmt.Fprintf(w, "State $%02x\n", s)
		row := t[s<<8 : (s+1)<<8]
		for in, entry := range row {
			if entry == 0 {
				continue
			}
			fmt.Fprintf(w, " $%02x ->", in)
			if next := entry >> 8; next != 0 {
				fmt.Fprintf(w, " state $%02x", next)
			}
			if ch := entry & 255; ch != 0 {
				fmt.Fprintf(w, " char $%02x", ch)
			}
			w.WriteByte('\n')
		}
		w.WriteByte('\n')
	}
}
// toBytes serializes the table as a byte slice, writing each entry as two
// bytes with the high byte first.
func (t decoderTable) toBytes() []byte {
	out := make([]byte, 0, 2*len(t))
	for _, entry := range t {
		out = append(out, byte(entry>>8), byte(entry))
	}
	return out
}

View File

@ -1,52 +1,52 @@
/* This file is automatically generated. */ /* This file is automatically generated. */
// clang-format off // clang-format off
#define FROM_UNIX_DATALEN 27648 #define FROM_UNIX_DATALEN 28160
static const unsigned char kFromUnixData[1256] = { static const unsigned char kFromUnixData[1268] = {
254,0,16,1,0,2,0,3,0,4,0,5,0,6,0,7,0,8,0,9,254,0,127,11,0,12,0,10,0,14,0,15,0, 254,0,127,1,0,2,0,3,0,4,0,5,0,6,0,7,0,8,0,9,0,13,0,11,0,12,1,13,0,14,0,15,0,16,
16,0,17,0,18,0,19,0,20,0,21,0,22,0,23,0,24,0,25,0,26,0,27,0,28,0,29,0,30,0,31,0, 0,17,0,18,0,19,0,20,0,21,0,22,0,23,0,24,0,25,0,26,0,27,0,28,0,29,0,30,0,31,0,32,
32,0,33,0,34,0,35,0,36,0,37,0,38,0,39,0,40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0, 0,33,0,34,0,35,0,36,0,37,0,38,0,39,0,40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0,48,
48,0,49,0,50,0,51,0,52,0,53,0,54,0,55,0,56,0,57,0,58,0,59,0,60,1,61,0,62,0,63,0, 0,49,0,50,0,51,0,52,0,53,0,54,0,55,0,56,0,57,0,58,0,59,0,60,2,61,0,62,0,63,0,64,
64,3,65,0,66,5,67,0,68,7,69,0,70,0,71,0,72,9,73,0,74,0,104,75,0,76,0,77,11,78, 4,124,65,0,66,6,67,0,68,8,69,0,70,0,71,0,72,10,73,0,74,0,75,0,76,0,77,12,78,14,
13,79,0,80,0,81,0,82,0,83,0,84,15,85,0,86,0,87,0,88,17,89,0,90,0,91,0,92,0,93,0, 79,0,80,0,81,0,82,0,83,0,84,16,85,0,86,0,87,0,88,18,89,0,90,0,91,0,92,0,93,0,94,
94,0,95,0,96,19,97,0,98,21,99,0,100,23,101,0,102,0,103,0,104,25,105,0,106,0,107, 0,95,0,96,20,97,0,98,22,99,0,100,24,101,0,102,0,103,0,104,26,105,0,106,0,107,0,
0,108,0,109,27,110,29,111,0,112,0,113,0,114,0,115,0,116,31,117,0,118,0,119,0, 108,0,109,28,110,30,111,0,112,0,113,0,114,0,115,0,116,32,117,0,118,0,119,0,120,
120,33,121,0,122,0,123,0,124,0,125,0,126,0,127,129,0,253,0,8,35,0,36,0,37,0,38, 34,121,0,122,0,123,0,124,0,125,0,126,0,127,129,0,253,0,8,36,0,37,0,38,0,39,0,40,
0,39,248,0,0,40,252,0,2,41,0,42,220,0,0,43,232,0,0,51,129,0,129,0,129,0,200,0,0, 248,0,0,41,252,0,2,42,0,43,220,0,0,44,232,0,0,52,203,0,0,13,129,0,129,0,129,0,
2,129,0,129,0,129,0,169,0,0,173,129,0,129,0,129,0,129,0,219,0,0,4,129,0,129,0, 129,0,129,0,129,0,129,0,255,0,0,3,129,0,129,0,129,0,169,0,0,173,129,0,129,0,129,
153,0,6,203,0,231,0,229,0,204,248,0,0,128,254,0,0,129,129,0,129,0,129,0,129,0, 0,129,0,219,0,0,5,129,0,129,0,153,0,6,203,0,231,0,229,0,204,248,0,0,128,254,0,0,
129,0,255,0,0,6,129,0,129,0,129,0,203,0,0,130,129,0,129,0,129,0,129,0,185,0,0,8, 129,129,0,129,0,129,0,129,0,129,0,255,0,0,7,129,0,129,0,129,0,203,0,0,130,129,0,
129,0,129,0,153,0,4,233,0,131,0,230,246,0,0,232,129,0,129,0,129,0,129,0,129,0, 129,0,129,0,129,0,185,0,0,9,129,0,129,0,153,0,4,233,0,131,0,230,246,0,0,232,129,
251,0,0,10,129,0,129,0,153,0,4,237,0,234,0,235,246,0,0,236,129,0,129,0,129,0, 0,129,0,129,0,129,0,129,0,251,0,0,11,129,0,129,0,153,0,4,237,0,234,0,235,246,0,
129,0,129,0,251,0,0,12,129,0,129,0,147,0,0,132,129,0,129,0,129,0,129,0,129,0, 0,236,129,0,129,0,129,0,129,0,129,0,251,0,0,13,129,0,129,0,147,0,0,132,129,0,
241,0,0,14,129,0,129,0,153,0,6,241,0,238,0,239,0,205,248,0,0,133,129,0,129,0, 129,0,129,0,129,0,129,0,241,0,0,15,129,0,129,0,153,0,6,241,0,238,0,239,0,205,
129,0,129,0,129,0,251,0,0,16,129,0,129,0,153,0,4,244,0,242,0,243,246,0,0,134, 248,0,0,133,129,0,129,0,129,0,129,0,129,0,251,0,0,17,129,0,129,0,153,0,4,244,0,
129,0,129,0,129,0,129,0,129,0,251,0,0,18,129,0,129,0,137,0,0,217,129,0,129,0, 242,0,243,246,0,0,134,129,0,129,0,129,0,129,0,129,0,251,0,0,19,129,0,129,0,137,
129,0,129,0,129,0,251,0,0,20,129,0,129,0,153,0,6,136,0,135,0,137,0,139,248,0,0, 0,0,217,129,0,129,0,129,0,129,0,129,0,251,0,0,21,129,0,129,0,153,0,6,136,0,135,
138,254,0,0,140,129,0,129,0,129,0,129,0,129,0,255,0,0,22,129,0,129,0,129,0,203, 0,137,0,139,248,0,0,138,254,0,0,140,129,0,129,0,129,0,129,0,129,0,255,0,0,23,
0,0,141,129,0,129,0,129,0,129,0,185,0,0,24,129,0,129,0,153,0,4,143,0,142,0,144, 129,0,129,0,129,0,203,0,0,141,129,0,129,0,129,0,129,0,185,0,0,25,129,0,129,0,
246,0,0,145,129,0,129,0,129,0,129,0,129,0,251,0,0,26,129,0,129,0,153,0,4,147,0, 153,0,4,143,0,142,0,144,246,0,0,145,129,0,129,0,129,0,129,0,129,0,251,0,0,27,
146,0,148,246,0,0,149,129,0,129,0,129,0,129,0,129,0,251,0,0,28,129,0,129,0,147, 129,0,129,0,153,0,4,147,0,146,0,148,246,0,0,149,129,0,129,0,129,0,129,0,129,0,
0,0,150,129,0,129,0,129,0,129,0,129,0,241,0,0,30,129,0,129,0,153,0,6,152,0,151, 251,0,0,29,129,0,129,0,147,0,0,150,129,0,129,0,129,0,129,0,129,0,241,0,0,31,129,
0,153,0,155,248,0,0,154,129,0,129,0,129,0,129,0,129,0,251,0,0,32,129,0,129,0, 0,129,0,153,0,6,152,0,151,0,153,0,155,248,0,0,154,129,0,129,0,129,0,129,0,129,0,
153,0,4,157,0,156,0,158,246,0,0,159,129,0,129,0,129,0,129,0,129,0,251,0,0,34, 251,0,0,33,129,0,129,0,153,0,4,157,0,156,0,158,246,0,0,159,129,0,129,0,129,0,
129,0,129,0,137,0,0,216,129,0,129,0,129,0,129,0,210,0,6,202,0,193,0,162,0,163, 129,0,129,0,251,0,0,35,129,0,129,0,137,0,0,216,129,0,129,0,129,0,129,0,210,0,6,
254,0,0,180,254,0,10,164,0,172,0,169,0,187,0,199,0,194,254,0,6,168,0,248,0,161, 202,0,193,0,162,0,163,254,0,0,180,254,0,10,164,0,172,0,169,0,187,0,199,0,194,
0,177,252,0,8,171,0,181,0,166,0,225,0,252,254,0,2,188,0,200,250,0,1,192,0,129,0, 254,0,6,168,0,248,0,161,0,177,252,0,8,171,0,181,0,166,0,225,0,252,254,0,2,188,0,
129,0,129,0,30,203,0,231,0,229,0,204,0,128,0,129,0,174,0,130,0,233,0,131,0,230, 200,250,0,1,192,0,129,0,129,0,129,0,30,203,0,231,0,229,0,204,0,128,0,129,0,174,
0,232,0,237,0,234,0,235,0,236,254,0,10,132,0,241,0,238,0,239,0,205,0,133,254,0, 0,130,0,233,0,131,0,230,0,232,0,237,0,234,0,235,0,236,254,0,10,132,0,241,0,238,
8,175,0,244,0,242,0,243,0,134,252,0,32,167,0,136,0,135,0,137,0,139,0,138,0,140, 0,239,0,205,0,133,254,0,8,175,0,244,0,242,0,243,0,134,252,0,32,167,0,136,0,135,
0,190,0,141,0,143,0,142,0,144,0,145,0,147,0,146,0,148,0,149,254,0,22,150,0,152, 0,137,0,139,0,138,0,140,0,190,0,141,0,143,0,142,0,144,0,145,0,147,0,146,0,148,0,
0,151,0,153,0,155,0,154,0,214,0,191,0,157,0,156,0,158,0,159,252,0,0,216,129,0, 149,254,0,22,150,0,152,0,151,0,153,0,155,0,154,0,214,0,191,0,157,0,156,0,158,0,
129,0,129,0,158,0,0,245,129,0,129,0,129,0,192,0,2,206,0,207,184,0,0,217,129,0, 159,252,0,0,216,129,0,129,0,129,0,158,0,0,245,129,0,129,0,129,0,192,0,2,206,0,
129,0,129,0,206,0,0,196,129,0,129,0,129,0,154,0,2,246,0,255,224,0,10,249,0,250, 207,184,0,0,217,129,0,129,0,129,0,206,0,0,196,129,0,129,0,129,0,154,0,2,246,0,
0,251,0,254,0,247,0,253,129,0,129,0,129,0,129,0,234,0,0,189,129,0,129,0,129,0, 255,224,0,10,249,0,250,0,251,0,254,0,247,0,253,129,0,129,0,129,0,129,0,234,0,0,
212,0,0,185,129,0,129,0,129,0,131,0,4,44,0,45,0,46,254,0,0,47,250,0,2,48,0,49, 189,129,0,129,0,129,0,212,0,0,185,129,0,129,0,129,0,131,0,4,45,0,46,0,47,254,0,
230,0,0,50,129,0,129,0,129,0,137,0,2,208,0,209,250,0,4,212,0,213,0,226,254,0,4, 0,48,250,0,2,49,0,50,230,0,0,51,129,0,129,0,129,0,137,0,2,208,0,209,250,0,4,212,
210,0,211,0,227,254,0,4,160,0,224,0,165,250,0,0,201,238,0,0,228,240,0,2,220,0, 0,213,0,226,254,0,4,210,0,211,0,227,254,0,4,160,0,224,0,165,250,0,0,201,238,0,0,
221,129,0,129,0,129,0,238,0,0,218,129,0,129,0,129,0,129,0,178,0,0,219,129,0,129, 228,240,0,2,220,0,221,129,0,129,0,129,0,238,0,0,218,129,0,129,0,129,0,129,0,178,
0,129,0,150,0,0,170,129,0,129,0,129,0,194,0,0,182,250,0,0,198,240,0,0,184,254,0, 0,0,219,129,0,129,0,129,0,150,0,0,170,129,0,129,0,129,0,194,0,0,182,250,0,0,198,
0,183,240,0,0,195,250,0,0,176,232,0,0,186,129,0,129,0,129,0,200,0,0,197,210,0,0, 240,0,0,184,254,0,0,183,240,0,0,195,250,0,0,176,232,0,0,186,129,0,129,0,129,0,
173,250,0,2,178,0,179,129,0,129,0,129,0,184,0,0,215,129,0,129,0,129,0,129,0,209, 200,0,0,197,210,0,0,173,250,0,2,178,0,179,129,0,129,0,129,0,184,0,0,215,129,0,
0,0,52,240,0,0,53,129,0,129,0,129,0,129,0,219,0,0,240,129,0,129,0,129,0,254,0,2, 129,0,129,0,129,0,209,0,0,53,240,0,0,54,129,0,129,0,129,0,129,0,219,0,0,240,129,
222,0,223,129,0,135,0, 0,129,0,129,0,254,0,2,222,0,223,129,0,135,0,
}; };