From db4187b65b832f9327739715ddbcb74d0b92705e Mon Sep 17 00:00:00 2001 From: Dietrich Epp Date: Tue, 14 Dec 2021 13:12:52 -0500 Subject: [PATCH] Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. --- gen/macroman.go | 172 ++++++++++++------------------------------- gen/table.go | 136 ++++++++++++++++++++++++++++++++++ mac_from_unix_data.h | 98 ++++++++++++------------ 3 files changed, 231 insertions(+), 175 deletions(-) create mode 100644 gen/table.go diff --git a/gen/macroman.go b/gen/macroman.go index 213396a..1f7343c 100644 --- a/gen/macroman.go +++ b/gen/macroman.go @@ -1,6 +1,7 @@ package main import ( + "bufio" "flag" "fmt" "os" @@ -20,7 +21,7 @@ func init() { flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables") } -var characters [256]uint16 +var characters [256]rune func init() { hichars := [128]uint16{ @@ -42,131 +43,42 @@ func init() { 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7, } for i := 0; i < 128; i++ { - characters[i] = uint16(i) + characters[i] = rune(i) } for i, c := range hichars { - characters[i+128] = c + characters[i+128] = rune(c) } - characters['\n'] = '\r' } -type state struct { - chars [256]uint8 - states [256]*state -} +var ( + // lineBreaks is the set of all sequences recognized as line breaks. 
+ lineBreaks = [][]byte{{'\n'}, {'\r'}, {'\r', '\n'}} + // normForms is the set of Unicode normalization forms recognized. + normForms = []norm.Form{norm.NFC, norm.NFD} +) -func genStates() *state { - root := new(state) - // Iterate over each Unicode normalization form. - // Omit norm.NFKC, norm.NFKD - for _, form := range []norm.Form{norm.NFC, norm.NFD} { - // Iterate over Macintosh, Unicode characters. - for m, u := range characters { - st := root - bytes := []byte(form.String(string(rune(u)))) - for _, b := range bytes[:len(bytes)-1] { - ost := st - st = st.states[b] - if st == nil { - st = new(state) - ost.states[b] = st - } - } - b := bytes[len(bytes)-1] - if st.chars[b] == 0 { - st.chars[b] = uint8(m) - if flagDumpSequences { - fmt.Fprintf(os.Stderr, "%02x: %x\n", m, bytes) - } +func makeConverter(cmap *[256]rune) (*node, error) { + var n node + // Special case for CR and LF. + for _, b := range lineBreaks { + if err := n.add('\r', b); err != nil { + return nil, err + } + } + for m, u := range *cmap { + if m == '\r' || m == '\n' { + continue + } + us := string(u) + for _, form := range normForms { + bytes := []byte(form.String(us)) + fmt.Fprintf(os.Stderr, "%d -> %v\n", u, bytes) + if err := n.add(byte(m), bytes); err != nil { + return nil, err } } } - return root -} - -func (s *state) count() int { - n := 1 - for _, s := range s.states { - if s != nil { - n += s.count() - } - } - return n -} - -func (s *state) writeTable(table []uint16, pos int) int { - data := table[pos*256 : pos*256+256 : pos*256+256] - pos++ - for i, c := range s.chars { - data[i] = uint16(c) - } - for i, c := range s.states { - if c != nil { - data[i] |= uint16(pos << 8) - pos = c.writeTable(table, pos) - } - } - return pos -} - -func (s *state) genTable() []uint16 { - n := s.count() - table := make([]uint16, 256*n) - pos := s.writeTable(table, 0) - if pos != n { - panic("bad table") - } - return table -} - -func dumpTransitions(table []uint16) { - n := len(table) >> 8 - for i := 0; i < 
n; i++ { - t := table[i<<8 : (i+1)<<8] - fmt.Fprintf(os.Stderr, "State $%02x\n", i) - for m, v := range t { - if v != 0 { - fmt.Fprintf(os.Stderr, " $%02x ->", m) - st := v >> 8 - chr := v & 255 - if st != 0 { - fmt.Fprintf(os.Stderr, " state $%02x", st) - } - if chr != 0 { - fmt.Fprintf(os.Stderr, " char $%02x", chr) - } - fmt.Fprintln(os.Stderr) - } - } - fmt.Fprintln(os.Stderr) - } -} - -func tableToBytes(t []uint16) []byte { - b := make([]byte, len(t)*2) - for i, x := range t { - b[i*2] = byte(x >> 8) - b[i*2+1] = byte(x) - } - return b -} - -func printTable(table []uint16) error { - if _, err := fmt.Print("static const unsigned short kFromUnixTable[] = {"); err != nil { - return err - } - for i, n := range table { - if i&15 == 0 { - if _, err := fmt.Println(); err != nil { - return err - } - } - if _, err := fmt.Printf("%d,", n); err != nil { - return err - } - } - _, err := fmt.Print("\n};\n") - return err + return &n, nil } func printData(f *os.File, ulen int, data []byte) error { @@ -202,17 +114,25 @@ func printData(f *os.File, ulen int, data []byte) error { return err } +func mainE() error { + n, err := makeConverter(&characters) + if err != nil { + return err + } + table := n.genTable() + if flagDumpTransitions { + w := bufio.NewWriter(os.Stderr) + table.dumpTransitions(w) + w.Flush() + } + bytes := table.toBytes() + bits := packbits.Pack(bytes) + return printData(os.Stdout, len(bytes), bits) +} + func main() { flag.Parse() - - root := genStates() - table := root.genTable() - if flagDumpTransitions { - dumpTransitions(table) - } - bytes := tableToBytes(table) - bits := packbits.Pack(bytes) - if err := printData(os.Stdout, len(bytes), bits); err != nil { + if err := mainE(); err != nil { fmt.Fprintln(os.Stderr, "Error:", err) os.Exit(1) } diff --git a/gen/table.go b/gen/table.go new file mode 100644 index 0000000..888db1d --- /dev/null +++ b/gen/table.go @@ -0,0 +1,136 @@ +package main + +import ( + "bufio" + "errors" + "fmt" +) + +var ( + errEmptyString = 
errors.New("empty input") + errZeroInput = errors.New("zero byte input") + errZeroOutput = errors.New("zero byte output") +) + +type inputConflictErr struct { + input []byte + out1 byte + out2 byte +} + +func (e *inputConflictErr) Error() string { + return fmt.Sprintf("table conflict: %d maps to both %d and %d", e.input, e.out1, e.out2) +} + +// A node is an element in a Unicode decoding graph. +type node struct { + chars [256]uint8 + children [256]*node +} + +// add adds the mapping from "in" to "out", creating additional nodes as +// necessary. +func (n *node) add(out byte, in []byte) error { + if len(in) == 0 { + return errEmptyString + } + if in[0] == 0 { + if out == 0 { + return nil + } + } + if out == 0 { + return errZeroOutput + } + for _, b := range in[:len(in)-1] { + old := n + n = n.children[b] + if n == nil { + n = new(node) + old.children[b] = n + } + } + b := in[len(in)-1] + x := n.chars[b] + if x == 0 { + n.chars[b] = out + return nil + } + if x == out { + return nil + } + return &inputConflictErr{ + input: in, + out1: x, + out2: out, + } +} + +func (n *node) size() int { + sz := 1 + for _, c := range n.children { + if c != nil { + sz += c.size() + } + } + return sz +} + +func (n *node) writeTable(table decoderTable, pos int) int { + data := table[pos*256 : pos*256+256 : pos*256+256] + pos++ + for i, c := range n.chars { + data[i] = uint16(c) + } + for i, c := range n.children { + if c != nil { + data[i] |= uint16(pos << 8) + pos = c.writeTable(table, pos) + } + } + return pos +} + +func (n *node) genTable() decoderTable { + sz := n.size() + table := make(decoderTable, 256*sz) + pos := n.writeTable(table, 0) + if pos != sz { + panic("bad table") + } + return table +} + +type decoderTable []uint16 + +func (t decoderTable) dumpTransitions(w *bufio.Writer) { + n := len(t) >> 8 + for i := 0; i < n; i++ { + t := t[i<<8 : (i+1)<<8] + fmt.Fprintf(w, "State $%02x\n", i) + for m, v := range t { + if v != 0 { + fmt.Fprintf(w, " $%02x ->", m) + st := v >> 8 + 
chr := v & 255 + if st != 0 { + fmt.Fprintf(w, " state $%02x", st) + } + if chr != 0 { + fmt.Fprintf(w, " char $%02x", chr) + } + w.WriteByte('\n') + } + } + w.WriteByte('\n') + } +} + +func (t decoderTable) toBytes() []byte { + b := make([]byte, len(t)*2) + for i, x := range t { + b[i*2] = byte(x >> 8) + b[i*2+1] = byte(x) + } + return b +} diff --git a/mac_from_unix_data.h b/mac_from_unix_data.h index 6ab6eb7..3d7281d 100644 --- a/mac_from_unix_data.h +++ b/mac_from_unix_data.h @@ -1,52 +1,52 @@ /* This file is automatically generated. */ // clang-format off -#define FROM_UNIX_DATALEN 27648 -static const unsigned char kFromUnixData[1256] = { -254,0,16,1,0,2,0,3,0,4,0,5,0,6,0,7,0,8,0,9,254,0,127,11,0,12,0,10,0,14,0,15,0, -16,0,17,0,18,0,19,0,20,0,21,0,22,0,23,0,24,0,25,0,26,0,27,0,28,0,29,0,30,0,31,0, -32,0,33,0,34,0,35,0,36,0,37,0,38,0,39,0,40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0, -48,0,49,0,50,0,51,0,52,0,53,0,54,0,55,0,56,0,57,0,58,0,59,0,60,1,61,0,62,0,63,0, -64,3,65,0,66,5,67,0,68,7,69,0,70,0,71,0,72,9,73,0,74,0,104,75,0,76,0,77,11,78, -13,79,0,80,0,81,0,82,0,83,0,84,15,85,0,86,0,87,0,88,17,89,0,90,0,91,0,92,0,93,0, -94,0,95,0,96,19,97,0,98,21,99,0,100,23,101,0,102,0,103,0,104,25,105,0,106,0,107, -0,108,0,109,27,110,29,111,0,112,0,113,0,114,0,115,0,116,31,117,0,118,0,119,0, -120,33,121,0,122,0,123,0,124,0,125,0,126,0,127,129,0,253,0,8,35,0,36,0,37,0,38, -0,39,248,0,0,40,252,0,2,41,0,42,220,0,0,43,232,0,0,51,129,0,129,0,129,0,200,0,0, -2,129,0,129,0,129,0,169,0,0,173,129,0,129,0,129,0,129,0,219,0,0,4,129,0,129,0, -153,0,6,203,0,231,0,229,0,204,248,0,0,128,254,0,0,129,129,0,129,0,129,0,129,0, -129,0,255,0,0,6,129,0,129,0,129,0,203,0,0,130,129,0,129,0,129,0,129,0,185,0,0,8, -129,0,129,0,153,0,4,233,0,131,0,230,246,0,0,232,129,0,129,0,129,0,129,0,129,0, -251,0,0,10,129,0,129,0,153,0,4,237,0,234,0,235,246,0,0,236,129,0,129,0,129,0, -129,0,129,0,251,0,0,12,129,0,129,0,147,0,0,132,129,0,129,0,129,0,129,0,129,0, 
-241,0,0,14,129,0,129,0,153,0,6,241,0,238,0,239,0,205,248,0,0,133,129,0,129,0, -129,0,129,0,129,0,251,0,0,16,129,0,129,0,153,0,4,244,0,242,0,243,246,0,0,134, -129,0,129,0,129,0,129,0,129,0,251,0,0,18,129,0,129,0,137,0,0,217,129,0,129,0, -129,0,129,0,129,0,251,0,0,20,129,0,129,0,153,0,6,136,0,135,0,137,0,139,248,0,0, -138,254,0,0,140,129,0,129,0,129,0,129,0,129,0,255,0,0,22,129,0,129,0,129,0,203, -0,0,141,129,0,129,0,129,0,129,0,185,0,0,24,129,0,129,0,153,0,4,143,0,142,0,144, -246,0,0,145,129,0,129,0,129,0,129,0,129,0,251,0,0,26,129,0,129,0,153,0,4,147,0, -146,0,148,246,0,0,149,129,0,129,0,129,0,129,0,129,0,251,0,0,28,129,0,129,0,147, -0,0,150,129,0,129,0,129,0,129,0,129,0,241,0,0,30,129,0,129,0,153,0,6,152,0,151, -0,153,0,155,248,0,0,154,129,0,129,0,129,0,129,0,129,0,251,0,0,32,129,0,129,0, -153,0,4,157,0,156,0,158,246,0,0,159,129,0,129,0,129,0,129,0,129,0,251,0,0,34, -129,0,129,0,137,0,0,216,129,0,129,0,129,0,129,0,210,0,6,202,0,193,0,162,0,163, -254,0,0,180,254,0,10,164,0,172,0,169,0,187,0,199,0,194,254,0,6,168,0,248,0,161, -0,177,252,0,8,171,0,181,0,166,0,225,0,252,254,0,2,188,0,200,250,0,1,192,0,129,0, -129,0,129,0,30,203,0,231,0,229,0,204,0,128,0,129,0,174,0,130,0,233,0,131,0,230, -0,232,0,237,0,234,0,235,0,236,254,0,10,132,0,241,0,238,0,239,0,205,0,133,254,0, -8,175,0,244,0,242,0,243,0,134,252,0,32,167,0,136,0,135,0,137,0,139,0,138,0,140, -0,190,0,141,0,143,0,142,0,144,0,145,0,147,0,146,0,148,0,149,254,0,22,150,0,152, -0,151,0,153,0,155,0,154,0,214,0,191,0,157,0,156,0,158,0,159,252,0,0,216,129,0, -129,0,129,0,158,0,0,245,129,0,129,0,129,0,192,0,2,206,0,207,184,0,0,217,129,0, -129,0,129,0,206,0,0,196,129,0,129,0,129,0,154,0,2,246,0,255,224,0,10,249,0,250, -0,251,0,254,0,247,0,253,129,0,129,0,129,0,129,0,234,0,0,189,129,0,129,0,129,0, -212,0,0,185,129,0,129,0,129,0,131,0,4,44,0,45,0,46,254,0,0,47,250,0,2,48,0,49, -230,0,0,50,129,0,129,0,129,0,137,0,2,208,0,209,250,0,4,212,0,213,0,226,254,0,4, 
-210,0,211,0,227,254,0,4,160,0,224,0,165,250,0,0,201,238,0,0,228,240,0,2,220,0, -221,129,0,129,0,129,0,238,0,0,218,129,0,129,0,129,0,129,0,178,0,0,219,129,0,129, -0,129,0,150,0,0,170,129,0,129,0,129,0,194,0,0,182,250,0,0,198,240,0,0,184,254,0, -0,183,240,0,0,195,250,0,0,176,232,0,0,186,129,0,129,0,129,0,200,0,0,197,210,0,0, -173,250,0,2,178,0,179,129,0,129,0,129,0,184,0,0,215,129,0,129,0,129,0,129,0,209, -0,0,52,240,0,0,53,129,0,129,0,129,0,129,0,219,0,0,240,129,0,129,0,129,0,254,0,2, -222,0,223,129,0,135,0, +#define FROM_UNIX_DATALEN 28160 +static const unsigned char kFromUnixData[1268] = { +254,0,127,1,0,2,0,3,0,4,0,5,0,6,0,7,0,8,0,9,0,13,0,11,0,12,1,13,0,14,0,15,0,16, +0,17,0,18,0,19,0,20,0,21,0,22,0,23,0,24,0,25,0,26,0,27,0,28,0,29,0,30,0,31,0,32, +0,33,0,34,0,35,0,36,0,37,0,38,0,39,0,40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0,48, +0,49,0,50,0,51,0,52,0,53,0,54,0,55,0,56,0,57,0,58,0,59,0,60,2,61,0,62,0,63,0,64, +4,124,65,0,66,6,67,0,68,8,69,0,70,0,71,0,72,10,73,0,74,0,75,0,76,0,77,12,78,14, +79,0,80,0,81,0,82,0,83,0,84,16,85,0,86,0,87,0,88,18,89,0,90,0,91,0,92,0,93,0,94, +0,95,0,96,20,97,0,98,22,99,0,100,24,101,0,102,0,103,0,104,26,105,0,106,0,107,0, +108,0,109,28,110,30,111,0,112,0,113,0,114,0,115,0,116,32,117,0,118,0,119,0,120, +34,121,0,122,0,123,0,124,0,125,0,126,0,127,129,0,253,0,8,36,0,37,0,38,0,39,0,40, +248,0,0,41,252,0,2,42,0,43,220,0,0,44,232,0,0,52,203,0,0,13,129,0,129,0,129,0, +129,0,129,0,129,0,129,0,255,0,0,3,129,0,129,0,129,0,169,0,0,173,129,0,129,0,129, +0,129,0,219,0,0,5,129,0,129,0,153,0,6,203,0,231,0,229,0,204,248,0,0,128,254,0,0, +129,129,0,129,0,129,0,129,0,129,0,255,0,0,7,129,0,129,0,129,0,203,0,0,130,129,0, +129,0,129,0,129,0,185,0,0,9,129,0,129,0,153,0,4,233,0,131,0,230,246,0,0,232,129, +0,129,0,129,0,129,0,129,0,251,0,0,11,129,0,129,0,153,0,4,237,0,234,0,235,246,0, +0,236,129,0,129,0,129,0,129,0,129,0,251,0,0,13,129,0,129,0,147,0,0,132,129,0, +129,0,129,0,129,0,129,0,241,0,0,15,129,0,129,0,153,0,6,241,0,238,0,239,0,205, 
+248,0,0,133,129,0,129,0,129,0,129,0,129,0,251,0,0,17,129,0,129,0,153,0,4,244,0, +242,0,243,246,0,0,134,129,0,129,0,129,0,129,0,129,0,251,0,0,19,129,0,129,0,137, +0,0,217,129,0,129,0,129,0,129,0,129,0,251,0,0,21,129,0,129,0,153,0,6,136,0,135, +0,137,0,139,248,0,0,138,254,0,0,140,129,0,129,0,129,0,129,0,129,0,255,0,0,23, +129,0,129,0,129,0,203,0,0,141,129,0,129,0,129,0,129,0,185,0,0,25,129,0,129,0, +153,0,4,143,0,142,0,144,246,0,0,145,129,0,129,0,129,0,129,0,129,0,251,0,0,27, +129,0,129,0,153,0,4,147,0,146,0,148,246,0,0,149,129,0,129,0,129,0,129,0,129,0, +251,0,0,29,129,0,129,0,147,0,0,150,129,0,129,0,129,0,129,0,129,0,241,0,0,31,129, +0,129,0,153,0,6,152,0,151,0,153,0,155,248,0,0,154,129,0,129,0,129,0,129,0,129,0, +251,0,0,33,129,0,129,0,153,0,4,157,0,156,0,158,246,0,0,159,129,0,129,0,129,0, +129,0,129,0,251,0,0,35,129,0,129,0,137,0,0,216,129,0,129,0,129,0,129,0,210,0,6, +202,0,193,0,162,0,163,254,0,0,180,254,0,10,164,0,172,0,169,0,187,0,199,0,194, +254,0,6,168,0,248,0,161,0,177,252,0,8,171,0,181,0,166,0,225,0,252,254,0,2,188,0, +200,250,0,1,192,0,129,0,129,0,129,0,30,203,0,231,0,229,0,204,0,128,0,129,0,174, +0,130,0,233,0,131,0,230,0,232,0,237,0,234,0,235,0,236,254,0,10,132,0,241,0,238, +0,239,0,205,0,133,254,0,8,175,0,244,0,242,0,243,0,134,252,0,32,167,0,136,0,135, +0,137,0,139,0,138,0,140,0,190,0,141,0,143,0,142,0,144,0,145,0,147,0,146,0,148,0, +149,254,0,22,150,0,152,0,151,0,153,0,155,0,154,0,214,0,191,0,157,0,156,0,158,0, +159,252,0,0,216,129,0,129,0,129,0,158,0,0,245,129,0,129,0,129,0,192,0,2,206,0, +207,184,0,0,217,129,0,129,0,129,0,206,0,0,196,129,0,129,0,129,0,154,0,2,246,0, +255,224,0,10,249,0,250,0,251,0,254,0,247,0,253,129,0,129,0,129,0,129,0,234,0,0, +189,129,0,129,0,129,0,212,0,0,185,129,0,129,0,129,0,131,0,4,45,0,46,0,47,254,0, +0,48,250,0,2,49,0,50,230,0,0,51,129,0,129,0,129,0,137,0,2,208,0,209,250,0,4,212, +0,213,0,226,254,0,4,210,0,211,0,227,254,0,4,160,0,224,0,165,250,0,0,201,238,0,0, 
+228,240,0,2,220,0,221,129,0,129,0,129,0,238,0,0,218,129,0,129,0,129,0,129,0,178, +0,0,219,129,0,129,0,129,0,150,0,0,170,129,0,129,0,129,0,194,0,0,182,250,0,0,198, +240,0,0,184,254,0,0,183,240,0,0,195,250,0,0,176,232,0,0,186,129,0,129,0,129,0, +200,0,0,197,210,0,0,173,250,0,2,178,0,179,129,0,129,0,129,0,184,0,0,215,129,0, +129,0,129,0,129,0,209,0,0,53,240,0,0,54,129,0,129,0,129,0,129,0,219,0,0,240,129, +0,129,0,129,0,254,0,2,222,0,223,129,0,135,0, };