// Command main generates a C source fragment containing a compressed state
// machine table that maps UTF-8 byte sequences to single-byte Macintosh
// characters.
//
// The output (written to standard output) consists of a FROM_UNIX_DATALEN
// macro giving the uncompressed table size in bytes and a kFromUnixData array
// holding the PackBits-compressed table.
package main

import (
	"flag"
	"fmt"
	"os"
	"strconv"

	"golang.org/x/text/unicode/norm"
)

// Command-line debugging switches; both write their output to standard error.
var (
	flagDumpSequences   bool
	flagDumpTransitions bool
)

func init() {
	flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")
	flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")
}

// characters maps each Macintosh byte value (the index) to the Unicode code
// point it represents. It is filled in by init.
var characters [256]uint16

func init() {
	// Code points for Macintosh bytes 0x80-0xFF (this matches the Mac OS Roman
	// character set). 0xF8FF is a private-use code point.
	hichars := [128]uint16{
		0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
		0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
		0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
		0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
		0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
		0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
		0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
		0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
		0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
		0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
		0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
		0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
		0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
		0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
		0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
		0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
	}
	// The low half is the identity mapping.
	for i := 0; i < 128; i++ {
		characters[i] = uint16(i)
	}
	for i, c := range hichars {
		characters[i+128] = c
	}
	// Line-break override: Macintosh byte 0x0A is assigned code point U+000D.
	// NOTE(review): because genStates keeps the first mapping it sees, the
	// effect is that input byte 0x0D produces output 0x0A and input byte 0x0A
	// has no mapping at all. Confirm this is the intended direction.
	characters['\n'] = '\r'
}

// state is one node of the decoding trie, indexed by the next input byte.
type state struct {
	// chars[b] is the Macintosh byte produced when b completes a sequence in
	// this state. Zero doubles as "no character".
	chars [256]uint8
	// states[b] is the state to enter when b continues a longer sequence, or
	// nil if there is none.
	states [256]*state
}

// genStates builds the trie that recognizes the UTF-8 encoding of every entry
// in characters, in both NFC and NFD form, and returns its root.
//
// When several Macintosh bytes share the same byte sequence, the lowest byte
// value wins, because an already non-zero entry is never overwritten. With
// -dump-sequences, each newly recorded mapping is printed to standard error.
func genStates() *state {
	root := new(state)
	// Iterate over each Unicode normalization form.
	// Omit norm.NFKC, norm.NFKD
	for _, form := range []norm.Form{norm.NFC, norm.NFD} {
		// Iterate over Macintosh, Unicode characters.
		for m, u := range characters {
			seq := []byte(form.String(string(rune(u))))
			last := len(seq) - 1

			// Walk (and create as needed) one state per leading byte.
			st := root
			for _, b := range seq[:last] {
				next := st.states[b]
				if next == nil {
					next = new(state)
					st.states[b] = next
				}
				st = next
			}

			// The final byte yields the character. Zero means "unset", so an
			// entry for m == 0 is indistinguishable from no entry.
			b := seq[last]
			if st.chars[b] == 0 {
				st.chars[b] = uint8(m)
				if flagDumpSequences {
					fmt.Fprintf(os.Stderr, "%02x: %x\n", m, seq)
				}
			}
		}
	}
	return root
}

// count returns the number of states in the trie rooted at s, including s.
func (s *state) count() int {
	n := 1
	for _, child := range s.states {
		if child != nil {
			n += child.count()
		}
	}
	return n
}

// writeTable serializes the trie rooted at s into table, placing s in the
// 256-entry slot numbered pos and its descendants in the following slots in
// depth-first order. It returns the first slot number that is still free.
//
// Each entry holds the output character in its low byte and the number of
// the next state in its high byte.
func (s *state) writeTable(table []uint16, pos int) int {
	// The capacity is capped so that this slot cannot grow into the next one.
	data := table[pos*256 : pos*256+256 : pos*256+256]
	pos++
	for i, c := range s.chars {
		data[i] = uint16(c)
	}
	for i, c := range s.states {
		if c != nil {
			// The child is written next, so pos is its state number.
			data[i] |= uint16(pos << 8)
			pos = c.writeTable(table, pos)
		}
	}
	return pos
}

// genTable flattens the trie rooted at s into a single table with 256
// entries per state; s itself becomes state 0.
//
// It panics if the trie has more states than fit in the high byte of an
// entry, or if the number of states written does not match the count.
func (s *state) genTable() []uint16 {
	n := s.count()
	if n > 256 {
		// State numbers are stored in 8 bits; larger values would be
		// silently truncated by writeTable.
		panic("too many states")
	}
	table := make([]uint16, 256*n)
	if pos := s.writeTable(table, 0); pos != n {
		panic("bad table")
	}
	return table
}

// dumpTransitions prints every non-zero entry of table to standard error,
// grouped by state, in a human-readable form.
func dumpTransitions(table []uint16) {
	n := len(table) >> 8
	for i := 0; i < n; i++ {
		t := table[i<<8 : (i+1)<<8]
		fmt.Fprintf(os.Stderr, "State $%02x\n", i)
		for m, v := range t {
			if v == 0 {
				continue
			}
			fmt.Fprintf(os.Stderr, "  $%02x ->", m)
			if st := v >> 8; st != 0 {
				fmt.Fprintf(os.Stderr, " state $%02x", st)
			}
			if chr := v & 255; chr != 0 {
				fmt.Fprintf(os.Stderr, " char $%02x", chr)
			}
			fmt.Fprintln(os.Stderr)
		}
		fmt.Fprintln(os.Stderr)
	}
}

// tableToBytes serializes t as big-endian 16-bit values.
func tableToBytes(t []uint16) []byte {
	b := make([]byte, len(t)*2)
	for i, x := range t {
		b[i*2] = byte(x >> 8)
		b[i*2+1] = byte(x)
	}
	return b
}

// getRun returns the longest prefix of bytes that is either a run of two or
// more identical bytes (repeat is true) or a stretch containing no two equal
// adjacent bytes (repeat is false). The returned slice aliases bytes. For
// empty input it returns false and a nil run.
func getRun(bytes []byte) (repeat bool, run []byte) {
	if len(bytes) == 0 {
		return false, nil
	}
	ref := bytes[0]
	n := 1
	for n < len(bytes) && bytes[n] == ref {
		n++
	}
	if n >= 2 {
		return true, bytes[:n]
	}
	// Literal stretch: stop just before the first byte that equals its
	// predecessor, so that the pair starts the next (repeat) run.
	for i, b := range bytes[1:] {
		if b == ref {
			return false, bytes[:i]
		}
		ref = b
	}
	return false, bytes
}

// packBits compresses bytes with PackBits run-length encoding.
//
// Each run is limited to 128 bytes. A repeat run is emitted as the header
// byte 1-len (as a two's-complement byte) followed by the repeated value; a
// literal run is emitted as the header byte len-1 followed by the bytes.
func packBits(bytes []byte) []byte {
	var result []byte
	for len(bytes) > 0 {
		repeat, run := getRun(bytes)
		if len(run) > 128 {
			run = run[:128]
		}
		if repeat {
			result = append(result, byte(1-len(run)), run[0])
		} else {
			result = append(result, byte(len(run)-1))
			result = append(result, run...)
		}
		bytes = bytes[len(run):]
	}
	return result
}

// printTable writes table to standard output as an uncompressed C array named
// kFromUnixTable, 16 values per line. It is not currently called; the call in
// main is commented out.
func printTable(table []uint16) error {
	if _, err := fmt.Print("static const unsigned short kFromUnixTable[] = {"); err != nil {
		return err
	}
	for i, n := range table {
		if i&15 == 0 {
			if _, err := fmt.Println(); err != nil {
				return err
			}
		}
		if _, err := fmt.Printf("%d,", n); err != nil {
			return err
		}
	}
	_, err := fmt.Print("\n};\n")
	return err
}

// printData writes the generated C source to f: a header comment, the
// FROM_UNIX_DATALEN macro set to ulen, and the kFromUnixData array holding
// data as decimal values. Array lines are broken so that none exceeds 80
// characters. It returns the first write error encountered.
func printData(f *os.File, ulen int, data []byte) error {
	if _, err := fmt.Fprint(f,
		"/* This file is automatically generated. */\n"+
			"// clang-format off\n"); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "#define FROM_UNIX_DATALEN %d\n", ulen); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "static const unsigned char kFromUnixData[%d] = {\n", len(data)); err != nil {
		return err
	}
	var line []byte
	for _, n := range data {
		sv := len(line)
		line = strconv.AppendUint(line, uint64(n), 10)
		line = append(line, ',')
		if len(line) > 80 {
			// The value does not fit: flush the line as it was before this
			// value, then start a new line with the value.
			line = append(line[:sv], '\n')
			if _, err := f.Write(line); err != nil {
				return err
			}
			line = strconv.AppendUint(line[:0], uint64(n), 10)
			line = append(line, ',')
		}
	}
	line = append(line, '\n')
	if _, err := f.Write(line); err != nil {
		return err
	}
	// The terminator must go to f like everything else, not to standard
	// output, or the array is left unterminated in the destination file.
	_, err := fmt.Fprint(f, "};\n")
	return err
}

func main() {
	flag.Parse()
	root := genStates()
	table := root.genTable()
	if flagDumpTransitions {
		dumpTransitions(table)
	}
	bytes := tableToBytes(table)
	// printTable(table)
	bits := packBits(bytes)
	if err := printData(os.Stdout, len(bytes), bits); err != nil {
		fmt.Fprintln(os.Stderr, "Error:", err)
		os.Exit(1)
	}
}