2021-03-11 01:47:59 -05:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac character had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
"bufio"
|
2021-03-16 13:07:56 -04:00
|
|
|
"flag"
|
2021-03-11 01:47:59 -05:00
|
|
|
"fmt"
|
|
|
|
"os"
|
|
|
|
"strconv"
|
|
|
|
|
2021-12-14 12:23:14 -05:00
|
|
|
"github.com/depp/packbits"
|
2021-03-11 01:47:59 -05:00
|
|
|
"golang.org/x/text/unicode/norm"
|
|
|
|
)
|
|
|
|
|
2021-03-16 13:07:56 -04:00
|
|
|
// Command-line switches that enable optional diagnostic dumps. Both are
// registered by init below and default to false.
var flagDumpSequences, flagDumpTransitions bool

// init registers the diagnostic switches with the default flag set.
func init() {
	boolFlags := []struct {
		dest  *bool
		name  string
		usage string
	}{
		{&flagDumpSequences, "dump-sequences", "dump Unicode sequences"},
		{&flagDumpTransitions, "dump-transitions", "dump state machine state transition tables"},
	}
	for _, bf := range boolFlags {
		flag.BoolVar(bf.dest, bf.name, false, bf.usage)
	}
}
|
|
|
|
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac character had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
// characters maps each 8-bit character code (the array index) to a Unicode
// code point. It is filled in by init below.
var characters [256]rune

// init populates the characters table. Codes 0-127 map to the code point with
// the same numeric value; codes 128-255 take their code points from the
// literal table below.
//
// NOTE(review): the high-half values look like the Mac OS Roman character
// set -- confirm against the encoding this tool is meant to target.
func init() {
	upper := [128]uint16{
		0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
		0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
		0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
		0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
		0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
		0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
		0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
		0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
		0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
		0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
		0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
		0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
		0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
		0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
		0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
		0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
	}
	// Both halves are the same size, so one pass fills the identity mapping
	// for the low half and the table-driven mapping for the high half.
	for i, cp := range upper {
		characters[i] = rune(i)
		characters[i+len(upper)] = rune(cp)
	}
}
|
|
|
|
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac character had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
var (
|
|
|
|
// lineBreaks is the set of all sequences recognized as line breaks.
|
|
|
|
lineBreaks = [][]byte{{'\n'}, {'\r'}, {'\r', '\n'}}
|
|
|
|
// normForms is the set of Unicode normalization forms recognized.
|
|
|
|
normForms = []norm.Form{norm.NFC, norm.NFD}
|
|
|
|
)
|
2021-03-11 01:47:59 -05:00
|
|
|
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac character had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
func makeConverter(cmap *[256]rune) (*node, error) {
|
|
|
|
var n node
|
|
|
|
// Special case for CR and LF.
|
|
|
|
for _, b := range lineBreaks {
|
|
|
|
if err := n.add('\r', b); err != nil {
|
|
|
|
return nil, err
|
2021-03-11 01:47:59 -05:00
|
|
|
}
|
|
|
|
}
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac charecter had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
for m, u := range *cmap {
|
|
|
|
if m == '\r' || m == '\n' {
|
|
|
|
continue
|
2021-03-16 13:07:56 -04:00
|
|
|
}
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac charecter had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
us := string(u)
|
|
|
|
for _, form := range normForms {
|
|
|
|
bytes := []byte(form.String(us))
|
|
|
|
fmt.Fprintf(os.Stderr, "%d -> %v\n", u, bytes)
|
|
|
|
if err := n.add(byte(m), bytes); err != nil {
|
|
|
|
return nil, err
|
2021-03-11 01:47:59 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac charecter had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
return &n, nil
|
2021-03-11 01:47:59 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
// printData writes data to f as a C source fragment: a generated-file header,
// a FROM_UNIX_DATALEN macro whose value is ulen, and a kFromUnixData array of
// unsigned char holding the bytes of data as decimal literals.
//
// Array rows are wrapped so that no row exceeds 80 characters. Everything,
// including the closing brace of the array, is written to f. It returns the
// first write error encountered.
func printData(f *os.File, ulen int, data []byte) error {
	if _, err := fmt.Fprint(f, "/* This file is automatically generated. */\n"+
		"// clang-format off\n"); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "#define FROM_UNIX_DATALEN %d\n", ulen); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "static const unsigned char kFromUnixData[%d] = {\n", len(data)); err != nil {
		return err
	}
	var line []byte
	for _, n := range data {
		sv := len(line)
		line = strconv.AppendUint(line, uint64(n), 10)
		line = append(line, ',')
		if len(line) > 80 {
			// The entry just appended overflows the row: emit the row
			// without it, then start the next row with that entry.
			line = append(line[:sv], '\n')
			if _, err := f.Write(line); err != nil {
				return err
			}
			line = strconv.AppendUint(line[:0], uint64(n), 10)
			line = append(line, ',')
		}
	}
	line = append(line, '\n')
	if _, err := f.Write(line); err != nil {
		return err
	}
	// Close the array in f itself, not on standard output.
	_, err := fmt.Fprint(f, "};\n")
	return err
}
|
|
|
|
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac character had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
func mainE() error {
|
|
|
|
n, err := makeConverter(&characters)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
table := n.genTable()
|
2021-03-16 13:07:56 -04:00
|
|
|
if flagDumpTransitions {
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac charecter had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
w := bufio.NewWriter(os.Stderr)
|
|
|
|
table.dumpTransitions(w)
|
|
|
|
w.Flush()
|
2021-03-16 13:07:56 -04:00
|
|
|
}
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac charecter had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
bytes := table.toBytes()
|
2021-12-14 12:23:14 -05:00
|
|
|
bits := packbits.Pack(bytes)
|
Refactor table generator, handle line breaks
Extract table generation to its own file, table.go, and refactor the
interface. This exposed an inconsistency in the way that line breaks
were handled: both CR and LF on the Mac side were mapped to LF on the
UTF-8 side, but when the conversion table was inverted, the reverse
mappings would conflict. Previously, there was no explicit handling for
it, and whichever Mac charecter had a higher byte value would take
precedence. Conflicts are now detected and return an error, so line
breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to
CR when converting UTF-8 to Mac.
2021-12-14 13:12:52 -05:00
|
|
|
return printData(os.Stdout, len(bytes), bits)
|
|
|
|
}
|
|
|
|
|
|
|
|
func main() {
|
|
|
|
flag.Parse()
|
|
|
|
if err := mainE(); err != nil {
|
2021-03-11 01:47:59 -05:00
|
|
|
fmt.Fprintln(os.Stderr, "Error:", err)
|
|
|
|
os.Exit(1)
|
|
|
|
}
|
|
|
|
}
|