// syncfiles/gen/macroman.go

package main

import (
	"bufio"
	"flag"
	"fmt"
	"os"
	"strconv"

	"github.com/depp/packbits"
	"golang.org/x/text/unicode/norm"
)

// Command-line switches that enable extra diagnostic output.
var (
	// flagDumpSequences is set by -dump-sequences.
	flagDumpSequences bool
	// flagDumpTransitions is set by -dump-transitions.
	flagDumpTransitions bool
)

// init registers the diagnostic switches on the default flag set. Both
// switches default to false.
func init() {
	switches := []struct {
		target *bool
		name   string
		usage  string
	}{
		{&flagDumpSequences, "dump-sequences", "dump Unicode sequences"},
		{&flagDumpTransitions, "dump-transitions", "dump state machine state transition tables"},
	}
	for _, s := range switches {
		flag.BoolVar(s.target, s.name, false, s.usage)
	}
}

// characters maps each 8-bit Macintosh character code (the index) to the
// Unicode code point it corresponds to. It is populated by init.
var characters [256]rune

// init fills characters: codes 0-127 map to the code point with the same
// value, and codes 128-255 are taken from the table below.
func init() {
	// Code points for character codes 0x80 through 0xFF, in code order.
	upper := [128]uint16{
		0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
		0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
		0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
		0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
		0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
		0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
		0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
		0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
		0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
		0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
		0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
		0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
		0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
		0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
		0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
		0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
	}
	// First character code that is looked up in upper rather than mapped
	// to itself.
	const split = len(characters) - len(upper)
	for code := range characters {
		if code < split {
			characters[code] = rune(code)
			continue
		}
		characters[code] = rune(upper[code-split])
	}
}

// lineBreaks lists every byte sequence that is recognized as a line break:
// LF, CR, and CR followed by LF.
var lineBreaks = [][]byte{
	{'\n'},
	{'\r'},
	{'\r', '\n'},
}

// normForms lists the Unicode normalization forms that are recognized.
var normForms = []norm.Form{
	norm.NFC,
	norm.NFD,
}

// makeConverter builds the tree of input byte sequences used to generate the
// conversion table. cmap gives the Unicode code point for each 8-bit
// character code.
//
// Every sequence in lineBreaks is registered as producing '\r'. The entries
// of cmap at index '\r' and '\n' are therefore skipped. Every other entry is
// registered once per form in normForms, using the UTF-8 bytes of the code
// point normalized to that form.
//
// The first error returned by node.add is returned unchanged, with a nil
// node.
func makeConverter(cmap *[256]rune) (*node, error) {
	root := new(node)

	// Line breaks are handled explicitly: all recognized sequences yield CR.
	for _, seq := range lineBreaks {
		if err := root.add('\r', seq); err != nil {
			return nil, err
		}
	}

	for code, r := range *cmap {
		switch code {
		case '\r', '\n':
			// Already covered by the line break sequences above.
			continue
		}
		text := string(r)
		for _, nf := range normForms {
			encoded := []byte(nf.String(text))
			// NOTE(review): this trace is written unconditionally and
			// flagDumpSequences is not consulted here; confirm whether the
			// flag was meant to gate it.
			fmt.Fprintf(os.Stderr, "%d -> %v\n", r, encoded)
			if err := root.add(byte(code), encoded); err != nil {
				return nil, err
			}
		}
	}
	return root, nil
}

// printData writes data to f as a C source fragment consisting of:
//
//   - a "generated file" banner and a clang-format off marker,
//   - a FROM_UNIX_DATALEN macro whose value is ulen,
//   - a kFromUnixData array of len(data) unsigned chars, initialized with
//     the bytes of data as comma-terminated decimal literals, and
//   - the closing "};".
//
// The array body is wrapped so that no line is longer than 80 characters,
// not counting the newline. The first write error encountered is returned.
func printData(f *os.File, ulen int, data []byte) error {
	// Longest permitted array-body line, excluding the trailing newline.
	const maxWidth = 80

	if _, err := fmt.Fprint(f, "/* This file is automatically generated. */\n"+
		"// clang-format off\n"); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "#define FROM_UNIX_DATALEN %d\n", ulen); err != nil {
		return err
	}
	if _, err := fmt.Fprintf(f, "static const unsigned char kFromUnixData[%d] = {\n", len(data)); err != nil {
		return err
	}

	var line []byte
	for _, b := range data {
		mark := len(line)
		line = strconv.AppendUint(line, uint64(b), 10)
		line = append(line, ',')
		if len(line) <= maxWidth {
			continue
		}
		// The entry just appended overflows the line: emit the line without
		// it, then start the next line with that same entry.
		line = append(line[:mark], '\n')
		if _, err := f.Write(line); err != nil {
			return err
		}
		line = strconv.AppendUint(line[:0], uint64(b), 10)
		line = append(line, ',')
	}
	line = append(line, '\n')
	if _, err := f.Write(line); err != nil {
		return err
	}

	// The closing brace must go to f like everything else. Writing it to
	// standard output only works when f happens to be os.Stdout.
	_, err := fmt.Fprint(f, "};\n")
	return err
}

// mainE runs the generator: it builds the converter from characters,
// generates its table, optionally dumps the transition tables to standard
// error when -dump-transitions is set, and prints the packed table bytes to
// standard output. The length passed to printData is that of the unpacked
// table.
func mainE() error {
	root, err := makeConverter(&characters)
	if err != nil {
		return err
	}
	tbl := root.genTable()

	if flagDumpTransitions {
		dbg := bufio.NewWriter(os.Stderr)
		tbl.dumpTransitions(dbg)
		// NOTE(review): the Flush error is discarded.
		dbg.Flush()
	}

	raw := tbl.toBytes()
	return printData(os.Stdout, len(raw), packbits.Pack(raw))
}

// main parses the command-line flags and runs the generator. If the generator
// fails, the error is reported on standard error and the process exits with
// status 1.
func main() {
	flag.Parse()
	err := mainE()
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, "Error:", err)
	os.Exit(1)
}
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`package main`

			`import (`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`"bufio"`
Add flags for debugging charset conversion tables GitOrigin-RevId: d993358c037d8edd00d5819cac852c7822a89d3f 2021-03-16 13:07:56 -04:00			`"flag"`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`"fmt"`
			`"os"`
			`"strconv"`

Extract packbits into separate repository 2021-12-14 12:23:14 -05:00			`"github.com/depp/packbits"`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`"golang.org/x/text/unicode/norm"`
			`)`

Add flags for debugging charset conversion tables GitOrigin-RevId: d993358c037d8edd00d5819cac852c7822a89d3f 2021-03-16 13:07:56 -04:00			`var (`
			`flagDumpSequences bool`
			`flagDumpTransitions bool`
			`)`

			`func init() {`
			`flag.BoolVar(&flagDumpSequences, "dump-sequences", false, "dump Unicode sequences")`
			`flag.BoolVar(&flagDumpTransitions, "dump-transitions", false, "dump state machine state transition tables")`
			`}`

Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`var characters [256]rune`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00
			`func init() {`
			`hichars := [128]uint16{`
			`0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,`
			`0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,`
			`0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,`
			`0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,`
			`0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,`
			`0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,`
			`0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,`
			`0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,`
			`0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,`
			`0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,`
			`0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,`
			`0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,`
			`0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,`
			`0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,`
			`0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,`
			`0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,`
			`}`
			`for i := 0; i < 128; i++ {`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`characters[i] = rune(i)`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`}`
			`for i, c := range hichars {`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`characters[i+128] = rune(c)`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`}`
			`}`

Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`var (`
			`// lineBreaks is the set of all sequences recognized as line breaks.`
			`lineBreaks = [][]byte{{'\n'}, {'\r'}, {'\r', '\n'}}`
			`// normForms is the set of Unicode normalization forms recognized.`
			`normForms = []norm.Form{norm.NFC, norm.NFD}`
			`)`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`func makeConverter(cmap *[256]rune) (*node, error) {`
			`var n node`
			`// Special case for CR and LF.`
			`for _, b := range lineBreaks {`
			`if err := n.add('\r', b); err != nil {`
			`return nil, err`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`}`
			`}`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`for m, u := range *cmap {`
			`if m == '\r' || m == '\n' {`
			`continue`
Add flags for debugging charset conversion tables GitOrigin-RevId: d993358c037d8edd00d5819cac852c7822a89d3f 2021-03-16 13:07:56 -04:00			`}`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`us := string(u)`
			`for _, form := range normForms {`
			`bytes := []byte(form.String(us))`
			`fmt.Fprintf(os.Stderr, "%d -> %v\n", u, bytes)`
			`if err := n.add(byte(m), bytes); err != nil {`
			`return nil, err`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`}`
			`}`
			`}`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`return &n, nil`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`}`

			`func printData(f *os.File, ulen int, data []byte) error {`
			`if _, err := fmt.Fprint(f, "/* This file is automatically generated. */\n"+`
			`"// clang-format off\n"); err != nil {`
			`return err`
			`}`
			`if _, err := fmt.Fprintf(f, "#define FROM_UNIX_DATALEN %d\n", ulen); err != nil {`
			`return err`
			`}`
			`if _, err := fmt.Fprintf(f, "static const unsigned char kFromUnixData[%d] = {\n", len(data)); err != nil {`
			`return err`
			`}`
			`var line []byte`
			`for _, n := range data {`
			`sv := len(line)`
			`line = strconv.AppendUint(line, uint64(n), 10)`
			`line = append(line, ',')`
			`if len(line) > 80 {`
			`line = append(line[:sv], '\n')`
			`if _, err := f.Write(line); err != nil {`
			`return err`
			`}`
			`line = strconv.AppendUint(line[:0], uint64(n), 10)`
			`line = append(line, ',')`
			`}`
			`}`
			`line = append(line, '\n')`
			`if _, err := f.Write(line); err != nil {`
			`return err`
			`}`
			`_, err := fmt.Print("};\n")`
			`return err`
			`}`

Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`func mainE() error {`
			`n, err := makeConverter(&characters)`
			`if err != nil {`
			`return err`
			`}`
			`table := n.genTable()`
Add flags for debugging charset conversion tables GitOrigin-RevId: d993358c037d8edd00d5819cac852c7822a89d3f 2021-03-16 13:07:56 -04:00			`if flagDumpTransitions {`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`w := bufio.NewWriter(os.Stderr)`
			`table.dumpTransitions(w)`
			`w.Flush()`
Add flags for debugging charset conversion tables GitOrigin-RevId: d993358c037d8edd00d5819cac852c7822a89d3f 2021-03-16 13:07:56 -04:00			`}`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`bytes := table.toBytes()`
Extract packbits into separate repository 2021-12-14 12:23:14 -05:00			`bits := packbits.Pack(bytes)`
Refactor table generator, handle line breaks Extract table generation to its own file, table.go, and refactor the interface. This exposed an inconsistency in the way that line breaks were handled: both CR and LF on the Mac side were mapped to LF on the UTF-8 side, but when the conversion table was inverted, the reverse mappings would conflict. Previously, there was no explicit handling for it, and whichever Mac character had a higher byte value would take precedence. Conflicts are now detected and return an error, so line breaks must be mapped explicitly. The new code maps CR, LF, and CRLF to CR when converting UTF-8 to Mac. 2021-12-14 13:12:52 -05:00			`return printData(os.Stdout, len(bytes), bits)`
			`}`

			`func main() {`
			`flag.Parse()`
			`if err := mainE(); err != nil {`
Add generator for UTF-8 to Macintosh tables GitOrigin-RevId: 96f2909330b94c895688bfa2f5b50c3e4408fe54 2021-03-11 01:47:59 -05:00			`fmt.Fprintln(os.Stderr, "Error:", err)`
			`os.Exit(1)`
			`}`
			`}`