mirror of
https://github.com/autc04/Retro68.git
synced 2025-03-03 02:30:58 +00:00
479 lines
10 KiB
Go
479 lines
10 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package norm contains types and functions for normalizing Unicode strings.
|
|
package norm
|
|
|
|
import "unicode/utf8"
|
|
|
|
// A Form denotes a canonical representation of Unicode code points.
|
|
// The Unicode-defined normalization and equivalence forms are:
|
|
//
|
|
// NFC Unicode Normalization Form C
|
|
// NFD Unicode Normalization Form D
|
|
// NFKC Unicode Normalization Form KC
|
|
// NFKD Unicode Normalization Form KD
|
|
//
|
|
// For a Form f, this documentation uses the notation f(x) to mean
|
|
// the bytes or string x converted to the given form.
|
|
// A position n in x is called a boundary if conversion to the form can
|
|
// proceed independently on both sides:
|
|
// f(x) == append(f(x[0:n]), f(x[n:])...)
|
|
//
|
|
// References: http://unicode.org/reports/tr15/ and
|
|
// http://unicode.org/notes/tn5/.
|
|
type Form int
|
|
|
|
const (
|
|
NFC Form = iota
|
|
NFD
|
|
NFKC
|
|
NFKD
|
|
)
|
|
|
|
// Bytes returns f(b). May return b if f(b) = b.
|
|
func (f Form) Bytes(b []byte) []byte {
|
|
rb := reorderBuffer{}
|
|
rb.init(f, b)
|
|
n := quickSpan(&rb, 0)
|
|
if n == len(b) {
|
|
return b
|
|
}
|
|
out := make([]byte, n, len(b))
|
|
copy(out, b[0:n])
|
|
return doAppend(&rb, out, n)
|
|
}
|
|
|
|
// String returns f(s).
|
|
func (f Form) String(s string) string {
|
|
rb := reorderBuffer{}
|
|
rb.initString(f, s)
|
|
n := quickSpan(&rb, 0)
|
|
if n == len(s) {
|
|
return s
|
|
}
|
|
out := make([]byte, n, len(s))
|
|
copy(out, s[0:n])
|
|
return string(doAppend(&rb, out, n))
|
|
}
|
|
|
|
// IsNormal returns true if b == f(b).
|
|
func (f Form) IsNormal(b []byte) bool {
|
|
rb := reorderBuffer{}
|
|
rb.init(f, b)
|
|
bp := quickSpan(&rb, 0)
|
|
if bp == len(b) {
|
|
return true
|
|
}
|
|
for bp < len(b) {
|
|
decomposeSegment(&rb, bp)
|
|
if rb.f.composing {
|
|
rb.compose()
|
|
}
|
|
for i := 0; i < rb.nrune; i++ {
|
|
info := rb.rune[i]
|
|
if bp+int(info.size) > len(b) {
|
|
return false
|
|
}
|
|
p := info.pos
|
|
pe := p + info.size
|
|
for ; p < pe; p++ {
|
|
if b[bp] != rb.byte[p] {
|
|
return false
|
|
}
|
|
bp++
|
|
}
|
|
}
|
|
rb.reset()
|
|
bp = quickSpan(&rb, bp)
|
|
}
|
|
return true
|
|
}
|
|
|
|
// IsNormalString returns true if s == f(s).
|
|
func (f Form) IsNormalString(s string) bool {
|
|
rb := reorderBuffer{}
|
|
rb.initString(f, s)
|
|
bp := quickSpan(&rb, 0)
|
|
if bp == len(s) {
|
|
return true
|
|
}
|
|
for bp < len(s) {
|
|
decomposeSegment(&rb, bp)
|
|
if rb.f.composing {
|
|
rb.compose()
|
|
}
|
|
for i := 0; i < rb.nrune; i++ {
|
|
info := rb.rune[i]
|
|
if bp+int(info.size) > len(s) {
|
|
return false
|
|
}
|
|
p := info.pos
|
|
pe := p + info.size
|
|
for ; p < pe; p++ {
|
|
if s[bp] != rb.byte[p] {
|
|
return false
|
|
}
|
|
bp++
|
|
}
|
|
}
|
|
rb.reset()
|
|
bp = quickSpan(&rb, bp)
|
|
}
|
|
return true
|
|
}
|
|
|
|
// patchTail fixes a case where a rune may be incorrectly normalized
|
|
// if it is followed by illegal continuation bytes. It returns the
|
|
// patched buffer and whether there were trailing continuation bytes.
|
|
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, bool) {
|
|
info, p := lastRuneStart(&rb.f, buf)
|
|
if p == -1 || info.size == 0 {
|
|
return buf, false
|
|
}
|
|
end := p + int(info.size)
|
|
extra := len(buf) - end
|
|
if extra > 0 {
|
|
// Potentially allocating memory. However, this only
|
|
// happens with ill-formed UTF-8.
|
|
x := make([]byte, 0)
|
|
x = append(x, buf[len(buf)-extra:]...)
|
|
buf = decomposeToLastBoundary(rb, buf[:end])
|
|
if rb.f.composing {
|
|
rb.compose()
|
|
}
|
|
buf = rb.flush(buf)
|
|
return append(buf, x...), true
|
|
}
|
|
return buf, false
|
|
}
|
|
|
|
func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
|
|
if rb.nsrc == i {
|
|
return dst, i
|
|
}
|
|
end := quickSpan(rb, i)
|
|
return rb.src.appendSlice(dst, i, end), end
|
|
}
|
|
|
|
// Append returns f(append(out, b...)).
|
|
// The buffer out must be nil, empty, or equal to f(out).
|
|
func (f Form) Append(out []byte, src ...byte) []byte {
|
|
if len(src) == 0 {
|
|
return out
|
|
}
|
|
rb := reorderBuffer{}
|
|
rb.init(f, src)
|
|
return doAppend(&rb, out, 0)
|
|
}
|
|
|
|
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
|
|
src, n := rb.src, rb.nsrc
|
|
doMerge := len(out) > 0
|
|
if q := src.skipNonStarter(p); q > p {
|
|
// Move leading non-starters to destination.
|
|
out = src.appendSlice(out, p, q)
|
|
buf, endsInError := patchTail(rb, out)
|
|
if endsInError {
|
|
out = buf
|
|
doMerge = false // no need to merge, ends with illegal UTF-8
|
|
} else {
|
|
out = decomposeToLastBoundary(rb, buf) // force decomposition
|
|
}
|
|
p = q
|
|
}
|
|
fd := &rb.f
|
|
if doMerge {
|
|
var info runeInfo
|
|
if p < n {
|
|
info = fd.info(src, p)
|
|
if p == 0 && !info.boundaryBefore() {
|
|
out = decomposeToLastBoundary(rb, out)
|
|
}
|
|
}
|
|
if info.size == 0 || info.boundaryBefore() {
|
|
if fd.composing {
|
|
rb.compose()
|
|
}
|
|
out = rb.flush(out)
|
|
if info.size == 0 {
|
|
// Append incomplete UTF-8 encoding.
|
|
return src.appendSlice(out, p, n)
|
|
}
|
|
}
|
|
}
|
|
if rb.nrune == 0 {
|
|
out, p = appendQuick(rb, out, p)
|
|
}
|
|
for p < n {
|
|
p = decomposeSegment(rb, p)
|
|
if fd.composing {
|
|
rb.compose()
|
|
}
|
|
out = rb.flush(out)
|
|
out, p = appendQuick(rb, out, p)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// AppendString returns f(append(out, []byte(s))).
|
|
// The buffer out must be nil, empty, or equal to f(out).
|
|
func (f Form) AppendString(out []byte, src string) []byte {
|
|
if len(src) == 0 {
|
|
return out
|
|
}
|
|
rb := reorderBuffer{}
|
|
rb.initString(f, src)
|
|
return doAppend(&rb, out, 0)
|
|
}
|
|
|
|
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
|
// It is not guaranteed to return the largest such n.
|
|
func (f Form) QuickSpan(b []byte) int {
|
|
rb := reorderBuffer{}
|
|
rb.init(f, b)
|
|
n := quickSpan(&rb, 0)
|
|
return n
|
|
}
|
|
|
|
func quickSpan(rb *reorderBuffer, i int) int {
|
|
var lastCC uint8
|
|
var nc int
|
|
lastSegStart := i
|
|
src, n := rb.src, rb.nsrc
|
|
for i < n {
|
|
if j := src.skipASCII(i, n); i != j {
|
|
i = j
|
|
lastSegStart = i - 1
|
|
lastCC = 0
|
|
nc = 0
|
|
continue
|
|
}
|
|
info := rb.f.info(src, i)
|
|
if info.size == 0 {
|
|
// include incomplete runes
|
|
return n
|
|
}
|
|
cc := info.ccc
|
|
if rb.f.composing {
|
|
if !info.isYesC() {
|
|
break
|
|
}
|
|
} else {
|
|
if !info.isYesD() {
|
|
break
|
|
}
|
|
}
|
|
if cc == 0 {
|
|
lastSegStart = i
|
|
nc = 0
|
|
} else {
|
|
if nc >= maxCombiningChars {
|
|
lastSegStart = i
|
|
lastCC = cc
|
|
nc = 1
|
|
} else {
|
|
if lastCC > cc {
|
|
return lastSegStart
|
|
}
|
|
nc++
|
|
}
|
|
}
|
|
lastCC = cc
|
|
i += int(info.size)
|
|
}
|
|
if i == n {
|
|
return n
|
|
}
|
|
if rb.f.composing {
|
|
return lastSegStart
|
|
}
|
|
return i
|
|
}
|
|
|
|
// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
|
|
// It is not guaranteed to return the largest such n.
|
|
func (f Form) QuickSpanString(s string) int {
|
|
rb := reorderBuffer{}
|
|
rb.initString(f, s)
|
|
return quickSpan(&rb, 0)
|
|
}
|
|
|
|
// FirstBoundary returns the position i of the first boundary in b
|
|
// or -1 if b contains no boundary.
|
|
func (f Form) FirstBoundary(b []byte) int {
|
|
rb := reorderBuffer{}
|
|
rb.init(f, b)
|
|
return firstBoundary(&rb)
|
|
}
|
|
|
|
func firstBoundary(rb *reorderBuffer) int {
|
|
src, nsrc := rb.src, rb.nsrc
|
|
i := src.skipNonStarter(0)
|
|
if i >= nsrc {
|
|
return -1
|
|
}
|
|
fd := &rb.f
|
|
info := fd.info(src, i)
|
|
for n := 0; info.size != 0 && !info.boundaryBefore(); {
|
|
i += int(info.size)
|
|
if n++; n >= maxCombiningChars {
|
|
return i
|
|
}
|
|
if i >= nsrc {
|
|
if !info.boundaryAfter() {
|
|
return -1
|
|
}
|
|
return nsrc
|
|
}
|
|
info = fd.info(src, i)
|
|
}
|
|
if info.size == 0 {
|
|
return -1
|
|
}
|
|
return i
|
|
}
|
|
|
|
// FirstBoundaryInString returns the position i of the first boundary in s
|
|
// or -1 if s contains no boundary.
|
|
func (f Form) FirstBoundaryInString(s string) int {
|
|
rb := reorderBuffer{}
|
|
rb.initString(f, s)
|
|
return firstBoundary(&rb)
|
|
}
|
|
|
|
// LastBoundary returns the position i of the last boundary in b
|
|
// or -1 if b contains no boundary.
|
|
func (f Form) LastBoundary(b []byte) int {
|
|
return lastBoundary(formTable[f], b)
|
|
}
|
|
|
|
func lastBoundary(fd *formInfo, b []byte) int {
|
|
i := len(b)
|
|
info, p := lastRuneStart(fd, b)
|
|
if p == -1 {
|
|
return -1
|
|
}
|
|
if info.size == 0 { // ends with incomplete rune
|
|
if p == 0 { // starts wtih incomplete rune
|
|
return -1
|
|
}
|
|
i = p
|
|
info, p = lastRuneStart(fd, b[:i])
|
|
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
|
|
return i
|
|
}
|
|
}
|
|
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
|
|
return i
|
|
}
|
|
if info.boundaryAfter() {
|
|
return i
|
|
}
|
|
i = p
|
|
for n := 0; i >= 0 && !info.boundaryBefore(); {
|
|
info, p = lastRuneStart(fd, b[:i])
|
|
if n++; n >= maxCombiningChars {
|
|
return len(b)
|
|
}
|
|
if p+int(info.size) != i {
|
|
if p == -1 { // no boundary found
|
|
return -1
|
|
}
|
|
return i // boundary after an illegal UTF-8 encoding
|
|
}
|
|
i = p
|
|
}
|
|
return i
|
|
}
|
|
|
|
// decomposeSegment scans the first segment in src into rb.
|
|
// It returns the number of bytes consumed from src.
|
|
// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
|
|
// when we detect a sequence of 30+ non-starter chars.
|
|
func decomposeSegment(rb *reorderBuffer, sp int) int {
|
|
// Force one character to be consumed.
|
|
info := rb.f.info(rb.src, sp)
|
|
if info.size == 0 {
|
|
return 0
|
|
}
|
|
for rb.insert(rb.src, sp, info) {
|
|
sp += int(info.size)
|
|
if sp >= rb.nsrc {
|
|
break
|
|
}
|
|
info = rb.f.info(rb.src, sp)
|
|
bound := info.boundaryBefore()
|
|
if bound || info.size == 0 {
|
|
break
|
|
}
|
|
}
|
|
return sp
|
|
}
|
|
|
|
// lastRuneStart returns the runeInfo and position of the last
|
|
// rune in buf or the zero runeInfo and -1 if no rune was found.
|
|
func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
|
|
p := len(buf) - 1
|
|
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
|
|
}
|
|
if p < 0 {
|
|
return runeInfo{}, -1
|
|
}
|
|
return fd.info(inputBytes(buf), p), p
|
|
}
|
|
|
|
// decomposeToLastBoundary finds an open segment at the end of the buffer
|
|
// and scans it into rb. Returns the buffer minus the last segment.
|
|
func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
|
|
fd := &rb.f
|
|
info, i := lastRuneStart(fd, buf)
|
|
if int(info.size) != len(buf)-i {
|
|
// illegal trailing continuation bytes
|
|
return buf
|
|
}
|
|
if info.boundaryAfter() {
|
|
return buf
|
|
}
|
|
var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
|
|
add[0] = info
|
|
padd := 1
|
|
n := 1
|
|
p := len(buf) - int(info.size)
|
|
for ; p >= 0 && !info.boundaryBefore(); p -= int(info.size) {
|
|
info, i = lastRuneStart(fd, buf[:p])
|
|
if int(info.size) != p-i {
|
|
break
|
|
}
|
|
// Check that decomposition doesn't result in overflow.
|
|
if info.hasDecomposition() {
|
|
if isHangul(buf) {
|
|
i += int(info.size)
|
|
n++
|
|
} else {
|
|
dcomp := info.decomposition()
|
|
for i := 0; i < len(dcomp); {
|
|
inf := rb.f.info(inputBytes(dcomp), i)
|
|
i += int(inf.size)
|
|
n++
|
|
}
|
|
}
|
|
} else {
|
|
n++
|
|
}
|
|
if n > maxBackRunes {
|
|
break
|
|
}
|
|
add[padd] = info
|
|
padd++
|
|
}
|
|
pp := p
|
|
for padd--; padd >= 0; padd-- {
|
|
info = add[padd]
|
|
rb.insert(inputBytes(buf), pp, info)
|
|
pp += int(info.size)
|
|
}
|
|
return buf[:p]
|
|
}
|