// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

// This file contains Form-specific logic and wrappers for data in tables.go.

// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
//   bits
//   0..7:   ccc
//   8..11:  qcInfo (see below). isYesD is always true (no decomposition).
//   15:     1
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
* [ []] // The header contains the number of bytes in the decomposition (excluding this // length byte). The two most significant bits of this length byte correspond // to bit 2 and 3 of qcIfo (see below). The byte sequence itself starts at v+1. // The byte sequence is followed by a trailing and leading CCC if the values // for these are not zero. The value of v determines which ccc are appended // to the sequences. For v < firstCCC, there are none, for v >= firstCCC, // the sequence is followed by a trailing ccc, and for v >= firstLeadingCC // there is an additional leading ccc. const ( qcInfoMask = 0xF // to clear all but the relevant bits in a qcInfo headerLenMask = 0x3F // extract the length value from the header byte headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte ) // runeInfo is a representation for the data stored in charinfoTrie. type runeInfo struct { pos uint8 // start position in reorderBuffer; used in composition.go size uint8 // length of UTF-8 encoding of this rune ccc uint8 // leading canonical combining class (ccc if not decomposition) tccc uint8 // trailing canonical combining class (ccc if not decomposition) flags qcInfo // quick check flags index uint16 } // functions dispatchable per form type lookupFunc func(b input, i int) runeInfo // formInfo holds Form-specific functions and tables. type formInfo struct { form Form composing, compatibility bool // form type info lookupFunc } var formTable []*formInfo func init() { formTable = make([]*formInfo, 4) for i := range formTable { f := &formInfo{} formTable[i] = f f.form = Form(i) if Form(i) == NFKD || Form(i) == NFKC { f.compatibility = true f.info = lookupInfoNFKC } else { f.info = lookupInfoNFC } if Form(i) == NFC || Form(i) == NFKC { f.composing = true } } } // We do not distinguish between boundaries for NFC, NFD, etc. to avoid // unexpected behavior for the user. For example, in NFD, there is a boundary // after 'a'. 
However, a might combine with modifiers, so from the application's // perspective it is not a good boundary. We will therefore always use the // boundaries for the combining variants. func (i runeInfo) boundaryBefore() bool { if i.ccc == 0 && !i.combinesBackward() { return true } // We assume that the CCC of the first character in a decomposition // is always non-zero if different from info.ccc and that we can return // false at this point. This is verified by maketables. return false } func (i runeInfo) boundaryAfter() bool { return i.isInert() } // We pack quick check data in 4 bits: // 0: NFD_QC Yes (0) or No (1). No also means there is a decomposition. // 1..2: NFC_QC Yes(00), No (10), or Maybe (11) // 3: Combines forward (0 == false, 1 == true) // // When all 4 bits are zero, the character is inert, meaning it is never // influenced by normalization. type qcInfo uint8 func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 } func (i runeInfo) isYesD() bool { return i.flags&0x1 == 0 } func (i runeInfo) combinesForward() bool { return i.flags&0x8 != 0 } func (i runeInfo) combinesBackward() bool { return i.flags&0x2 != 0 } // == isMaybe func (i runeInfo) hasDecomposition() bool { return i.flags&0x1 != 0 } // == isNoD func (r runeInfo) isInert() bool { return r.flags&0xf == 0 && r.ccc == 0 } func (r runeInfo) decomposition() []byte { if r.index == 0 { return nil } p := r.index n := decomps[p] & 0x3F p++ return decomps[p : p+uint16(n)] } // Recomposition // We use 32-bit keys instead of 64-bit for the two codepoint keys. // This clips off the bits of three entries, but we know this will not // result in a collision. In the unlikely event that changes to // UnicodeData.txt introduce collisions, the compiler will catch it. // Note that the recomposition map for NFC and NFKC are identical. // combine returns the combined rune or 0 if it doesn't exist. 
func combine(a, b rune) rune { key := uint32(uint16(a))<<16 + uint32(uint16(b)) return recompMap[key] } func lookupInfoNFC(b input, i int) runeInfo { v, sz := b.charinfoNFC(i) return compInfo(v, sz) } func lookupInfoNFKC(b input, i int) runeInfo { v, sz := b.charinfoNFKC(i) return compInfo(v, sz) } // compInfo converts the information contained in v and sz // to a runeInfo. See the comment at the top of the file // for more information on the format. func compInfo(v uint16, sz int) runeInfo { if v == 0 { return runeInfo{size: uint8(sz)} } else if v >= 0x8000 { return runeInfo{ size: uint8(sz), ccc: uint8(v), tccc: uint8(v), flags: qcInfo(v>>8) & qcInfoMask, } } // has decomposition h := decomps[v] f := (qcInfo(h&headerFlagsMask) >> 4) | 0x1 ri := runeInfo{size: uint8(sz), flags: f, index: v} if v >= firstCCC { v += uint16(h&headerLenMask) + 1 ri.tccc = decomps[v] if v >= firstLeadingCCC { ri.ccc = decomps[v+1] } } return ri }