310 lines
9.3 KiB
Go
310 lines
9.3 KiB
Go
// Copyright 2012 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package build
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"reflect"
|
|
"sort"
|
|
"strings"
|
|
|
|
"golang.org/x/text/internal/colltab"
|
|
)
|
|
|
|
// This file contains code for detecting contractions and generating
|
|
// the necessary tables.
|
|
// Any Unicode Collation Algorithm (UCA) table entry that has more than
|
|
// one rune on the left-hand side is called a contraction.
|
|
// See http://www.unicode.org/reports/tr10/#Contractions for more details.
|
|
//
|
|
// We define the following terms:
|
|
// initial: a rune that appears as the first rune in a contraction.
|
|
// suffix: a sequence of runes succeeding the initial rune
|
|
// in a given contraction.
|
|
// non-initial: a rune that appears in a suffix.
|
|
//
|
|
// A rune may be both an initial and a non-initial and may be so in
|
|
// many contractions. An initial may typically also appear by itself.
|
|
// In case of ambiguities, the UCA requires we match the longest
|
|
// contraction.
|
|
//
|
|
// Many contraction rules share the same set of possible suffixes.
|
|
// We store sets of suffixes in a trie that associates an index with
|
|
// each suffix in the set. This index can be used to look up a
|
|
// collation element associated with the (starter rune, suffix) pair.
|
|
//
|
|
// The trie is defined on a UTF-8 byte sequence.
|
|
// The overall trie is represented as an array of ctEntries. Each node of the trie
|
|
// is represented as a subsequence of ctEntries, where each entry corresponds to
|
|
// a possible match of a next character in the search string. An entry
|
|
// also includes the length and offset to the next sequence of entries
|
|
// to check in case of a match.
|
|
|
|
const (
	// final is the value of ctEntry.N that marks an entry as final, i.e.
	// there is no next block to descend into after a match.
	final = 0
	// noIndex is the value of ctEntry.I for an entry that does not by itself
	// complete a suffix; more bytes are needed before a result is available.
	noIndex = 0xFF
)
|
|
|
|
// ctEntry associates to a matching byte an offset and/or next sequence of
// bytes to check. A ctEntry c is called final if a match means that the
// longest suffix has been found. An entry c is final if c.N == 0.
// A single final entry can match a range of characters to an offset.
// A non-final entry always matches a single byte. Note that a non-final
// entry might still represent a completed suffix.
//
// For a non-final entry, H is the offset of the next block relative to the
// end of the block the entry belongs to (genStates computes it that way and
// lookup adds the current block length to it), so H is 0 when the next block
// immediately follows the current one.
//
// Examples:
// The suffix strings "ab" and "ac" can be represented as:
//
//	[]ctEntry{
//		{'a', 0, 1, noIndex}, // 'a' by itself does not match, so I is 0xFF.
//		{'b', 'c', 0, 1},     // "ab" -> 1, "ac" -> 2
//	}
//
// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
//
//	[]ctEntry{
//		{'a', 0, 1, noIndex}, // 'a' must be followed by 'b'.
//		{'b', 0, 2, 1},       // "ab" -> 1, may be followed by 'c' or 'd'.
//		{'d', 'd', final, 3}, // "abd" -> 3
//		{'c', 0, 1, 2},       // "abc" -> 2, may be followed by 'd'.
//		{'d', 'd', final, 4}, // "abcd" -> 4
//	}
//
// See genStateTests in contract_test.go for more examples.
type ctEntry struct {
	L uint8 // non-final: byte value to match; final: lowest match in range.
	H uint8 // non-final: offset of next block, relative to the end of the current block; final: highest match in range.
	N uint8 // non-final: length of next block; final: final
	I uint8 // result offset. Will be noIndex if more bytes are needed to complete.
}
|
|
|
|
// contractTrieSet holds a set of contraction tries. The tries are stored
// consecutively in the slice, one four-byte entry per element. The fields
// l, h, n and i presumably mirror ctEntry's L, H, N and I (same types and
// order) — confirm against the users of this type.
type contractTrieSet []struct {
	l uint8
	h uint8
	n uint8
	i uint8
}
|
|
|
|
// ctHandle identifies a single trie inside a trie set. It records where the
// trie starts in the entry array and how many entries its first node has.
type ctHandle struct {
	index int // offset of the trie's first entry in the trie set
	n     int // number of entries in the trie's first node
}
|
|
|
|
// appendTrie adds a new trie for the given suffixes to the trie set and returns
|
|
// a handle to it. The handle will be invalid on error.
|
|
func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) {
|
|
es := make([]stridx, len(suffixes))
|
|
for i, s := range suffixes {
|
|
es[i].str = s
|
|
}
|
|
sort.Sort(offsetSort(es))
|
|
for i := range es {
|
|
es[i].index = i + 1
|
|
}
|
|
sort.Sort(genidxSort(es))
|
|
i := len(*ct)
|
|
n, err := genStates(ct, es)
|
|
if err != nil {
|
|
*ct = (*ct)[:i]
|
|
return ctHandle{}, err
|
|
}
|
|
return ctHandle{i, n}, nil
|
|
}
|
|
|
|
// genStates generates ctEntries for a given suffix set and returns
|
|
// the number of entries for the first node.
|
|
func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) {
|
|
if len(sis) == 0 {
|
|
return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
|
|
}
|
|
start := len(*ct)
|
|
// create entries for differing first bytes.
|
|
for _, si := range sis {
|
|
s := si.str
|
|
if len(s) == 0 {
|
|
continue
|
|
}
|
|
added := false
|
|
c := s[0]
|
|
if len(s) > 1 {
|
|
for j := len(*ct) - 1; j >= start; j-- {
|
|
if (*ct)[j].L == c {
|
|
added = true
|
|
break
|
|
}
|
|
}
|
|
if !added {
|
|
*ct = append(*ct, ctEntry{L: c, I: noIndex})
|
|
}
|
|
} else {
|
|
for j := len(*ct) - 1; j >= start; j-- {
|
|
// Update the offset for longer suffixes with the same byte.
|
|
if (*ct)[j].L == c {
|
|
(*ct)[j].I = uint8(si.index)
|
|
added = true
|
|
}
|
|
// Extend range of final ctEntry, if possible.
|
|
if (*ct)[j].H+1 == c {
|
|
(*ct)[j].H = c
|
|
added = true
|
|
}
|
|
}
|
|
if !added {
|
|
*ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)})
|
|
}
|
|
}
|
|
}
|
|
n := len(*ct) - start
|
|
// Append nodes for the remainder of the suffixes for each ctEntry.
|
|
sp := 0
|
|
for i, end := start, len(*ct); i < end; i++ {
|
|
fe := (*ct)[i]
|
|
if fe.H == 0 { // uninitialized non-final
|
|
ln := len(*ct) - start - n
|
|
if ln > 0xFF {
|
|
return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
|
|
}
|
|
fe.H = uint8(ln)
|
|
// Find first non-final strings with same byte as current entry.
|
|
for ; sis[sp].str[0] != fe.L; sp++ {
|
|
}
|
|
se := sp + 1
|
|
for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ {
|
|
}
|
|
sl := sis[sp:se]
|
|
sp = se
|
|
for i, si := range sl {
|
|
sl[i].str = si.str[1:]
|
|
}
|
|
nn, err := genStates(ct, sl)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
fe.N = uint8(nn)
|
|
(*ct)[i] = fe
|
|
}
|
|
}
|
|
sort.Sort(entrySort((*ct)[start : start+n]))
|
|
return n, nil
|
|
}
|
|
|
|
// There may be both a final and non-final entry for a byte if the byte
|
|
// is implied in a range of matches in the final entry.
|
|
// We need to ensure that the non-final entry comes first in that case.
|
|
type entrySort colltab.ContractTrieSet
|
|
|
|
func (fe entrySort) Len() int { return len(fe) }
|
|
func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] }
|
|
func (fe entrySort) Less(i, j int) bool {
|
|
return fe[i].L > fe[j].L
|
|
}
|
|
|
|
// stridx is used for sorting suffixes and their associated offsets.
type stridx struct {
	// str is the suffix, or what is left of it: genStates strips leading
	// bytes in place as it descends into the trie.
	str string
	// index is the result offset associated with the suffix; appendTrie
	// assigns it as the 1-based position in offsetSort order.
	index int
}
|
|
|
|
// For computing the offsets, we first sort by size, and then by string.
|
|
// This ensures that strings that only differ in the last byte by 1
|
|
// are sorted consecutively in increasing order such that they can
|
|
// be packed as a range in a final ctEntry.
|
|
type offsetSort []stridx
|
|
|
|
func (si offsetSort) Len() int { return len(si) }
|
|
func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
|
|
func (si offsetSort) Less(i, j int) bool {
|
|
if len(si[i].str) != len(si[j].str) {
|
|
return len(si[i].str) > len(si[j].str)
|
|
}
|
|
return si[i].str < si[j].str
|
|
}
|
|
|
|
// For indexing, we want to ensure that strings are sorted in string order, where
|
|
// for strings with the same prefix, we put longer strings before shorter ones.
|
|
type genidxSort []stridx
|
|
|
|
func (si genidxSort) Len() int { return len(si) }
|
|
func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
|
|
func (si genidxSort) Less(i, j int) bool {
|
|
if strings.HasPrefix(si[j].str, si[i].str) {
|
|
return false
|
|
}
|
|
if strings.HasPrefix(si[i].str, si[j].str) {
|
|
return true
|
|
}
|
|
return si[i].str < si[j].str
|
|
}
|
|
|
|
// lookup matches the longest suffix in str and returns the associated offset
|
|
// and the number of bytes consumed.
|
|
func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) {
|
|
states := (*ct)[h.index:]
|
|
p := 0
|
|
n := h.n
|
|
for i := 0; i < n && p < len(str); {
|
|
e := states[i]
|
|
c := str[p]
|
|
if c >= e.L {
|
|
if e.L == c {
|
|
p++
|
|
if e.I != noIndex {
|
|
index, ns = int(e.I), p
|
|
}
|
|
if e.N != final {
|
|
// set to new state
|
|
i, states, n = 0, states[int(e.H)+n:], int(e.N)
|
|
} else {
|
|
return
|
|
}
|
|
continue
|
|
} else if e.N == final && c <= e.H {
|
|
p++
|
|
return int(c-e.L) + int(e.I), p
|
|
}
|
|
}
|
|
i++
|
|
}
|
|
return
|
|
}
|
|
|
|
// print writes the contractTrieSet t as compilable Go code to w. It returns
|
|
// the total number of bytes written and the size of the resulting data structure in bytes.
|
|
func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
|
|
update3 := func(nn, sz int, e error) {
|
|
n += nn
|
|
if err == nil {
|
|
err = e
|
|
}
|
|
size += sz
|
|
}
|
|
update2 := func(nn int, e error) { update3(nn, 0, e) }
|
|
|
|
update3(printArray(*t, w, name))
|
|
update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name))
|
|
update3(printStruct(*t, w, name))
|
|
update2(fmt.Fprintln(w))
|
|
return
|
|
}
|
|
|
|
func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
|
|
p := func(f string, a ...interface{}) {
|
|
nn, e := fmt.Fprintf(w, f, a...)
|
|
n += nn
|
|
if err == nil {
|
|
err = e
|
|
}
|
|
}
|
|
size = len(ct) * 4
|
|
p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
|
|
p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct))
|
|
for _, fe := range ct {
|
|
p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I)
|
|
}
|
|
p("}\n")
|
|
return
|
|
}
|
|
|
|
func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
|
|
n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name)
|
|
size = int(reflect.TypeOf(ct).Size())
|
|
return
|
|
}
|