295 lines
8.5 KiB
Go
295 lines
8.5 KiB
Go
|
// Copyright 2012 The Go Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
package build
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"unicode"
|
||
|
|
||
|
"golang.org/x/text/internal/colltab"
|
||
|
)
|
||
|
|
||
|
// Weight defaults shared by the builder.
//
// defaultSecondary and defaultTertiary are the values nextWeight stores in
// the secondary and tertiary slots when it bumps a weight at a more
// significant level. maxTertiary is the tertiary weight that replaces the
// tertiary weights of the third and later runes of a compatibility
// decomposition (see makeDecompose).
const (
	defaultSecondary = 0x20
	defaultTertiary  = 0x2
	maxTertiary      = 0x1F
)
|
||
|
|
||
|
// rawCE is the builder's unpacked form of a collation element.
//
// w holds the weights, indexed by collation level (w[0] is the primary
// weight). ccc is handed to colltab.MakeElem unchanged when the element is
// packed by makeCE.
type rawCE struct {
	w   []int
	ccc uint8
}

// makeRawCE returns a rawCE holding a private copy of w.
//
// The weight slice of the result always has length 4: missing weights are
// zero and weights beyond the fourth are dropped. The result never aliases w.
func makeRawCE(w []int, ccc uint8) rawCE {
	weights := make([]int, 4)
	copy(weights, w)
	return rawCE{w: weights, ccc: ccc}
}
|
||
|
|
||
|
// A collation element is stored as a uint32.
// Most runes map to exactly one collation element. A rune that can start a
// contraction, or that expands into several collation elements, is instead
// associated with a specially formed element that encodes the m to n
// mapping. Such special collation elements have a value >= 0x80000000.

// Bit widths for the primary, secondary and tertiary weights.
// NOTE(review): nothing in this part of the file reads these constants;
// presumably they mirror the packing done by colltab.MakeElem. Confirm
// before relying on them.
const (
	maxPrimaryBits   = 21
	maxSecondaryBits = 12
	maxTertiaryBits  = 8
)
|
||
|
|
||
|
func makeCE(ce rawCE) (uint32, error) {
|
||
|
v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
|
||
|
return uint32(v), e
|
||
|
}
|
||
|
|
||
|
// For contractions, collation elements are of the form
|
||
|
// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
|
||
|
// - n* is the size of the first node in the contraction trie.
|
||
|
// - i* is the index of the first node in the contraction trie.
|
||
|
// - b* is the offset into the contraction collation element table.
|
||
|
// See contract.go for details on the contraction trie.
|
||
|
const (
|
||
|
contractID = 0xC0000000
|
||
|
maxNBits = 4
|
||
|
maxTrieIndexBits = 12
|
||
|
maxContractOffsetBits = 13
|
||
|
)
|
||
|
|
||
|
func makeContractIndex(h ctHandle, offset int) (uint32, error) {
|
||
|
if h.n >= 1<<maxNBits {
|
||
|
return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
|
||
|
}
|
||
|
if h.index >= 1<<maxTrieIndexBits {
|
||
|
return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
|
||
|
}
|
||
|
if offset >= 1<<maxContractOffsetBits {
|
||
|
return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
|
||
|
}
|
||
|
ce := uint32(contractID)
|
||
|
ce += uint32(offset << (maxNBits + maxTrieIndexBits))
|
||
|
ce += uint32(h.index << maxNBits)
|
||
|
ce += uint32(h.n)
|
||
|
return ce, nil
|
||
|
}
|
||
|
|
||
|
// For expansions, collation elements are of the form
//
//	11100000 00000000 bbbbbbbb bbbbbbbb,
//
// where b* is the index into the expansion sequence table.
const (
	expandID           = 0xE0000000
	maxExpandIndexBits = 16
)

// makeExpandIndex returns the expansion collation element that refers to
// position index in the expansion sequence table.
//
// It returns an error if index is negative or does not fit in
// maxExpandIndexBits bits. A negative index used to slip through the range
// check and wrap around when converted to uint32, producing an element with
// an arbitrary tag and index.
func makeExpandIndex(index int) (uint32, error) {
	if index < 0 {
		return 0, fmt.Errorf("expansion index out of bounds: %d < 0", index)
	}
	if index >= 1<<maxExpandIndexBits {
		return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
	}
	return expandID + uint32(index), nil
}
|
||
|
|
||
|
// makeExpansionHeader returns the header element that precedes each list of
// collation elements belonging to an expansion. The header is simply the
// length n of the sequence converted to uint32.
//
// n is not validated and the returned error is always nil.
func makeExpansionHeader(n int) (uint32, error) {
	header := uint32(n)
	return header, nil
}
|
||
|
|
||
|
// Some runes can be expanded using NFKD decomposition. Rather than storing
// the full sequence of collation elements for such a rune, we decompose it,
// look up the collation elements of each rune in the decomposition and
// modify their tertiary weights.
// The collation element, in this case, is of the form
//
//	11110000 00000000 wwwwwwww vvvvvvvv, where
//	  - v* is the replacement tertiary weight for the first rune,
//	  - w* is the replacement tertiary weight for the second rune,
//
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
const (
	decompID = 0xF0000000
)

// makeDecompose builds a decomposition element from the replacement tertiary
// weights t1 (first rune) and t2 (second rune).
//
// Both weights must lie in [0, 256); otherwise an error is returned. t1 is
// checked before t2.
func makeDecompose(t1, t2 int) (uint32, error) {
	switch {
	case t1 < 0 || t1 >= 256:
		return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
	case t2 < 0 || t2 >= 256:
		return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
	}
	// t2 goes in bits 8-15, t1 in bits 0-7.
	weights := uint32(t2<<8 + t1)
	return decompID + weights, nil
}
|
||
|
|
||
|
// Rune ranges used to classify ideographs when deriving implicit primary
// weights.
const (
	// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
	minUnified       rune = 0x4E00
	maxUnified            = 0x9FFF
	minCompatibility      = 0xF900
	maxCompatibility      = 0xFAFF
	minRare               = 0x3400
	maxRare               = 0x4DBF
)

// Offsets added to a rune to obtain its implicit primary weight. Each class
// of runes gets its own offset; the offsets are spaced far enough apart that
// the classes cannot overlap.
const (
	commonUnifiedOffset = 0x10000
	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
	// illegalOffset is the primary weight just past the largest weight an
	// ordinary rune can get from otherOffset.
	illegalOffset = otherOffset + int(unicode.MaxRune)
	maxPrimary    = illegalOffset + 1
)
|
||
|
|
||
|
// implicitPrimary returns the primary weight for the a rune
|
||
|
// for which there is no entry for the rune in the collation table.
|
||
|
// We take a different approach from the one specified in
|
||
|
// http://unicode.org/reports/tr10/#Implicit_Weights,
|
||
|
// but preserve the resulting relative ordering of the runes.
|
||
|
func implicitPrimary(r rune) int {
|
||
|
if unicode.Is(unicode.Ideographic, r) {
|
||
|
if r >= minUnified && r <= maxUnified {
|
||
|
// The most common case for CJK.
|
||
|
return int(r) + commonUnifiedOffset
|
||
|
}
|
||
|
if r >= minCompatibility && r <= maxCompatibility {
|
||
|
// This will typically not hit. The DUCET explicitly specifies mappings
|
||
|
// for all characters that do not decompose.
|
||
|
return int(r) + commonUnifiedOffset
|
||
|
}
|
||
|
return int(r) + rareUnifiedOffset
|
||
|
}
|
||
|
return int(r) + otherOffset
|
||
|
}
|
||
|
|
||
|
// convertLargeWeights converts collation elements with large
|
||
|
// primaries (either double primaries or for illegal runes)
|
||
|
// to our own representation.
|
||
|
// A CJK character C is represented in the DUCET as
|
||
|
// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
|
||
|
// We will rewrite these characters to a single CE.
|
||
|
// We assume the CJK values start at 0x8000.
|
||
|
// See http://unicode.org/reports/tr10/#Implicit_Weights
|
||
|
func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
|
||
|
const (
|
||
|
cjkPrimaryStart = 0xFB40
|
||
|
rarePrimaryStart = 0xFB80
|
||
|
otherPrimaryStart = 0xFBC0
|
||
|
illegalPrimary = 0xFFFE
|
||
|
highBitsMask = 0x3F
|
||
|
lowBitsMask = 0x7FFF
|
||
|
lowBitsFlag = 0x8000
|
||
|
shiftBits = 15
|
||
|
)
|
||
|
for i := 0; i < len(elems); i++ {
|
||
|
ce := elems[i].w
|
||
|
p := ce[0]
|
||
|
if p < cjkPrimaryStart {
|
||
|
continue
|
||
|
}
|
||
|
if p > 0xFFFF {
|
||
|
return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
|
||
|
}
|
||
|
if p >= illegalPrimary {
|
||
|
ce[0] = illegalOffset + p - illegalPrimary
|
||
|
} else {
|
||
|
if i+1 >= len(elems) {
|
||
|
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
|
||
|
}
|
||
|
if elems[i+1].w[0]&lowBitsFlag == 0 {
|
||
|
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
|
||
|
}
|
||
|
np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
|
||
|
switch {
|
||
|
case p < rarePrimaryStart:
|
||
|
np += commonUnifiedOffset
|
||
|
case p < otherPrimaryStart:
|
||
|
np += rareUnifiedOffset
|
||
|
default:
|
||
|
p += otherOffset
|
||
|
}
|
||
|
ce[0] = np
|
||
|
for j := i + 1; j+1 < len(elems); j++ {
|
||
|
elems[j] = elems[j+1]
|
||
|
}
|
||
|
elems = elems[:len(elems)-1]
|
||
|
}
|
||
|
}
|
||
|
return elems, nil
|
||
|
}
|
||
|
|
||
|
// nextWeight computes the first possible collation weights following elems
|
||
|
// for the given level.
|
||
|
func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
|
||
|
if level == colltab.Identity {
|
||
|
next := make([]rawCE, len(elems))
|
||
|
copy(next, elems)
|
||
|
return next
|
||
|
}
|
||
|
next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
|
||
|
next[0].w[level]++
|
||
|
if level < colltab.Secondary {
|
||
|
next[0].w[colltab.Secondary] = defaultSecondary
|
||
|
}
|
||
|
if level < colltab.Tertiary {
|
||
|
next[0].w[colltab.Tertiary] = defaultTertiary
|
||
|
}
|
||
|
// Filter entries that cannot influence ordering.
|
||
|
for _, ce := range elems[1:] {
|
||
|
skip := true
|
||
|
for i := colltab.Primary; i < level; i++ {
|
||
|
skip = skip && ce.w[i] == 0
|
||
|
}
|
||
|
if !skip {
|
||
|
next = append(next, ce)
|
||
|
}
|
||
|
}
|
||
|
return next
|
||
|
}
|
||
|
|
||
|
func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
|
||
|
for ; i < len(elems) && elems[i].w[level] == 0; i++ {
|
||
|
}
|
||
|
if i < len(elems) {
|
||
|
return i, elems[i].w[level]
|
||
|
}
|
||
|
return i, 0
|
||
|
}
|
||
|
|
||
|
// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
|
||
|
// It also returns the collation level at which the difference is found.
|
||
|
func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
|
||
|
for level := colltab.Primary; level < colltab.Identity; level++ {
|
||
|
var va, vb int
|
||
|
for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
|
||
|
ia, va = nextVal(a, ia, level)
|
||
|
ib, vb = nextVal(b, ib, level)
|
||
|
if va != vb {
|
||
|
if va < vb {
|
||
|
return -1, level
|
||
|
} else {
|
||
|
return 1, level
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return 0, colltab.Identity
|
||
|
}
|
||
|
|
||
|
func equalCE(a, b rawCE) bool {
|
||
|
for i := 0; i < 3; i++ {
|
||
|
if b.w[i] != a.w[i] {
|
||
|
return false
|
||
|
}
|
||
|
}
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
func equalCEArrays(a, b []rawCE) bool {
|
||
|
if len(a) != len(b) {
|
||
|
return false
|
||
|
}
|
||
|
for i := range a {
|
||
|
if !equalCE(a[i], b[i]) {
|
||
|
return false
|
||
|
}
|
||
|
}
|
||
|
return true
|
||
|
}
|