mirror of https://github.com/golang/go.git
exp/locale/collate: Added Builder type for generating a complete
collation table. At this moment, it only implements the generation of a root table. R=r CC=golang-dev https://golang.org/cl/6039047
This commit is contained in:
parent
52f0afe0db
commit
fdce27f7b8
|
|
@ -0,0 +1,501 @@
|
|||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package build
|
||||
|
||||
import (
|
||||
"exp/locale/collate"
|
||||
"exp/norm"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// TODO: optimizations:
|
||||
// - expandElem is currently 20K. By putting unique colElems in a separate
|
||||
// table and having a byte array of indexes into this table, we can reduce
|
||||
// the total size to about 7K. By also factoring out the length bytes, we
|
||||
// can reduce this to about 6K.
|
||||
// - trie valueBlocks are currently 100K. There are a lot of sparse blocks
|
||||
// and many consecutive values with the same stride. This can be further
|
||||
// compacted.
|
||||
|
||||
// entry is used to keep track of a single entry in the collation element table
|
||||
// during building. Examples of entries can be found in the Default Unicode
|
||||
// Collation Element Table.
|
||||
// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
|
||||
type entry struct {
|
||||
runes []rune
|
||||
elems [][]int // the collation elements for runes
|
||||
str string // same as string(runes)
|
||||
|
||||
decompose bool // can use NFKD decomposition to generate elems
|
||||
expansionIndex int // used to store index into expansion table
|
||||
contractionHandle ctHandle
|
||||
contractionIndex int // index into contraction elements
|
||||
}
|
||||
|
||||
func (e *entry) String() string {
|
||||
return fmt.Sprintf("%X -> %X (ch:%x; ci:%d, ei:%d)",
|
||||
e.runes, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex)
|
||||
}
|
||||
|
||||
func (e *entry) skip() bool {
|
||||
return e.contraction()
|
||||
}
|
||||
|
||||
func (e *entry) expansion() bool {
|
||||
return !e.decompose && len(e.elems) > 1
|
||||
}
|
||||
|
||||
func (e *entry) contraction() bool {
|
||||
return len(e.runes) > 1
|
||||
}
|
||||
|
||||
func (e *entry) contractionStarter() bool {
|
||||
return e.contractionHandle.n != 0
|
||||
}
|
||||
|
||||
// A Builder builds collation tables. It can generate both the root table and
|
||||
// locale-specific tables defined as tailorings to the root table.
|
||||
// The typical use case is to specify the data for the root table and all locale-specific
|
||||
// tables using Add and AddTailoring before making any call to Build. This allows
|
||||
// Builder to ensure that a root table can support tailorings for each locale.
|
||||
type Builder struct {
|
||||
entryMap map[string]*entry
|
||||
entry []*entry
|
||||
t *table
|
||||
err error
|
||||
}
|
||||
|
||||
// NewBuilder returns a new Builder.
|
||||
func NewBuilder() *Builder {
|
||||
b := &Builder{
|
||||
entryMap: make(map[string]*entry),
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// Add adds an entry for the root collation element table, mapping
|
||||
// a slice of runes to a sequence of collation elements.
|
||||
// A collation element is specified as list of weights: []int{primary, secondary, ...}.
|
||||
// The entries are typically obtained from a collation element table
|
||||
// as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format.
|
||||
// Note that the collation elements specified by colelems are only used
|
||||
// as a guide. The actual weights generated by Builder may differ.
|
||||
func (b *Builder) Add(str []rune, colelems [][]int) error {
|
||||
e := &entry{
|
||||
runes: make([]rune, len(str)),
|
||||
elems: make([][]int, len(colelems)),
|
||||
str: string(str),
|
||||
}
|
||||
copy(e.runes, str)
|
||||
for i, ce := range colelems {
|
||||
e.elems[i] = append(e.elems[i], ce...)
|
||||
if len(ce) == 0 {
|
||||
e.elems[i] = append(e.elems[i], []int{0, 0, 0, 0}...)
|
||||
break
|
||||
}
|
||||
if len(ce) == 1 {
|
||||
e.elems[i] = append(e.elems[i], defaultSecondary)
|
||||
}
|
||||
if len(ce) <= 2 {
|
||||
e.elems[i] = append(e.elems[i], defaultTertiary)
|
||||
}
|
||||
if len(ce) <= 3 {
|
||||
e.elems[i] = append(e.elems[i], ce[0])
|
||||
}
|
||||
}
|
||||
b.entryMap[string(str)] = e
|
||||
b.entry = append(b.entry, e)
|
||||
return nil
|
||||
}
|
||||
|
||||
// AddTailoring defines a tailoring x <_level y for the given locale.
|
||||
// For example, AddTailoring("se", "z", "ä", Primary) sorts "ä" after "z"
|
||||
// at the primary level for Swedish. AddTailoring("de", "ue", "ü", Secondary)
|
||||
// sorts "ü" after "ue" at the secondary level for German.
|
||||
// See http://www.unicode.org/reports/tr10/#Tailoring_Example for details
|
||||
// on parametric tailoring.
|
||||
func (b *Builder) AddTailoring(locale, x, y string, l collate.Level) error {
|
||||
// TODO: implement.
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *Builder) baseColElem(e *entry) uint32 {
|
||||
ce := uint32(0)
|
||||
var err error
|
||||
switch {
|
||||
case e.expansion():
|
||||
ce, err = makeExpandIndex(e.expansionIndex)
|
||||
default:
|
||||
if e.decompose {
|
||||
log.Fatal("decompose should be handled elsewhere")
|
||||
}
|
||||
ce, err = makeCE(e.elems[0])
|
||||
}
|
||||
if err != nil {
|
||||
b.error(fmt.Errorf("%s: %X -> %X", err, e.runes, e.elems))
|
||||
}
|
||||
return ce
|
||||
}
|
||||
|
||||
func (b *Builder) colElem(e *entry) uint32 {
|
||||
if e.skip() {
|
||||
log.Fatal("cannot build colElem for entry that should be skipped")
|
||||
}
|
||||
ce := uint32(0)
|
||||
var err error
|
||||
switch {
|
||||
case e.decompose:
|
||||
t1 := e.elems[0][2]
|
||||
t2 := 0
|
||||
if len(e.elems) > 1 {
|
||||
t2 = e.elems[1][2]
|
||||
}
|
||||
ce, err = makeDecompose(t1, t2)
|
||||
case e.contractionStarter():
|
||||
ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex)
|
||||
default:
|
||||
if len(e.runes) > 1 {
|
||||
log.Fatal("colElem: contractions are handled in contraction trie")
|
||||
}
|
||||
ce = b.baseColElem(e)
|
||||
}
|
||||
if err != nil {
|
||||
b.error(err)
|
||||
}
|
||||
return ce
|
||||
}
|
||||
|
||||
func (b *Builder) error(e error) {
|
||||
if e != nil {
|
||||
b.err = e
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Builder) build() (*table, error) {
|
||||
b.t = &table{}
|
||||
|
||||
b.contractCJK()
|
||||
b.simplify() // requires contractCJK
|
||||
b.processExpansions() // requires simplify
|
||||
b.processContractions() // requires simplify
|
||||
b.buildTrie() // requires process*
|
||||
|
||||
if b.err != nil {
|
||||
return nil, b.err
|
||||
}
|
||||
return b.t, nil
|
||||
}
|
||||
|
||||
// Build builds a Collator for the given locale. To build the root table, set locale to "".
|
||||
func (b *Builder) Build(locale string) (*collate.Collator, error) {
|
||||
t, err := b.build()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// TODO: support multiple locales
|
||||
return collate.Init(t), nil
|
||||
}
|
||||
|
||||
// Print prints all tables to a Go file that can be included in
|
||||
// the Collate package.
|
||||
func (b *Builder) Print(w io.Writer) (int, error) {
|
||||
t, err := b.build()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// TODO: support multiple locales
|
||||
n, _, err := t.print(w, "root")
|
||||
return n, err
|
||||
}
|
||||
|
||||
// reproducibleFromNFKD checks whether the given expansion could be generated
|
||||
// from an NFKD expansion.
|
||||
func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool {
|
||||
// Length must be equal.
|
||||
if len(exp) != len(nfkd) {
|
||||
return false
|
||||
}
|
||||
for i, ce := range exp {
|
||||
// Primary and secondary values should be equal.
|
||||
if ce[0] != nfkd[i][0] || ce[1] != nfkd[i][1] {
|
||||
return false
|
||||
}
|
||||
// Tertiary values should be equal to maxTertiary for third element onwards.
|
||||
if i >= 2 && ce[2] != maxTertiary {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func equalCE(a, b []int) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i := 0; i < 3; i++ {
|
||||
if b[i] != a[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func equalCEArrays(a, b [][]int) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
for i := range a {
|
||||
if !equalCE(a[i], b[i]) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// genColElems generates a collation element array from the runes in str. This
|
||||
// assumes that all collation elements have already been added to the Builder.
|
||||
func (b *Builder) genColElems(str string) [][]int {
|
||||
elems := [][]int{}
|
||||
for _, r := range []rune(str) {
|
||||
if ee, ok := b.entryMap[string(r)]; !ok {
|
||||
elem := []int{implicitPrimary(r), defaultSecondary, defaultTertiary, int(r)}
|
||||
elems = append(elems, elem)
|
||||
} else {
|
||||
elems = append(elems, ee.elems...)
|
||||
}
|
||||
}
|
||||
return elems
|
||||
}
|
||||
|
||||
func (b *Builder) simplify() {
|
||||
// Runes that are a starter of a contraction should not be removed.
|
||||
// (To date, there is only Kannada character 0CCA.)
|
||||
keep := make(map[rune]bool)
|
||||
for _, e := range b.entry {
|
||||
if len(e.runes) > 1 {
|
||||
keep[e.runes[0]] = true
|
||||
}
|
||||
}
|
||||
// Remove entries for which the runes normalize (using NFD) to identical values.
|
||||
for _, e := range b.entry {
|
||||
s := e.str
|
||||
nfd := norm.NFD.String(s)
|
||||
if len(e.runes) > 1 || keep[e.runes[0]] || nfd == s {
|
||||
continue
|
||||
}
|
||||
if equalCEArrays(b.genColElems(nfd), e.elems) {
|
||||
delete(b.entryMap, s)
|
||||
}
|
||||
}
|
||||
// Remove entries in b.entry that were removed from b.entryMap
|
||||
k := 0
|
||||
for _, e := range b.entry {
|
||||
if _, ok := b.entryMap[e.str]; ok {
|
||||
b.entry[k] = e
|
||||
k++
|
||||
}
|
||||
}
|
||||
b.entry = b.entry[:k]
|
||||
// Tag entries for which the runes NFKD decompose to identical values.
|
||||
for _, e := range b.entry {
|
||||
s := e.str
|
||||
nfkd := norm.NFKD.String(s)
|
||||
if len(e.runes) > 1 || keep[e.runes[0]] || nfkd == s {
|
||||
continue
|
||||
}
|
||||
if reproducibleFromNFKD(e, e.elems, b.genColElems(nfkd)) {
|
||||
e.decompose = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// convertLargeWeights converts collation elements with large
|
||||
// primaries (either double primaries or for illegal runes)
|
||||
// to our own representation.
|
||||
// See http://unicode.org/reports/tr10/#Implicit_Weights
|
||||
func convertLargeWeights(elems [][]int) (res [][]int, err error) {
|
||||
const (
|
||||
firstLargePrimary = 0xFB40
|
||||
illegalPrimary = 0xFFFE
|
||||
highBitsMask = 0x3F
|
||||
lowBitsMask = 0x7FFF
|
||||
lowBitsFlag = 0x8000
|
||||
shiftBits = 15
|
||||
)
|
||||
for i := 0; i < len(elems); i++ {
|
||||
ce := elems[i]
|
||||
p := ce[0]
|
||||
if p < firstLargePrimary {
|
||||
continue
|
||||
}
|
||||
if p >= illegalPrimary {
|
||||
ce[0] = illegalOffset + p - illegalPrimary
|
||||
} else {
|
||||
if i+1 >= len(elems) {
|
||||
return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
|
||||
}
|
||||
if elems[i+1][0]&lowBitsFlag == 0 {
|
||||
return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
|
||||
}
|
||||
r := rune(((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask)
|
||||
ce[0] = implicitPrimary(r)
|
||||
for j := i + 1; j+1 < len(elems); j++ {
|
||||
elems[j] = elems[j+1]
|
||||
}
|
||||
elems = elems[:len(elems)-1]
|
||||
}
|
||||
}
|
||||
return elems, nil
|
||||
}
|
||||
|
||||
// A CJK character C is represented in the DUCET as
|
||||
// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
|
||||
// We will rewrite these characters to a single CE.
|
||||
// We assume the CJK values start at 0x8000.
|
||||
func (b *Builder) contractCJK() {
|
||||
for _, e := range b.entry {
|
||||
elms, err := convertLargeWeights(e.elems)
|
||||
e.elems = elms
|
||||
if err != nil {
|
||||
err = fmt.Errorf("%U: %s", e.runes, err)
|
||||
}
|
||||
b.error(err)
|
||||
}
|
||||
}
|
||||
|
||||
// appendExpansion converts the given collation sequence to
|
||||
// collation elements and adds them to the expansion table.
|
||||
// It returns an index to the expansion table.
|
||||
func (b *Builder) appendExpansion(e *entry) int {
|
||||
t := b.t
|
||||
i := len(t.expandElem)
|
||||
ce := uint32(len(e.elems))
|
||||
t.expandElem = append(t.expandElem, ce)
|
||||
for _, w := range e.elems {
|
||||
ce, err := makeCE(w)
|
||||
if err != nil {
|
||||
b.error(err)
|
||||
return -1
|
||||
}
|
||||
t.expandElem = append(t.expandElem, ce)
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// processExpansions extracts data necessary to generate
|
||||
// the extraction tables.
|
||||
func (b *Builder) processExpansions() {
|
||||
eidx := make(map[string]int)
|
||||
for _, e := range b.entry {
|
||||
if !e.expansion() {
|
||||
continue
|
||||
}
|
||||
key := fmt.Sprintf("%v", e.elems)
|
||||
i, ok := eidx[key]
|
||||
if !ok {
|
||||
i = b.appendExpansion(e)
|
||||
eidx[key] = i
|
||||
}
|
||||
e.expansionIndex = i
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Builder) processContractions() {
|
||||
// Collate contractions per starter rune.
|
||||
starters := []rune{}
|
||||
cm := make(map[rune][]*entry)
|
||||
for _, e := range b.entry {
|
||||
if e.contraction() {
|
||||
r := e.runes[0]
|
||||
if _, ok := cm[r]; !ok {
|
||||
starters = append(starters, r)
|
||||
}
|
||||
cm[r] = append(cm[r], e)
|
||||
}
|
||||
}
|
||||
// Add entries of single runes that are at a start of a contraction.
|
||||
for _, e := range b.entry {
|
||||
if !e.contraction() {
|
||||
r := e.runes[0]
|
||||
if _, ok := cm[r]; ok {
|
||||
cm[r] = append(cm[r], e)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Build the tries for the contractions.
|
||||
t := b.t
|
||||
handlemap := make(map[string]ctHandle)
|
||||
for _, r := range starters {
|
||||
l := cm[r]
|
||||
// Compute suffix strings. There are 31 different contraction suffix
|
||||
// sets for 715 contractions and 82 contraction starter runes as of
|
||||
// version 6.0.0.
|
||||
sufx := []string{}
|
||||
hasSingle := false
|
||||
for _, e := range l {
|
||||
if len(e.runes) > 1 {
|
||||
sufx = append(sufx, string(e.runes[1:]))
|
||||
} else {
|
||||
hasSingle = true
|
||||
}
|
||||
}
|
||||
if !hasSingle {
|
||||
b.error(fmt.Errorf("no single entry for starter rune %U found", r))
|
||||
continue
|
||||
}
|
||||
// Unique the suffix set.
|
||||
sort.Strings(sufx)
|
||||
key := strings.Join(sufx, "\n")
|
||||
handle, ok := handlemap[key]
|
||||
if !ok {
|
||||
var err error
|
||||
handle, err = t.contractTries.appendTrie(sufx)
|
||||
if err != nil {
|
||||
b.error(err)
|
||||
}
|
||||
handlemap[key] = handle
|
||||
}
|
||||
// Bucket sort entries in index order.
|
||||
es := make([]*entry, len(l))
|
||||
for _, e := range l {
|
||||
var o, sn int
|
||||
if len(e.runes) > 1 {
|
||||
str := []byte(string(e.runes[1:]))
|
||||
o, sn = t.contractTries.lookup(handle, str)
|
||||
if sn != len(str) {
|
||||
log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", []rune(string(str)), sn, len(str))
|
||||
}
|
||||
}
|
||||
if es[o] != nil {
|
||||
log.Fatalf("Multiple contractions for position %d for rune %U", o, e.runes[0])
|
||||
}
|
||||
es[o] = e
|
||||
}
|
||||
// Store info in entry for starter rune.
|
||||
es[0].contractionIndex = len(t.contractElem)
|
||||
es[0].contractionHandle = handle
|
||||
// Add collation elements for contractions.
|
||||
for _, e := range es {
|
||||
t.contractElem = append(t.contractElem, b.baseColElem(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Builder) buildTrie() {
|
||||
t := newNode()
|
||||
for _, e := range b.entry {
|
||||
if !e.skip() {
|
||||
ce := b.colElem(e)
|
||||
t.insert(e.runes[0], ce)
|
||||
}
|
||||
}
|
||||
i, err := t.generate()
|
||||
b.t.index = *i
|
||||
b.error(err)
|
||||
}
|
||||
|
|
@ -0,0 +1,265 @@
|
|||
// Copyright 2012 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package build
|
||||
|
||||
import "testing"
|
||||
|
||||
// cjk returns an implicit collation element for a CJK rune.
|
||||
func cjk(r rune) [][]int {
|
||||
// A CJK character C is represented in the DUCET as
|
||||
// [.AAAA.0020.0002.C][.BBBB.0000.0000.C]
|
||||
// Where AAAA is the most significant 15 bits plus a base value.
|
||||
// Any base value will work for the test, so we pick the common value of FB40.
|
||||
const base = 0xFB40
|
||||
return [][]int{
|
||||
[]int{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)},
|
||||
[]int{int(r&0x7FFF) | 0x8000, 0, 0, int(r)},
|
||||
}
|
||||
}
|
||||
|
||||
func pCE(p int) [][]int {
|
||||
return [][]int{[]int{p, defaultSecondary, defaultTertiary, 0}}
|
||||
}
|
||||
|
||||
func pqCE(p, q int) [][]int {
|
||||
return [][]int{[]int{p, defaultSecondary, defaultTertiary, q}}
|
||||
}
|
||||
|
||||
func ptCE(p, t int) [][]int {
|
||||
return [][]int{[]int{p, defaultSecondary, t, 0}}
|
||||
}
|
||||
|
||||
func sCE(s int) [][]int {
|
||||
return [][]int{[]int{0, s, defaultTertiary, 0}}
|
||||
}
|
||||
|
||||
func stCE(s, t int) [][]int {
|
||||
return [][]int{[]int{0, s, t, 0}}
|
||||
}
|
||||
|
||||
// ducetElem is used to define test data that is used to generate a table.
|
||||
type ducetElem struct {
|
||||
str string
|
||||
ces [][]int
|
||||
}
|
||||
|
||||
func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
|
||||
b := NewBuilder()
|
||||
for _, e := range ducet {
|
||||
if err := b.Add([]rune(e.str), e.ces); err != nil {
|
||||
t.Errorf(err.Error())
|
||||
}
|
||||
}
|
||||
b.t = &table{}
|
||||
return b
|
||||
}
|
||||
|
||||
type convertTest struct {
|
||||
in, out [][]int
|
||||
err bool
|
||||
}
|
||||
|
||||
var convLargeTests = []convertTest{
|
||||
{pCE(0xFB39), pCE(0xFB39), false},
|
||||
{cjk(0x2F9B2), pqCE(0x7F4F2, 0x2F9B2), false},
|
||||
{pCE(0xFB40), pCE(0), true},
|
||||
{append(pCE(0xFB40), pCE(0)[0]), pCE(0), true},
|
||||
{pCE(0xFFFE), pCE(illegalOffset), false},
|
||||
{pCE(0xFFFF), pCE(illegalOffset + 1), false},
|
||||
}
|
||||
|
||||
func TestConvertLarge(t *testing.T) {
|
||||
for i, tt := range convLargeTests {
|
||||
e := &entry{elems: tt.in}
|
||||
elems, err := convertLargeWeights(e.elems)
|
||||
if tt.err {
|
||||
if err == nil {
|
||||
t.Errorf("%d: expected error; none found", i)
|
||||
}
|
||||
continue
|
||||
} else if err != nil {
|
||||
t.Errorf("%d: unexpected error: %v", i, err)
|
||||
}
|
||||
if !equalCEArrays(elems, tt.out) {
|
||||
t.Errorf("%d: conversion was %x; want %x", i, elems, tt.out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Collation element table for simplify tests.
|
||||
var simplifyTest = []ducetElem{
|
||||
{"\u0300", sCE(30)}, // grave
|
||||
{"\u030C", sCE(40)}, // caron
|
||||
{"A", ptCE(100, 8)},
|
||||
{"D", ptCE(104, 8)},
|
||||
{"E", ptCE(105, 8)},
|
||||
{"I", ptCE(110, 8)},
|
||||
{"z", ptCE(130, 8)},
|
||||
{"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0])},
|
||||
{"\u05B7", sCE(80)},
|
||||
{"\u00C0", append(ptCE(100, 8), sCE(30)...)}, // A with grave, can be removed
|
||||
{"\u00C8", append(ptCE(105, 8), sCE(30)...)}, // E with grave
|
||||
{"\uFB1F", append(ptCE(200, 4), ptCE(200, 4)[0], sCE(80)[0])}, // eliminated by NFD
|
||||
{"\u00C8\u0302", ptCE(106, 8)}, // block previous from simplifying
|
||||
{"\u01C5", append(ptCE(104, 9), ptCE(130, 4)[0], stCE(40, maxTertiary)[0])}, // eliminated by NFKD
|
||||
// no removal: tertiary value of third element is not maxTertiary
|
||||
{"\u2162", append(ptCE(110, 9), ptCE(110, 4)[0], ptCE(110, 8)[0])},
|
||||
}
|
||||
|
||||
var genColTests = []ducetElem{
|
||||
{"\uFA70", pqCE(0x1F5B0, 0xFA70)},
|
||||
{"A\u0300", append(ptCE(100, 8), sCE(30)...)},
|
||||
{"A\u0300\uFA70", append(ptCE(100, 8), sCE(30)[0], pqCE(0x1F5B0, 0xFA70)[0])},
|
||||
{"A\u0300A\u0300", append(ptCE(100, 8), sCE(30)[0], ptCE(100, 8)[0], sCE(30)[0])},
|
||||
}
|
||||
|
||||
func TestGenColElems(t *testing.T) {
|
||||
b := newBuilder(t, simplifyTest[:5])
|
||||
|
||||
for i, tt := range genColTests {
|
||||
res := b.genColElems(tt.str)
|
||||
if !equalCEArrays(tt.ces, res) {
|
||||
t.Errorf("%d: result %X; want %X", i, res, tt.ces)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type strArray []string
|
||||
|
||||
func (sa strArray) contains(s string) bool {
|
||||
for _, e := range sa {
|
||||
if e == s {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
var simplifyRemoved = strArray{"\u00C0", "\uFB1F"}
|
||||
var simplifyMarked = strArray{"\u01C5"}
|
||||
|
||||
func TestSimplify(t *testing.T) {
|
||||
b := newBuilder(t, simplifyTest)
|
||||
b.simplify()
|
||||
|
||||
k := 0
|
||||
for i, tt := range simplifyTest {
|
||||
if simplifyRemoved.contains(tt.str) {
|
||||
continue
|
||||
}
|
||||
e := b.entry[k]
|
||||
k++
|
||||
if e.str != tt.str || !equalCEArrays(e.elems, tt.ces) {
|
||||
t.Errorf("%d: found element %s -> %X; want %s -> %X", i, e.str, e.elems, tt.str, tt.ces)
|
||||
break
|
||||
}
|
||||
}
|
||||
k = 0
|
||||
for i, e := range b.entry {
|
||||
gold := simplifyMarked.contains(e.str)
|
||||
if gold {
|
||||
k++
|
||||
}
|
||||
if gold != e.decompose {
|
||||
t.Errorf("%d: %s has decompose %v; want %v", i, e.str, e.decompose, gold)
|
||||
}
|
||||
}
|
||||
if k != len(simplifyMarked) {
|
||||
t.Errorf(" an entry that should be marked as decompose was deleted")
|
||||
}
|
||||
}
|
||||
|
||||
var expandTest = []ducetElem{
|
||||
{"\u00C0", append(ptCE(100, 8), sCE(30)...)},
|
||||
{"\u00C8", append(ptCE(105, 8), sCE(30)...)},
|
||||
{"\u00C9", append(ptCE(105, 8), sCE(30)...)}, // identical expansion
|
||||
{"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0], ptCE(200, 4)[0])},
|
||||
}
|
||||
|
||||
func TestExpand(t *testing.T) {
|
||||
const (
|
||||
totalExpansions = 3
|
||||
totalElements = 2 + 2 + 3 + totalExpansions
|
||||
)
|
||||
b := newBuilder(t, expandTest)
|
||||
b.processExpansions()
|
||||
|
||||
for i, tt := range expandTest {
|
||||
e := b.entry[i]
|
||||
exp := b.t.expandElem[e.expansionIndex:]
|
||||
if int(exp[0]) != len(tt.ces) {
|
||||
t.Errorf("%U: len(expansion)==%d; want %d", []rune(tt.str)[0], exp[0], len(tt.ces))
|
||||
}
|
||||
exp = exp[1:]
|
||||
for j, w := range tt.ces {
|
||||
if ce, _ := makeCE(w); exp[j] != ce {
|
||||
t.Errorf("%U: element %d is %X; want %X", []rune(tt.str)[0], j, exp[j], ce)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Verify uniquing.
|
||||
if len(b.t.expandElem) != totalElements {
|
||||
t.Errorf("len(expandElem)==%d; want %d", len(b.t.expandElem), totalElements)
|
||||
}
|
||||
}
|
||||
|
||||
var contractTest = []ducetElem{
|
||||
{"abc", pCE(102)},
|
||||
{"abd", pCE(103)},
|
||||
{"a", pCE(100)},
|
||||
{"ab", pCE(101)},
|
||||
{"ac", pCE(104)},
|
||||
{"bcd", pCE(202)},
|
||||
{"b", pCE(200)},
|
||||
{"bc", pCE(201)},
|
||||
{"bd", pCE(203)},
|
||||
// shares suffixes with a*
|
||||
{"Ab", pCE(301)},
|
||||
{"A", pCE(300)},
|
||||
{"Ac", pCE(304)},
|
||||
{"Abc", pCE(302)},
|
||||
{"Abd", pCE(303)},
|
||||
// starter to be ignored
|
||||
{"z", pCE(1000)},
|
||||
}
|
||||
|
||||
func TestContract(t *testing.T) {
|
||||
const (
|
||||
totalElements = 5 + 5 + 4
|
||||
)
|
||||
b := newBuilder(t, contractTest)
|
||||
b.processContractions()
|
||||
|
||||
indexMap := make(map[int]bool)
|
||||
handleMap := make(map[rune]*entry)
|
||||
for _, e := range b.entry {
|
||||
if e.contractionHandle.n > 0 {
|
||||
handleMap[e.runes[0]] = e
|
||||
indexMap[e.contractionHandle.index] = true
|
||||
}
|
||||
}
|
||||
// Verify uniquing.
|
||||
if len(indexMap) != 2 {
|
||||
t.Errorf("number of tries is %d; want %d", len(indexMap), 2)
|
||||
}
|
||||
for _, tt := range contractTest {
|
||||
e, ok := handleMap[[]rune(tt.str)[0]]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
str := tt.str[1:]
|
||||
offset, n := b.t.contractTries.lookup(e.contractionHandle, []byte(str))
|
||||
if len(str) != n {
|
||||
t.Errorf("%s: bytes consumed==%d; want %d", tt.str, n, len(str))
|
||||
}
|
||||
ce := b.t.contractElem[offset+e.contractionIndex]
|
||||
if want, _ := makeCE(tt.ces[0]); want != ce {
|
||||
t.Errorf("%s: element %X; want %X", tt.str, ce, want)
|
||||
}
|
||||
}
|
||||
if len(b.t.contractElem) != totalElements {
|
||||
t.Errorf("len(expandElem)==%d; want %d", len(b.t.contractElem), totalElements)
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue