diff --git a/src/pkg/exp/locale/collate/build/builder.go b/src/pkg/exp/locale/collate/build/builder.go index 1528b9d0ce..fbb9c4483f 100644 --- a/src/pkg/exp/locale/collate/build/builder.go +++ b/src/pkg/exp/locale/collate/build/builder.go @@ -12,6 +12,7 @@ import ( "log" "sort" "strings" + "unicode/utf8" ) // TODO: optimizations: @@ -22,7 +23,11 @@ import ( // - trie valueBlocks are currently 100K. There are a lot of sparse blocks // and many consecutive values with the same stride. This can be further // compacted. -// - compress secondary weights into 8 bits. +// - Compress secondary weights into 8 bits. +// - Some LDML specs specify a context element. Currently we simply concatenate +// those. Context can be implemented using the contraction trie. If Builder +// could analyze and detect when using a context makes sense, there is no +// need to expose this construct in the API. // entry is used to keep track of a single entry in the collation element table // during building. Examples of entries can be found in the Default Unicode @@ -60,18 +65,30 @@ func (e *entry) contractionStarter() bool { return e.contractionHandle.n != 0 } -// A Builder builds collation tables. It can generate both the root table and -// locale-specific tables defined as tailorings to the root table. -// The typical use case is to specify the data for the root table and all locale-specific -// tables using Add and AddTailoring before making any call to Build. This allows -// Builder to ensure that a root table can support tailorings for each locale. +// A Builder builds a root collation table. The user must specify the +// collation elements for each entry. A common use will be to base the weights +// on those specified in the allkeys* file as provided by the UCA or CLDR. 
type Builder struct { index *trieBuilder + locale []*Tailoring entryMap map[string]*entry entry []*entry t *table err error built bool + + minNonVar int // lowest primary recorded for a non-variable + varTop int // highest primary recorded for a variable +} + +// A Tailoring builds a collation table based on another collation table. +// The table is defined by specifying tailorings to the underlying table. +// See http://unicode.org/reports/tr35/ for an overview of tailoring +// collation tables. The CLDR contains pre-defined tailorings for a variety +// of languages (See http://www.unicode.org/Public/cldr/2.0.1/core.zip.) +type Tailoring struct { + id string + // TODO: implement. } // NewBuilder returns a new Builder. @@ -83,14 +100,26 @@ func NewBuilder() *Builder { return b } -// Add adds an entry for the root collation element table, mapping +// Tailoring returns a Tailoring for the given locale. One should +// have completed all calls to Add before calling Tailoring. +func (b *Builder) Tailoring(locale string) *Tailoring { + t := &Tailoring{ + id: locale, + } + b.locale = append(b.locale, t) + return t +} + +// Add adds an entry to the collation element table, mapping // a slice of runes to a sequence of collation elements. // A collation element is specified as list of weights: []int{primary, secondary, ...}. // The entries are typically obtained from a collation element table // as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format. // Note that the collation elements specified by colelems are only used // as a guide. The actual weights generated by Builder may differ. -func (b *Builder) Add(str []rune, colelems [][]int) error { +// The argument variables is a list of indices into colelems that should contain +// a value for each colelem that is a variable. (See the reference above.) 
+func (b *Builder) Add(str []rune, colelems [][]int, variables []int) error { e := &entry{ runes: make([]rune, len(str)), elems: make([][]int, len(colelems)), @@ -113,6 +142,29 @@ func (b *Builder) Add(str []rune, colelems [][]int) error { e.elems[i] = append(e.elems[i], ce[0]) } } + for i, ce := range e.elems { + isvar := false + for _, j := range variables { + if i == j { + isvar = true + } + } + if isvar { + if ce[0] >= b.minNonVar && b.minNonVar > 0 { + return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", ce[0], b.minNonVar) + } + if ce[0] > b.varTop { + b.varTop = ce[0] + } + } else if ce[0] > 0 { + if ce[0] <= b.varTop { + return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", ce[0], b.varTop) + } + if b.minNonVar == 0 || ce[0] < b.minNonVar { + b.minNonVar = ce[0] + } + } + } elems, err := convertLargeWeights(e.elems) if err != nil { return err @@ -123,13 +175,57 @@ func (b *Builder) Add(str []rune, colelems [][]int) error { return nil } -// AddTailoring defines a tailoring x <_level y for the given locale. -// For example, AddTailoring("se", "z", "ä", Primary) sorts "ä" after "z" -// at the primary level for Swedish. AddTailoring("de", "ue", "ü", Secondary) -// sorts "ü" after "ue" at the secondary level for German. +// SetAnchor sets the point after which elements passed in subsequent calls to +// Insert will be inserted. It is equivalent to the reset directive in an LDML +// specification. See Insert for an example. +// SetAnchor supports the following logical reset positions: +// <first_tertiary_ignorable/>, <last_tertiary_ignorable/>, <first_primary_ignorable/>, +// and <last_non_ignorable/>. +func (t *Tailoring) SetAnchor(anchor string) error { + // TODO: implement. + return nil +} + +// SetAnchorBefore is similar to SetAnchor, except that subsequent calls to +// Insert will insert entries before the anchor. +func (t *Tailoring) SetAnchorBefore(anchor string) error { + // TODO: implement. 
+ return nil +} + +// Insert sets the ordering of str relative to the entry set by the previous +// call to SetAnchor or Insert. The argument extend corresponds +// to the extend elements as defined in LDML. A non-empty value for extend +// will cause the collation elements corresponding to extend to be appended +// to the collation elements generated for the entry added by Insert. +// This has the same net effect as sorting str after the string anchor+extend. // See http://www.unicode.org/reports/tr10/#Tailoring_Example for details -// on parametric tailoring. -func (b *Builder) AddTailoring(locale, x, y string, l collate.Level) error { +// on parametric tailoring and http://unicode.org/reports/tr35/#Collation_Elements +// for full details on LDML. +// +// Examples: create a tailoring for Swedish, where "ä" is ordered after "z" +// at the primary sorting level: +// t := b.Tailoring("se") +// t.SetAnchor("z") +// t.Insert(collate.Primary, "ä", "") +// Order "ü" after "ue" at the secondary sorting level: +// t.SetAnchor("ue") +// t.Insert(collate.Secondary, "ü","") +// or +// t.SetAnchor("u") +// t.Insert(collate.Secondary, "ü", "e") +// Order "q" after "ab" at the secondary level and "Q" after "q" +// at the tertiary level: +// t.SetAnchor("ab") +// t.Insert(collate.Secondary, "q", "") +// t.Insert(collate.Tertiary, "Q", "") +// Order "b" before "a": +// t.SetAnchorBefore("a") +// t.Insert(collate.Primary, "b", "") +// Order "0" after the last primary ignorable: +// t.SetAnchor("<last_primary_ignorable/>") +// t.Insert(collate.Primary, "0", "") +func (t *Tailoring) Insert(level collate.Level, str, extend string) error { // TODO: implement. 
return nil } @@ -189,7 +285,10 @@ func (b *Builder) error(e error) { func (b *Builder) build() (*table, error) { if !b.built { b.built = true - b.t = &table{} + b.t = &table{ + maxContractLen: utf8.UTFMax, + variableTop: uint32(b.varTop), + } b.simplify() b.processExpansions() // requires simplify @@ -202,18 +301,23 @@ func (b *Builder) build() (*table, error) { return b.t, nil } -// Build builds a Collator for the given locale. To build the root table, set locale to "". -func (b *Builder) Build(locale string) (*collate.Collator, error) { +// Build builds the root Collator. +func (b *Builder) Build() (*collate.Collator, error) { t, err := b.build() if err != nil { return nil, err } - // TODO: support multiple locales return collate.Init(t), nil } -// Print prints all tables to a Go file that can be included in -// the Collate package. +// Build builds a Collator for Tailoring t. +func (t *Tailoring) Build() (*collate.Collator, error) { + // TODO: implement. + return nil, nil +} + +// Print prints the tables for b and all its Tailorings as a Go file +// that can be included in the Collate package. 
func (b *Builder) Print(w io.Writer) (int, error) { t, err := b.build() if err != nil { diff --git a/src/pkg/exp/locale/collate/build/builder_test.go b/src/pkg/exp/locale/collate/build/builder_test.go index a113d449aa..ae13dab780 100644 --- a/src/pkg/exp/locale/collate/build/builder_test.go +++ b/src/pkg/exp/locale/collate/build/builder_test.go @@ -48,7 +48,7 @@ type ducetElem struct { func newBuilder(t *testing.T, ducet []ducetElem) *Builder { b := NewBuilder() for _, e := range ducet { - if err := b.Add([]rune(e.str), e.ces); err != nil { + if err := b.Add([]rune(e.str), e.ces, nil); err != nil { t.Errorf(err.Error()) } } diff --git a/src/pkg/exp/locale/collate/build/table.go b/src/pkg/exp/locale/collate/build/table.go index 91ed51b6de..a7973f5510 100644 --- a/src/pkg/exp/locale/collate/build/table.go +++ b/src/pkg/exp/locale/collate/build/table.go @@ -23,6 +23,7 @@ type table struct { contractTries contractTrieSet contractElem []uint32 maxContractLen int + variableTop uint32 } func (t *table) TrieIndex() []uint16 { @@ -53,6 +54,10 @@ func (t *table) MaxContractLen() int { return t.maxContractLen } +func (t *table) VariableTop() uint32 { + return t.variableTop +} + // print writes the table as Go compilable code to w. It prefixes the // variable names with name. It returns the number of bytes written // and the size of the resulting table. @@ -78,6 +83,7 @@ func (t *table) fprint(w io.Writer, name string) (n, size int, err error) { p(",\n") p("%sContractElem[:],\n", name) p("%d,\n", t.maxContractLen) + p("0x%X,\n", t.variableTop) p("}\n\n") // Write arrays needed for the structure. diff --git a/src/pkg/exp/locale/collate/collate.go b/src/pkg/exp/locale/collate/collate.go index 9a4bdcdb96..d59b858766 100644 --- a/src/pkg/exp/locale/collate/collate.go +++ b/src/pkg/exp/locale/collate/collate.go @@ -55,9 +55,6 @@ const ( // Collator provides functionality for comparing strings for a given // collation order. type Collator struct { - // See SetVariableTop. 
- variableTop uint32 - // Strength sets the maximum level to use in comparison. Strength Level @@ -178,7 +175,7 @@ func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { } func (c *Collator) key(buf *Buffer, w []weights) []byte { - processWeights(c.Alternate, c.variableTop, w) + processWeights(c.Alternate, c.t.variableTop, w) kn := len(buf.key) c.keyFromElems(buf, w) return buf.key[kn:] diff --git a/src/pkg/exp/locale/collate/export.go b/src/pkg/exp/locale/collate/export.go index c152296f57..01750dd070 100644 --- a/src/pkg/exp/locale/collate/export.go +++ b/src/pkg/exp/locale/collate/export.go @@ -4,6 +4,8 @@ package collate +import "exp/norm" + // Init is used by type Builder in exp/locale/collate/build/ // to create Collator instances. It is for internal use only. func Init(data interface{}) *Collator { @@ -21,7 +23,12 @@ func Init(data interface{}) *Collator { t.contractTries = init.ContractTries() t.contractElem = init.ContractElems() t.maxContractLen = init.MaxContractLen() - return &Collator{t: t} + t.variableTop = init.VariableTop() + return &Collator{ + Strength: Quaternary, + f: norm.NFD, + t: t, + } } type tableInitializer interface { @@ -32,4 +39,5 @@ type tableInitializer interface { ContractTries() []struct{ l, h, n, i uint8 } ContractElems() []uint32 MaxContractLen() int + VariableTop() uint32 } diff --git a/src/pkg/exp/locale/collate/export_test.go b/src/pkg/exp/locale/collate/export_test.go index ddbf30d30d..de6e9078b5 100644 --- a/src/pkg/exp/locale/collate/export_test.go +++ b/src/pkg/exp/locale/collate/export_test.go @@ -7,7 +7,6 @@ package collate // Export for testing. 
import ( - "exp/norm" "fmt" ) @@ -63,18 +62,14 @@ func (t *Table) AppendNext(s []byte) ([]Weights, int) { } func SetTop(c *Collator, top int) { - c.variableTop = uint32(top) -} - -func InitCollator(c *Collator) { - c.Strength = Quaternary - c.f = norm.NFD - c.t.maxContractLen = 30 + if c.t == nil { + c.t = &table{} + } + c.t.variableTop = uint32(top) } func GetColElems(c *Collator, buf *Buffer, str []byte) []Weights { buf.ResetKeys() - InitCollator(c) c.getColElems(buf, str) return convertToWeights(buf.ce) } diff --git a/src/pkg/exp/locale/collate/maketables.go b/src/pkg/exp/locale/collate/maketables.go index f335b363ab..a76e2d0f93 100644 --- a/src/pkg/exp/locale/collate/maketables.go +++ b/src/pkg/exp/locale/collate/maketables.go @@ -33,7 +33,7 @@ var localFiles = flag.Bool("local", false, "data files have been copied to the current directory; for debugging only") -func failonerror(e error) { +func failOnError(e error) { if e != nil { log.Fatal(e) } @@ -62,10 +62,9 @@ func openReader(url string) (io.ReadCloser, error) { // parseUCA parses a Default Unicode Collation Element Table of the format // specified in http://www.unicode.org/reports/tr10/#File_Format. // It returns the variable top. 
-func parseUCA(builder *build.Builder) int { - maxVar, minNonVar := 0, 1<<30 +func parseUCA(builder *build.Builder) { r, err := openReader(*ducet) - failonerror(err) + failOnError(err) defer r.Close() input := bufio.NewReader(r) colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`) @@ -109,32 +108,25 @@ func parseUCA(builder *build.Builder) int { lhs = append(lhs, rune(convHex(i, v))) } var n int + var vars []int rhs := [][]int{} - for _, m := range colelem.FindAllStringSubmatch(part[1], -1) { + for i, m := range colelem.FindAllStringSubmatch(part[1], -1) { n += len(m[0]) elem := []int{} for _, h := range strings.Split(m[2], ".") { elem = append(elem, convHex(i, h)) } - if p := elem[0]; m[1] == "*" { - if p > maxVar { - maxVar = p - } - } else if p > 0 && p < minNonVar { - minNonVar = p + if m[1] == "*" { + vars = append(vars, i) } rhs = append(rhs, elem) } if len(part[1]) < n+3 || part[1][n+1] != '#' { log.Fatalf("%d: expected comment; found %s", i, part[1][n:]) } - builder.Add(lhs, rhs) + failOnError(builder.Add(lhs, rhs, vars)) } } - if maxVar >= minNonVar { - log.Fatalf("found maxVar > minNonVar (%d > %d)", maxVar, minNonVar) - } - return maxVar } func convHex(line int, s string) int { @@ -146,11 +138,10 @@ func convHex(line int, s string) int { } // TODO: move this functionality to exp/locale/collate/build. 
-func printCollators(c *collate.Collator, vartop int) { +func printCollators(c *collate.Collator) { const name = "Root" fmt.Printf("var _%s = Collator{\n", name) fmt.Printf("\tStrength: %v,\n", c.Strength) - fmt.Printf("\tvariableTop: 0x%X,\n", vartop) fmt.Printf("\tf: norm.NFD,\n") fmt.Printf("\tt: &%sTable,\n", strings.ToLower(name)) fmt.Printf("}\n\n") @@ -162,9 +153,9 @@ func printCollators(c *collate.Collator, vartop int) { func main() { flag.Parse() b := build.NewBuilder() - vartop := parseUCA(b) - _, err := b.Build("") - failonerror(err) + parseUCA(b) + c, err := b.Build() + failOnError(err) fmt.Println("// Generated by running") fmt.Printf("// maketables --ducet=%s\n", *ducet) @@ -176,10 +167,8 @@ func main() { fmt.Println(`import "exp/norm"`) fmt.Println("") - c := &collate.Collator{} - c.Strength = collate.Quaternary - printCollators(c, vartop) + printCollators(c) _, err = b.Print(os.Stdout) - failonerror(err) + failOnError(err) } diff --git a/src/pkg/exp/locale/collate/table.go b/src/pkg/exp/locale/collate/table.go index b662b72897..b2a5b62316 100644 --- a/src/pkg/exp/locale/collate/table.go +++ b/src/pkg/exp/locale/collate/table.go @@ -20,6 +20,7 @@ type table struct { contractTries contractTrieSet contractElem []uint32 maxContractLen int + variableTop uint32 } // appendNext appends the weights corresponding to the next rune or diff --git a/src/pkg/exp/locale/collate/table_test.go b/src/pkg/exp/locale/collate/table_test.go index cd6d027254..446d592b6d 100644 --- a/src/pkg/exp/locale/collate/table_test.go +++ b/src/pkg/exp/locale/collate/table_test.go @@ -42,14 +42,9 @@ func pt(p, t int) []int { func makeTable(in []input) (*collate.Collator, error) { b := build.NewBuilder() for _, r := range in { - b.Add([]rune(r.str), r.ces) + b.Add([]rune(r.str), r.ces, nil) } - c, err := b.Build("") - if c == nil { - return nil, err - } - collate.InitCollator(c) - return c, err + return b.Build() } // modSeq holds a seqeunce of modifiers in increasing order of CCC 
long enough diff --git a/src/pkg/exp/locale/collate/tables.go b/src/pkg/exp/locale/collate/tables.go index 43e310c377..42cc74e22a 100644 --- a/src/pkg/exp/locale/collate/tables.go +++ b/src/pkg/exp/locale/collate/tables.go @@ -8,10 +8,9 @@ package collate import "exp/norm" var _Root = Collator{ - Strength: 3, - variableTop: 0x1560, - f: norm.NFD, - t: &rootTable, + Strength: 3, + f: norm.NFD, + t: &rootTable, } var ( @@ -24,6 +23,7 @@ var rootTable = table{ contractTrieSet(rootCTEntries[:]), rootContractElem[:], 9, + 0x1560, } // rootExpandElem: 4630 entries, 18520 bytes