diff --git a/src/pkg/exp/locale/collate/build/builder.go b/src/pkg/exp/locale/collate/build/builder.go
index 1528b9d0ce..fbb9c4483f 100644
--- a/src/pkg/exp/locale/collate/build/builder.go
+++ b/src/pkg/exp/locale/collate/build/builder.go
@@ -12,6 +12,7 @@ import (
"log"
"sort"
"strings"
+ "unicode/utf8"
)
// TODO: optimizations:
@@ -22,7 +23,11 @@ import (
// - trie valueBlocks are currently 100K. There are a lot of sparse blocks
// and many consecutive values with the same stride. This can be further
// compacted.
-// - compress secondary weights into 8 bits.
+// - Compress secondary weights into 8 bits.
+// - Some LDML specs specify a context element. Currently we simply concatenate
+// those. Context can be implemented using the contraction trie. If Builder
+// could analyze and detect when using a context makes sense, there is no
+// need to expose this construct in the API.
// entry is used to keep track of a single entry in the collation element table
// during building. Examples of entries can be found in the Default Unicode
@@ -60,18 +65,30 @@ func (e *entry) contractionStarter() bool {
return e.contractionHandle.n != 0
}
-// A Builder builds collation tables. It can generate both the root table and
-// locale-specific tables defined as tailorings to the root table.
-// The typical use case is to specify the data for the root table and all locale-specific
-// tables using Add and AddTailoring before making any call to Build. This allows
-// Builder to ensure that a root table can support tailorings for each locale.
+// A Builder builds a root collation table. The user must specify the
+// collation elements for each entry. A common use will be to base the weights
+// on those specified in the allkeys* file as provided by the UCA or CLDR.
type Builder struct {
index *trieBuilder
+ locale []*Tailoring
entryMap map[string]*entry
entry []*entry
t *table
err error
built bool
+
+ minNonVar int // lowest primary recorded for a variable
+ varTop int // highest primary recorded for a non-variable
+}
+
+// A Tailoring builds a collation table based on another collation table.
+// The table is defined by specifying tailorings to the underlying table.
+// See http://unicode.org/reports/tr35/ for an overview of tailoring
+// collation tables. The CLDR contains pre-defined tailorings for a variety
+// of languages (See http://www.unicode.org/Public/cldr/2.0.1/core.zip.)
+type Tailoring struct {
+ id string
+ // TODO: implement.
}
// NewBuilder returns a new Builder.
@@ -83,14 +100,26 @@ func NewBuilder() *Builder {
return b
}
-// Add adds an entry for the root collation element table, mapping
+// Tailoring returns a Tailoring for the given locale. One should
+// have completed all calls to Add before calling Tailoring.
+func (b *Builder) Tailoring(locale string) *Tailoring {
+ t := &Tailoring{
+ id: locale,
+ }
+ b.locale = append(b.locale, t)
+ return t
+}
+
+// Add adds an entry to the collation element table, mapping
// a slice of runes to a sequence of collation elements.
// A collation element is specified as list of weights: []int{primary, secondary, ...}.
// The entries are typically obtained from a collation element table
// as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format.
// Note that the collation elements specified by colelems are only used
// as a guide. The actual weights generated by Builder may differ.
-func (b *Builder) Add(str []rune, colelems [][]int) error {
+// The argument variables is a list of indices into colelems that should contain
+// a value for each colelem that is a variable. (See the reference above.)
+func (b *Builder) Add(str []rune, colelems [][]int, variables []int) error {
e := &entry{
runes: make([]rune, len(str)),
elems: make([][]int, len(colelems)),
@@ -113,6 +142,29 @@ func (b *Builder) Add(str []rune, colelems [][]int) error {
e.elems[i] = append(e.elems[i], ce[0])
}
}
+ for i, ce := range e.elems {
+ isvar := false
+ for _, j := range variables {
+ if i == j {
+ isvar = true
+ }
+ }
+ if isvar {
+ if ce[0] >= b.minNonVar && b.minNonVar > 0 {
+ return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", ce[0], b.minNonVar)
+ }
+ if ce[0] > b.varTop {
+ b.varTop = ce[0]
+ }
+ } else if ce[0] > 0 {
+ if ce[0] <= b.varTop {
+ return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", ce[0], b.varTop)
+ }
+ if b.minNonVar == 0 || ce[0] < b.minNonVar {
+ b.minNonVar = ce[0]
+ }
+ }
+ }
elems, err := convertLargeWeights(e.elems)
if err != nil {
return err
@@ -123,13 +175,57 @@ func (b *Builder) Add(str []rune, colelems [][]int) error {
return nil
}
-// AddTailoring defines a tailoring x <_level y for the given locale.
-// For example, AddTailoring("se", "z", "ä", Primary) sorts "ä" after "z"
-// at the primary level for Swedish. AddTailoring("de", "ue", "ü", Secondary)
-// sorts "ü" after "ue" at the secondary level for German.
+// SetAnchor sets the point after which elements passed in subsequent calls to
+// Insert will be inserted. It is equivalent to the reset directive in an LDML
+// specification. See Insert for an example.
+// SetAnchor supports the following logical reset positions:
+// , , ,
+// and .
+func (t *Tailoring) SetAnchor(anchor string) error {
+ // TODO: implement.
+ return nil
+}
+
+// SetAnchorBefore is similar to SetAnchor, except that subsequent calls to
+// Insert will insert entries before the anchor.
+func (t *Tailoring) SetAnchorBefore(anchor string) error {
+ // TODO: implement.
+ return nil
+}
+
+// Insert sets the ordering of str relative to the entry set by the previous
+// call to SetAnchor or Insert. The argument extend corresponds
+// to the extend elements as defined in LDML. A non-empty value for extend
+// will cause the collation elements corresponding to extend to be appended
+// to the collation elements generated for the entry added by Insert.
+// This has the same net effect as sorting str after the string anchor+extend.
// See http://www.unicode.org/reports/tr10/#Tailoring_Example for details
-// on parametric tailoring.
-func (b *Builder) AddTailoring(locale, x, y string, l collate.Level) error {
+// on parametric tailoring and http://unicode.org/reports/tr35/#Collation_Elements
+// for full details on LDML.
+//
+// Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
+// at the primary sorting level:
+// t := b.Tailoring("se")
+// t.SetAnchor("z")
+// t.Insert(collate.Primary, "ä", "")
+// Order "ü" after "ue" at the secondary sorting level:
+// t.SetAnchor("ue")
+// t.Insert(collate.Secondary, "ü","")
+// or
+// t.SetAnchor("u")
+// t.Insert(collate.Secondary, "ü", "e")
+// Order "q" afer "ab" at the secondary level and "Q" after "q"
+// at the tertiary level:
+// t.SetAnchor("ab")
+// t.Insert(collate.Secondary, "q", "")
+// t.Insert(collate.Tertiary, "Q", "")
+// Order "b" before "a":
+// t.SetAnchorBefore("a")
+// t.Insert(collate.Primary, "b", "")
+// Order "0" after the last primary ignorable:
+// t.SetAnchor("")
+// t.Insert(collate.Primary, "0", "")
+func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
// TODO: implement.
return nil
}
@@ -189,7 +285,10 @@ func (b *Builder) error(e error) {
func (b *Builder) build() (*table, error) {
if !b.built {
b.built = true
- b.t = &table{}
+ b.t = &table{
+ maxContractLen: utf8.UTFMax,
+ variableTop: uint32(b.varTop),
+ }
b.simplify()
b.processExpansions() // requires simplify
@@ -202,18 +301,23 @@ func (b *Builder) build() (*table, error) {
return b.t, nil
}
-// Build builds a Collator for the given locale. To build the root table, set locale to "".
-func (b *Builder) Build(locale string) (*collate.Collator, error) {
+// Build builds the root Collator.
+func (b *Builder) Build() (*collate.Collator, error) {
t, err := b.build()
if err != nil {
return nil, err
}
- // TODO: support multiple locales
return collate.Init(t), nil
}
-// Print prints all tables to a Go file that can be included in
-// the Collate package.
+// Build builds a Collator for Tailoring t.
+func (t *Tailoring) Build() (*collate.Collator, error) {
+ // TODO: implement.
+ return nil, nil
+}
+
+// Print prints the tables for b and all its Tailorings as a Go file
+// that can be included in the Collate package.
func (b *Builder) Print(w io.Writer) (int, error) {
t, err := b.build()
if err != nil {
diff --git a/src/pkg/exp/locale/collate/build/builder_test.go b/src/pkg/exp/locale/collate/build/builder_test.go
index a113d449aa..ae13dab780 100644
--- a/src/pkg/exp/locale/collate/build/builder_test.go
+++ b/src/pkg/exp/locale/collate/build/builder_test.go
@@ -48,7 +48,7 @@ type ducetElem struct {
func newBuilder(t *testing.T, ducet []ducetElem) *Builder {
b := NewBuilder()
for _, e := range ducet {
- if err := b.Add([]rune(e.str), e.ces); err != nil {
+ if err := b.Add([]rune(e.str), e.ces, nil); err != nil {
t.Errorf(err.Error())
}
}
diff --git a/src/pkg/exp/locale/collate/build/table.go b/src/pkg/exp/locale/collate/build/table.go
index 91ed51b6de..a7973f5510 100644
--- a/src/pkg/exp/locale/collate/build/table.go
+++ b/src/pkg/exp/locale/collate/build/table.go
@@ -23,6 +23,7 @@ type table struct {
contractTries contractTrieSet
contractElem []uint32
maxContractLen int
+ variableTop uint32
}
func (t *table) TrieIndex() []uint16 {
@@ -53,6 +54,10 @@ func (t *table) MaxContractLen() int {
return t.maxContractLen
}
+func (t *table) VariableTop() uint32 {
+ return t.variableTop
+}
+
// print writes the table as Go compilable code to w. It prefixes the
// variable names with name. It returns the number of bytes written
// and the size of the resulting table.
@@ -78,6 +83,7 @@ func (t *table) fprint(w io.Writer, name string) (n, size int, err error) {
p(",\n")
p("%sContractElem[:],\n", name)
p("%d,\n", t.maxContractLen)
+ p("0x%X,\n", t.variableTop)
p("}\n\n")
// Write arrays needed for the structure.
diff --git a/src/pkg/exp/locale/collate/collate.go b/src/pkg/exp/locale/collate/collate.go
index 9a4bdcdb96..d59b858766 100644
--- a/src/pkg/exp/locale/collate/collate.go
+++ b/src/pkg/exp/locale/collate/collate.go
@@ -55,9 +55,6 @@ const (
// Collator provides functionality for comparing strings for a given
// collation order.
type Collator struct {
- // See SetVariableTop.
- variableTop uint32
-
// Strength sets the maximum level to use in comparison.
Strength Level
@@ -178,7 +175,7 @@ func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
}
func (c *Collator) key(buf *Buffer, w []weights) []byte {
- processWeights(c.Alternate, c.variableTop, w)
+ processWeights(c.Alternate, c.t.variableTop, w)
kn := len(buf.key)
c.keyFromElems(buf, w)
return buf.key[kn:]
diff --git a/src/pkg/exp/locale/collate/export.go b/src/pkg/exp/locale/collate/export.go
index c152296f57..01750dd070 100644
--- a/src/pkg/exp/locale/collate/export.go
+++ b/src/pkg/exp/locale/collate/export.go
@@ -4,6 +4,8 @@
package collate
+import "exp/norm"
+
// Init is used by type Builder in exp/locale/collate/build/
// to create Collator instances. It is for internal use only.
func Init(data interface{}) *Collator {
@@ -21,7 +23,12 @@ func Init(data interface{}) *Collator {
t.contractTries = init.ContractTries()
t.contractElem = init.ContractElems()
t.maxContractLen = init.MaxContractLen()
- return &Collator{t: t}
+ t.variableTop = init.VariableTop()
+ return &Collator{
+ Strength: Quaternary,
+ f: norm.NFD,
+ t: t,
+ }
}
type tableInitializer interface {
@@ -32,4 +39,5 @@ type tableInitializer interface {
ContractTries() []struct{ l, h, n, i uint8 }
ContractElems() []uint32
MaxContractLen() int
+ VariableTop() uint32
}
diff --git a/src/pkg/exp/locale/collate/export_test.go b/src/pkg/exp/locale/collate/export_test.go
index ddbf30d30d..de6e9078b5 100644
--- a/src/pkg/exp/locale/collate/export_test.go
+++ b/src/pkg/exp/locale/collate/export_test.go
@@ -7,7 +7,6 @@ package collate
// Export for testing.
import (
- "exp/norm"
"fmt"
)
@@ -63,18 +62,14 @@ func (t *Table) AppendNext(s []byte) ([]Weights, int) {
}
func SetTop(c *Collator, top int) {
- c.variableTop = uint32(top)
-}
-
-func InitCollator(c *Collator) {
- c.Strength = Quaternary
- c.f = norm.NFD
- c.t.maxContractLen = 30
+ if c.t == nil {
+ c.t = &table{}
+ }
+ c.t.variableTop = uint32(top)
}
func GetColElems(c *Collator, buf *Buffer, str []byte) []Weights {
buf.ResetKeys()
- InitCollator(c)
c.getColElems(buf, str)
return convertToWeights(buf.ce)
}
diff --git a/src/pkg/exp/locale/collate/maketables.go b/src/pkg/exp/locale/collate/maketables.go
index f335b363ab..a76e2d0f93 100644
--- a/src/pkg/exp/locale/collate/maketables.go
+++ b/src/pkg/exp/locale/collate/maketables.go
@@ -33,7 +33,7 @@ var localFiles = flag.Bool("local",
false,
"data files have been copied to the current directory; for debugging only")
-func failonerror(e error) {
+func failOnError(e error) {
if e != nil {
log.Fatal(e)
}
@@ -62,10 +62,9 @@ func openReader(url string) (io.ReadCloser, error) {
// parseUCA parses a Default Unicode Collation Element Table of the format
// specified in http://www.unicode.org/reports/tr10/#File_Format.
// It returns the variable top.
-func parseUCA(builder *build.Builder) int {
- maxVar, minNonVar := 0, 1<<30
+func parseUCA(builder *build.Builder) {
r, err := openReader(*ducet)
- failonerror(err)
+ failOnError(err)
defer r.Close()
input := bufio.NewReader(r)
colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
@@ -109,32 +108,25 @@ func parseUCA(builder *build.Builder) int {
lhs = append(lhs, rune(convHex(i, v)))
}
var n int
+ var vars []int
rhs := [][]int{}
- for _, m := range colelem.FindAllStringSubmatch(part[1], -1) {
+ for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
n += len(m[0])
elem := []int{}
for _, h := range strings.Split(m[2], ".") {
elem = append(elem, convHex(i, h))
}
- if p := elem[0]; m[1] == "*" {
- if p > maxVar {
- maxVar = p
- }
- } else if p > 0 && p < minNonVar {
- minNonVar = p
+ if m[1] == "*" {
+ vars = append(vars, i)
}
rhs = append(rhs, elem)
}
if len(part[1]) < n+3 || part[1][n+1] != '#' {
log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
}
- builder.Add(lhs, rhs)
+ failOnError(builder.Add(lhs, rhs, vars))
}
}
- if maxVar >= minNonVar {
- log.Fatalf("found maxVar > minNonVar (%d > %d)", maxVar, minNonVar)
- }
- return maxVar
}
func convHex(line int, s string) int {
@@ -146,11 +138,10 @@ func convHex(line int, s string) int {
}
// TODO: move this functionality to exp/locale/collate/build.
-func printCollators(c *collate.Collator, vartop int) {
+func printCollators(c *collate.Collator) {
const name = "Root"
fmt.Printf("var _%s = Collator{\n", name)
fmt.Printf("\tStrength: %v,\n", c.Strength)
- fmt.Printf("\tvariableTop: 0x%X,\n", vartop)
fmt.Printf("\tf: norm.NFD,\n")
fmt.Printf("\tt: &%sTable,\n", strings.ToLower(name))
fmt.Printf("}\n\n")
@@ -162,9 +153,9 @@ func printCollators(c *collate.Collator, vartop int) {
func main() {
flag.Parse()
b := build.NewBuilder()
- vartop := parseUCA(b)
- _, err := b.Build("")
- failonerror(err)
+ parseUCA(b)
+ c, err := b.Build()
+ failOnError(err)
fmt.Println("// Generated by running")
fmt.Printf("// maketables --ducet=%s\n", *ducet)
@@ -176,10 +167,8 @@ func main() {
fmt.Println(`import "exp/norm"`)
fmt.Println("")
- c := &collate.Collator{}
- c.Strength = collate.Quaternary
- printCollators(c, vartop)
+ printCollators(c)
_, err = b.Print(os.Stdout)
- failonerror(err)
+ failOnError(err)
}
diff --git a/src/pkg/exp/locale/collate/table.go b/src/pkg/exp/locale/collate/table.go
index b662b72897..b2a5b62316 100644
--- a/src/pkg/exp/locale/collate/table.go
+++ b/src/pkg/exp/locale/collate/table.go
@@ -20,6 +20,7 @@ type table struct {
contractTries contractTrieSet
contractElem []uint32
maxContractLen int
+ variableTop uint32
}
// appendNext appends the weights corresponding to the next rune or
diff --git a/src/pkg/exp/locale/collate/table_test.go b/src/pkg/exp/locale/collate/table_test.go
index cd6d027254..446d592b6d 100644
--- a/src/pkg/exp/locale/collate/table_test.go
+++ b/src/pkg/exp/locale/collate/table_test.go
@@ -42,14 +42,9 @@ func pt(p, t int) []int {
func makeTable(in []input) (*collate.Collator, error) {
b := build.NewBuilder()
for _, r := range in {
- b.Add([]rune(r.str), r.ces)
+ b.Add([]rune(r.str), r.ces, nil)
}
- c, err := b.Build("")
- if c == nil {
- return nil, err
- }
- collate.InitCollator(c)
- return c, err
+ return b.Build()
}
// modSeq holds a seqeunce of modifiers in increasing order of CCC long enough
diff --git a/src/pkg/exp/locale/collate/tables.go b/src/pkg/exp/locale/collate/tables.go
index 43e310c377..42cc74e22a 100644
--- a/src/pkg/exp/locale/collate/tables.go
+++ b/src/pkg/exp/locale/collate/tables.go
@@ -8,10 +8,9 @@ package collate
import "exp/norm"
var _Root = Collator{
- Strength: 3,
- variableTop: 0x1560,
- f: norm.NFD,
- t: &rootTable,
+ Strength: 3,
+ f: norm.NFD,
+ t: &rootTable,
}
var (
@@ -24,6 +23,7 @@ var rootTable = table{
contractTrieSet(rootCTEntries[:]),
rootContractElem[:],
9,
+ 0x1560,
}
// rootExpandElem: 4630 entries, 18520 bytes