354 lines
10 KiB
Go
354 lines
10 KiB
Go
// Copyright 2017 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package cldrtree builds and generates a CLDR index file, including all
|
|
// inheritance.
|
|
//
|
|
package cldrtree
|
|
|
|
//go:generate go test -gen
|
|
|
|
// cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR
|
|
// data each branch in the tree is indicated by either an element name or an
|
|
// attribute value. A Tree does not distinguish between these two cases, but
|
|
// rather assumes that all branches can be accessed by an enum with a compact
|
|
// range of positive integer values starting from 0.
|
|
//
|
|
// Each Tree consists of three parts:
|
|
// - a slice mapping compact language identifiers to an offset into a set of
|
|
// indices,
|
|
// - a set of indices, stored as a large blob of uint16 values that encode
|
|
// the actual tree structure of data, and
|
|
// - a set of buckets that each holds a collection of strings.
|
|
// each of which is explained in more detail below.
|
|
//
|
|
//
|
|
// Tree lookup
|
|
// A tree lookup is done by providing a locale and a "path", which is a
|
|
// sequence of enum values. The search starts with getting the index for the
|
|
// given locale and then incrementally jumping into the index using the path
|
|
// values. If an element cannot be found in the index, the search starts anew
|
|
// for the locale's parent locale. The path may change during lookup by means
|
|
// of aliasing, described below.
|
|
//
|
|
// Buckets
|
|
// Buckets hold the actual string data of the leaf values of the CLDR tree.
|
|
// This data is stored in buckets, rather than one large string, for multiple
|
|
// reasons:
|
|
// - it allows representing leaf values more compactly, by storing all leaf
|
|
// values in a single bucket and then needing only a uint16 to index
|
|
// into this bucket for all leaf values,
|
|
// - (TBD) allow multiple trees to share subsets of buckets, mostly to allow
|
|
// linking in a smaller amount of data if only a subset of the buckets is
|
|
// needed,
|
|
// - to be nice to go fmt and the compiler.
|
|
//
|
|
// indices
|
|
// An index is a slice of uint16 for which the values are interpreted in one of
|
|
// two ways: as a node or a set of leaf values.
|
|
// A set of leaf values has the following form:
|
|
// <max_size>, <bucket>, <offset>...
|
|
// max_size indicates the maximum enum value for which an offset is defined.
|
|
// An offset value of 0xFFFF (missingValue) also indicates an undefined value.
|
|
// If defined, offset indicates the offset within the given bucket of the string.
|
|
// A node value has the following form:
|
|
// <max_size>, <offset_or_alias>...
|
|
// max_size indicates the maximum value for which an offset is defined.
|
|
// A missing offset may also be indicated with 0. If the high bit (0x8000, or
|
|
// inheritMask) is not set, the offset points to the offset within the index
|
|
// for the current locale.
|
|
// An offset with high bit set is an alias. In this case the uint16 has the form
|
|
// bits:
|
|
// 15: 1
|
|
// 14-12: negative offset into path relative to current position
|
|
// 0-11: new enum value for path element.
|
|
// On encountering an alias, the path is modified accordingly and the lookup is
|
|
// restarted for the given locale.
|
|
|
|
import (
|
|
"fmt"
|
|
"reflect"
|
|
"regexp"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/internal/gen"
|
|
"golang.org/x/text/language"
|
|
"golang.org/x/text/unicode/cldr"
|
|
)
|
|
|
|
// TODO:
|
|
// - allow two Trees to share the same set of buckets.
|
|
|
|
// A Builder allows storing CLDR data in compact form.
|
|
type Builder struct {
|
|
table []string
|
|
|
|
rootMeta *metaData
|
|
locales []locale
|
|
strToBucket map[string]stringInfo
|
|
buckets [][]byte
|
|
enums []*enum
|
|
err error
|
|
|
|
// Stats
|
|
size int
|
|
sizeAll int
|
|
bucketWaste int
|
|
}
|
|
|
|
const (
	// maxBucketSize is the size at which addString starts a new bucket (8K).
	maxBucketSize = 8 << 10
	// maxStrlen is the longest string, in bytes, that can be stored. It is
	// kept below 0xFF so that value remains available as a sentinel.
	maxStrlen = 254
)
|
|
|
|
func (b *Builder) setError(err error) {
|
|
if b.err == nil {
|
|
b.err = err
|
|
}
|
|
}
|
|
|
|
func (b *Builder) addString(data string) stringInfo {
|
|
data = b.makeString(data)
|
|
info, ok := b.strToBucket[data]
|
|
if !ok {
|
|
b.size += len(data)
|
|
x := len(b.buckets) - 1
|
|
bucket := b.buckets[x]
|
|
if len(bucket)+len(data) < maxBucketSize {
|
|
info.bucket = uint16(x)
|
|
info.bucketPos = uint16(len(bucket))
|
|
b.buckets[x] = append(bucket, data...)
|
|
} else {
|
|
info.bucket = uint16(len(b.buckets))
|
|
info.bucketPos = 0
|
|
b.buckets = append(b.buckets, []byte(data))
|
|
}
|
|
b.strToBucket[data] = info
|
|
}
|
|
return info
|
|
}
|
|
|
|
func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo {
|
|
data = b.makeString(data)
|
|
info, ok := b.strToBucket[data]
|
|
if !ok || info.bucket != bucket {
|
|
if ok {
|
|
b.bucketWaste += len(data)
|
|
}
|
|
b.size += len(data)
|
|
bk := b.buckets[bucket]
|
|
info.bucket = bucket
|
|
info.bucketPos = uint16(len(bk))
|
|
b.buckets[bucket] = append(bk, data...)
|
|
b.strToBucket[data] = info
|
|
}
|
|
return info
|
|
}
|
|
|
|
func (b *Builder) makeString(data string) string {
|
|
if len(data) > maxStrlen {
|
|
b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen))
|
|
data = data[:maxStrlen]
|
|
for i := len(data) - 1; i > len(data)-4; i-- {
|
|
if utf8.RuneStart(data[i]) {
|
|
data = data[:i]
|
|
break
|
|
}
|
|
}
|
|
}
|
|
data = string([]byte{byte(len(data))}) + data
|
|
b.sizeAll += len(data)
|
|
return data
|
|
}
|
|
|
|
// stringInfo records where a length-prefixed string is stored.
type stringInfo struct {
	// bufferPos is not set by the code visible in this part of the file.
	// NOTE(review): confirm whether it is used elsewhere.
	bufferPos uint32
	// bucket is the index of the bucket holding the string.
	bucket uint16
	// bucketPos is the byte offset of the string's length prefix in the bucket.
	bucketPos uint16
}
|
|
|
|
// New creates a new Builder.
|
|
func New(tableName string) *Builder {
|
|
b := &Builder{
|
|
strToBucket: map[string]stringInfo{},
|
|
buckets: [][]byte{nil}, // initialize with first bucket.
|
|
}
|
|
b.rootMeta = &metaData{
|
|
b: b,
|
|
typeInfo: &typeInfo{},
|
|
}
|
|
return b
|
|
}
|
|
|
|
// Gen writes all the tables and types for the collected data.
|
|
func (b *Builder) Gen(w *gen.CodeWriter) error {
|
|
t, err := build(b)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return generate(b, t, w)
|
|
}
|
|
|
|
// GenTestData generates tables useful for testing data generated with Gen.
|
|
func (b *Builder) GenTestData(w *gen.CodeWriter) error {
|
|
return generateTestData(b, w)
|
|
}
|
|
|
|
type locale struct {
|
|
tag language.Tag
|
|
root *Index
|
|
}
|
|
|
|
// Locale creates an index for the given locale.
|
|
func (b *Builder) Locale(t language.Tag) *Index {
|
|
index := &Index{
|
|
meta: b.rootMeta,
|
|
}
|
|
b.locales = append(b.locales, locale{tag: t, root: index})
|
|
return index
|
|
}
|
|
|
|
// An Index holds a map of either leaf values or other indices.
|
|
type Index struct {
|
|
meta *metaData
|
|
|
|
subIndex []*Index
|
|
values []keyValue
|
|
}
|
|
|
|
// setError forwards err to the Builder that owns this Index.
func (i *Index) setError(err error) {
	i.meta.b.setError(err)
}
|
|
|
|
type keyValue struct {
|
|
key enumIndex
|
|
value stringInfo
|
|
}
|
|
|
|
// Element is a CLDR XML element.
|
|
type Element interface {
|
|
GetCommon() *cldr.Common
|
|
}
|
|
|
|
// Index creates a subindex where the type and enum values are not shared
|
|
// with siblings by default. The name is derived from the elem. If elem is
|
|
// an alias reference, the alias will be resolved and linked. If elem is nil
|
|
// Index returns nil.
|
|
func (i *Index) Index(elem Element, opt ...Option) *Index {
|
|
if elem == nil || reflect.ValueOf(elem).IsNil() {
|
|
return nil
|
|
}
|
|
c := elem.GetCommon()
|
|
o := &options{
|
|
parent: i,
|
|
name: c.GetCommon().Element(),
|
|
}
|
|
o.fill(opt)
|
|
o.setAlias(elem)
|
|
return i.subIndexForKey(o)
|
|
}
|
|
|
|
// IndexWithName is like Section but derives the name from the given name.
|
|
func (i *Index) IndexWithName(name string, opt ...Option) *Index {
|
|
o := &options{parent: i, name: name}
|
|
o.fill(opt)
|
|
return i.subIndexForKey(o)
|
|
}
|
|
|
|
// IndexFromType creates a subindex the value of tye type attribute as key. It
|
|
// will also configure the Index to share the enumeration values with all
|
|
// sibling values. If elem is an alias, it will be resolved and linked.
|
|
func (i *Index) IndexFromType(elem Element, opts ...Option) *Index {
|
|
o := &options{
|
|
parent: i,
|
|
name: elem.GetCommon().Type,
|
|
}
|
|
o.fill(opts)
|
|
o.setAlias(elem)
|
|
useSharedType()(o)
|
|
return i.subIndexForKey(o)
|
|
}
|
|
|
|
// IndexFromAlt creates a subindex the value of tye alt attribute as key. It
|
|
// will also configure the Index to share the enumeration values with all
|
|
// sibling values. If elem is an alias, it will be resolved and linked.
|
|
func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index {
|
|
o := &options{
|
|
parent: i,
|
|
name: elem.GetCommon().Alt,
|
|
}
|
|
o.fill(opts)
|
|
o.setAlias(elem)
|
|
useSharedType()(o)
|
|
return i.subIndexForKey(o)
|
|
}
|
|
|
|
func (i *Index) subIndexForKey(opts *options) *Index {
|
|
key := opts.name
|
|
if len(i.values) > 0 {
|
|
panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key))
|
|
}
|
|
meta := i.meta.sub(key, opts)
|
|
for _, x := range i.subIndex {
|
|
if x.meta == meta {
|
|
return x
|
|
}
|
|
}
|
|
if alias := opts.alias; alias != nil {
|
|
if a := alias.GetCommon().Alias; a != nil {
|
|
if a.Source != "locale" {
|
|
i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path))
|
|
}
|
|
if meta.inheritOffset < 0 {
|
|
i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path))
|
|
}
|
|
path := a.Path
|
|
for ; strings.HasPrefix(path, "../"); path = path[len("../"):] {
|
|
meta.inheritOffset--
|
|
}
|
|
m := aliasRe.FindStringSubmatch(path)
|
|
if m == nil {
|
|
i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path))
|
|
} else {
|
|
key := m[4]
|
|
if key == "" {
|
|
key = m[1]
|
|
}
|
|
meta.inheritIndex = key
|
|
}
|
|
}
|
|
}
|
|
x := &Index{meta: meta}
|
|
i.subIndex = append(i.subIndex, x)
|
|
return x
|
|
}
|
|
|
|
// aliasRe matches the first step of an alias path: an element name, optionally
// followed by a single [@attr='value'] selector. Submatch 1 is the element
// name, submatch 3 the attribute name and submatch 4 the attribute value.
var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`)
|
|
|
|
// SetValue sets the value, the data from a CLDR XML element, for the given key.
|
|
func (i *Index) SetValue(key string, value Element, opt ...Option) {
|
|
if len(i.subIndex) > 0 {
|
|
panic(fmt.Errorf("adding value for key %q when index already exists", key))
|
|
}
|
|
o := &options{parent: i}
|
|
o.fill(opt)
|
|
c := value.GetCommon()
|
|
if c.Alias != nil {
|
|
i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path))
|
|
}
|
|
i.setValue(key, c.Data(), o)
|
|
}
|
|
|
|
func (i *Index) setValue(key, data string, o *options) {
|
|
index, _ := i.meta.typeInfo.lookupSubtype(key, o)
|
|
kv := keyValue{key: index}
|
|
if len(i.values) > 0 {
|
|
// Add string to the same bucket as the other values.
|
|
bucket := i.values[0].value.bucket
|
|
kv.value = i.meta.b.addStringToBucket(data, bucket)
|
|
} else {
|
|
kv.value = i.meta.b.addString(data)
|
|
}
|
|
i.values = append(i.values, kv)
|
|
}
|