// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package cases // This file contains the definitions of case mappings for all supported // languages. The rules for the language-specific tailorings were taken and // modified from the CLDR transform definitions in common/transforms. import ( "strings" "unicode" "unicode/utf8" "golang.org/x/text/language" "golang.org/x/text/transform" "golang.org/x/text/unicode/norm" ) // A mapFunc takes a context set to the current rune and writes the mapped // version to the same context. It may advance the context to the next rune. It // returns whether a checkpoint is possible: whether the pDst bytes written to // dst so far won't need changing as we see more source bytes. type mapFunc func(*context) bool // maxIgnorable defines the maximum number of ignorables to consider for // lookahead operations. const maxIgnorable = 30 // supported lists the language tags for which we have tailorings. const supported = "und af az el lt nl tr" func init() { tags := []language.Tag{} for _, s := range strings.Split(supported, " ") { tags = append(tags, language.MustParse(s)) } matcher = language.NewMatcher(tags) Supported = language.NewCoverage(tags) } var ( matcher language.Matcher Supported language.Coverage // We keep the following lists separate, instead of having a single per- // language struct, to give the compiler a chance to remove unused code. // Some uppercase mappers are stateless, so we can precompute the // Transformers and save a bit on runtime allocations. 
upperFunc = []mapFunc{ nil, // und nil, // af aztrUpper(upper), // az elUpper, // el ltUpper(upper), // lt nil, // nl aztrUpper(upper), // tr } undUpper transform.Transformer = &undUpperCaser{} lowerFunc = []mapFunc{ lower, // und lower, // af aztrLower, // az lower, // el ltLower, // lt lower, // nl aztrLower, // tr } titleInfos = []struct { title, lower mapFunc rewrite func(*context) }{ {title, lower, nil}, // und {title, lower, afnlRewrite}, // af {aztrUpper(title), aztrLower, nil}, // az {title, lower, nil}, // el {ltUpper(title), ltLower, nil}, // lt {nlTitle, lower, afnlRewrite}, // nl {aztrUpper(title), aztrLower, nil}, // tr } ) func makeUpper(t language.Tag, o options) transform.Transformer { _, i, _ := matcher.Match(t) f := upperFunc[i] if f == nil { return undUpper } return &simpleCaser{f: f} } func makeLower(t language.Tag, o options) transform.Transformer { _, i, _ := matcher.Match(t) f := lowerFunc[i] if o.noFinalSigma { return &simpleCaser{f: f} } return &lowerCaser{ first: f, midWord: finalSigma(f), } } func makeTitle(t language.Tag, o options) transform.Transformer { _, i, _ := matcher.Match(t) x := &titleInfos[i] lower := x.lower if o.noLower { lower = (*context).copy } else if !o.noFinalSigma { lower = finalSigma(lower) } return &titleCaser{ title: x.title, lower: lower, rewrite: x.rewrite, } } // TODO: consider a similar special case for the fast majority lower case. This // is a bit more involved so will require some more precise benchmarking to // justify it. type undUpperCaser struct{ transform.NopResetter } // undUpperCaser implements the Transformer interface for doing an upper case // mapping for the root locale (und). It eliminates the need for an allocation // as it prevents escaping by not using function pointers. 
func (t *undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { c := context{dst: dst, src: src, atEOF: atEOF} for c.next() { upper(&c) c.checkpoint() } return c.ret() } type simpleCaser struct { context f mapFunc } // simpleCaser implements the Transformer interface for doing a case operation // on a rune-by-rune basis. func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { t.context = context{dst: dst, src: src, atEOF: atEOF} c := &t.context for c.next() && t.f(c) { c.checkpoint() } return c.ret() } // lowerCaser implements the Transformer interface. The default Unicode lower // casing requires different treatment for the first and subsequent characters // of a word, most notably to handle the Greek final Sigma. type lowerCaser struct { context first, midWord mapFunc } func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { t.context = context{dst: dst, src: src, atEOF: atEOF} c := &t.context for isInterWord := true; c.next(); { if isInterWord { if c.info.isCased() { if !t.first(c) { break } isInterWord = false } else if !c.copy() { break } } else { if c.info.isNotCasedAndNotCaseIgnorable() { if !c.copy() { break } isInterWord = true } else if !t.midWord(c) { break } } c.checkpoint() } return c.ret() } // titleCaser implements the Transformer interface. Title casing algorithms // distinguish between the first letter of a word and subsequent letters of the // same word. It uses state to avoid requiring a potentially infinite lookahead. type titleCaser struct { context // rune mappings used by the actual casing algorithms. title, lower mapFunc rewrite func(*context) } // Transform implements the standard Unicode title case algorithm as defined in // Chapter 3 of The Unicode Standard: // toTitlecase(X): Find the word boundaries in X according to Unicode Standard // Annex #29, "Unicode Text Segmentation." 
// For each word boundary, find the
// first cased character F following the word boundary. If F exists, map F to
// Titlecase_Mapping(F); then map all characters C between F and the following
// word boundary to Lowercase_Mapping(C).
//
// The isMidWord flag left in t by a previous call is carried into the fresh
// context, so the "inside a word" state survives across calls.
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.ret()
	}

	for {
		// p holds the properties of the current rune; it is captured before
		// the mappers below get a chance to advance the context.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isCaseIgnorableAndNonBreakStarter()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() && !p.isCaseIgnorableAndNotCased() {
			if !c.isMidWord {
				if !t.title(c) {
					break
				}
				c.isMidWord = true
			} else if !t.lower(c) {
				break
			}
		} else if !c.copy() {
			break
		}

		// TODO: make this an "else if" if we can prove that no rune that does
		// not match the first condition of the if statement can be a break.
		if p.isBreak() {
			c.isMidWord = false
		}

		// As we save the state of the transformer, it is safe to call
		// checkpoint after any successful write.
		c.checkpoint()

		if !c.next() {
			break
		}
		// Two consecutive runes for which
		// isCaseIgnorableAndNonBreakStarter holds end the current word.
		if wasMid && c.info.isCaseIgnorableAndNonBreakStarter() {
			c.isMidWord = false
		}
	}
	return c.ret()
}

// finalSigma adds Greek final Sigma handling to another casing function. It
// determines whether a lowercased sigma should be σ or ς, by looking ahead for
// case-ignorables and a cased letter.
//
// The returned mapFunc delegates to f for every rune other than Σ. For Σ it
// writes ς and then looks ahead over at most maxIgnorable case-ignorable
// runes, copying them to the destination; if the first rune that is not
// case-ignorable is cased, the ς already written is patched into σ. It returns
// false if the source runs out during the lookahead.
func finalSigma(f mapFunc) mapFunc {
	return func(c *context) bool {
		// ::NFD();
		// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
		// Σ } [:case-ignorable:]* [:cased:] → σ;
		// [:cased:] [:case-ignorable:]* { Σ → ς;
		// ::Any-Lower;
		// ::NFC();

		if !c.hasPrefix("Σ") {
			return f(c)
		}

		// Remember where ς starts in dst so it can be patched later.
		p := c.pDst
		c.writeString("ς")
		// We need to do one more iteration after maxIgnorable, as a cased
		// letter is not an ignorable and may modify the result.
		for i := 0; i < maxIgnorable+1; i++ {
			if !c.next() {
				return false
			}
			if !c.info.isCaseIgnorable() {
				if c.info.isCased() {
					// p+1 is guaranteed to be in bounds: if writing ς was
					// successful, p+1 will contain the second byte of ς. If not,
					// this function will have returned after c.next returned false.
					c.dst[p+1]++ // ς → σ
				}
				// The rune that ended the lookahead is left for the caller.
				c.unreadRune()
				return true
			}
			// A case ignorable may also introduce a word break, so we may need
			// to continue searching even after detecting a break.
			c.isMidWord = c.isMidWord && !c.info.isBreak()
			c.copy()
		}
		return true
	}
}

// elUpper implements Greek upper casing, which entails removing a predefined
// set of non-blocked modifiers. Note that these accents should not be removed
// for title casing!
// Example: "Οδός" -> "ΟΔΟΣ".
func elUpper(c *context) bool {
	// From CLDR:
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;

	// Decode the source rune before upper has a chance to advance the context.
	r, _ := utf8.DecodeRune(c.src[c.pSrc:])
	oldPDst := c.pDst
	if !upper(c) {
		return false
	}
	// Only Greek runes get the accent-stripping treatment below.
	if !unicode.Is(unicode.Greek, r) {
		return true
	}
	i := 0
	// Take the properties of the uppercased rune that is already written to the
	// destination. This saves us the trouble of having to uppercase the
	// decomposed rune again.
	if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
		// Restore the destination position and process the decomposed rune.
		r, sz := utf8.DecodeRune(b)
		if r <= 0xFF { // See A.6.1
			return true
		}
		c.pDst = oldPDst
		// Insert the first rune and ignore the modifiers. See A.6.2.
		c.writeBytes(b[:sz])
		// Count the dropped modifiers against the maxIgnorable budget.
		i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
	}

	for ; i < maxIgnorable && c.next(); i++ {
		switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
		// Above and Iota Subscript
		case 0x0300, // U+0300 COMBINING GRAVE ACCENT
			0x0301, // U+0301 COMBINING ACUTE ACCENT
			0x0304, // U+0304 COMBINING MACRON
			0x0306, // U+0306 COMBINING BREVE
			0x0308, // U+0308 COMBINING DIAERESIS
			0x0313, // U+0313 COMBINING COMMA ABOVE
			0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
			0x0342, // U+0342 COMBINING GREEK PERISPOMENI
			0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
			// No-op. Gobble the modifier.

		default:
			switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
			case cccZero:
				c.unreadRune()
				return true

			// We don't need to test for IotaSubscript as the only rune that
			// qualifies (U+0345) was already excluded in the switch statement
			// above. See A.4.

			case cccAbove:
				return c.copy()
			default:
				// Some other modifier. We're still allowed to gobble Greek
				// modifiers after this.
				c.copy()
			}
		}
	}
	// Reaching the lookahead limit counts as success; stopping earlier means
	// c.next returned false.
	return i == maxIgnorable
}

// ltLower lower-cases the current rune with the Lithuanian tailoring: when a
// capital I or J (or a rune whose canonical decomposition starts with I or J)
// is followed by a combining mark with CCC Above, an explicit U+0307 COMBINING
// DOT ABOVE is inserted after the lowercased letter.
func ltLower(c *context) bool {
	// From CLDR:
	// # Introduce an explicit dot above when lowercasing capital I's and J's
	// # whenever there are more accents above.
	// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
	// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
	// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
	// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
	// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
	// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
	// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
	// ::NFD();
	// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
	// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
	// Į } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → į \u0307;
	// Ì → i \u0307 \u0300;
	// Í → i \u0307 \u0301;
	// Ĩ → i \u0307 \u0303;
	// ::Any-Lower();
	// ::NFC();

	i := 0
	if r := c.src[c.pSrc]; r < utf8.RuneSelf {
		// ASCII: only I and J need the lookahead below.
		lower(c)
		if r != 'I' && r != 'J' {
			return true
		}
	} else {
		p := norm.NFD.Properties(c.src[c.pSrc:])
		if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
			// UTF-8 optimization: the decomposition will only have an above
			// modifier if the last rune of the decomposition is in [U+300-U+311].
			// In all other cases, a decomposition starting with I is always
			// an I followed by modifiers that are not cased themselves. See A.2.
			if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
				if !c.writeBytes(d[:1]) {
					return false
				}
				c.dst[c.pDst-1] += 'a' - 'A' // lower

				// Assumption: modifier never changes on lowercase. See A.1.
				// Assumption: all modifiers added have CCC = Above. See A.2.3.
				return c.writeString("\u0307") && c.writeBytes(d[1:])
			}
			// In all other cases the additional modifiers will have a CCC
			// that is less than 230 (Above). We will insert the U+0307, if
			// needed, after these modifiers so that a string in FCD form
			// will remain so. See A.2.2.
			lower(c)
			i = 1
		} else {
			return lower(c)
		}
	}

	// Look ahead for a modifier with CCC Above; it must be preceded by U+0307.
	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccZero:
			c.unreadRune()
			return true
		case cccAbove:
			return c.writeString("\u0307") && c.copy() // See A.1.
		default:
			c.copy() // See A.1.
		}
	}
	return i == maxIgnorable
}

// ltUpper wraps the casing function f with the Lithuanian tailoring: a U+0307
// COMBINING DOT ABOVE found in the modifiers following a Soft_Dotted rune is
// not copied to the destination. For a small set of accents that follow the
// removed dot, an 'I' already written is replaced by the precomposed accented
// capital.
func ltUpper(f mapFunc) mapFunc {
	return func(c *context) bool {
		// From CLDR:
		// ::NFD();
		// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
		// ::Any-Upper();
		// ::NFC();

		// TODO: See A.5. A soft-dotted rune never has an exception. This would
		// allow us to overload the exception bit and encode this property in
		// info. Need to measure performance impact of this.
		r, _ := utf8.DecodeRune(c.src[c.pSrc:])
		oldPDst := c.pDst
		if !f(c) {
			return false
		}
		if !unicode.Is(unicode.Soft_Dotted, r) {
			return true
		}

		// We don't need to do an NFD normalization, as a soft-dotted rune never
		// contains U+0307. See A.3.

		i := 0
		for ; i < maxIgnorable && c.next(); i++ {
			switch c.info.cccType() {
			case cccZero:
				c.unreadRune()
				return true
			case cccAbove:
				if c.hasPrefix("\u0307") {
					// We don't do a full NFC, but rather combine runes for
					// some of the common cases. (Returning NFC or
					// preserving normal form is neither a requirement nor
					// a possibility anyway).
					if !c.next() {
						return false
					}
					// Only combine when the sole output so far is a single 'I'
					// and the next source rune starts with byte 0xcc.
					if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
						s := ""
						switch c.src[c.pSrc+1] {
						case 0x80: // U+0300 COMBINING GRAVE ACCENT
							s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
						case 0x81: // U+0301 COMBINING ACUTE ACCENT
							s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
						case 0x83: // U+0303 COMBINING TILDE
							s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
						case 0x88: // U+0308 COMBINING DIAERESIS
							s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
						default:
						}
						if s != "" {
							// Overwrite the 'I' with the precomposed rune.
							c.pDst = oldPDst
							return c.writeString(s)
						}
					}
				}
				return c.copy()
			default:
				c.copy()
			}
		}
		return i == maxIgnorable
	}
}

// aztrUpper wraps the casing function f with the Azeri/Turkish tailoring: a
// lower-case 'i' is written as "İ"; every other rune is handled by f.
func aztrUpper(f mapFunc) mapFunc {
	return func(c *context) bool {
		// i→İ;
		if c.src[c.pSrc] == 'i' {
			return c.writeString("İ")
		}
		return f(c)
	}
}

// aztrLower lower-cases the current rune with the Azeri/Turkish tailoring. "İ"
// is written as "i". A capital 'I' is written as "i", dropping the U+0307, if
// a U+0307 follows within the lookahead; otherwise it is written as "ı". All
// other runes are handled by lower. The result done is false when the source
// ran out before the form of 'I' could be decided.
func aztrLower(c *context) (done bool) {
	// From CLDR:
	// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
	// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	// İ→i;
	// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
	// # This matches the behavior of the canonically equivalent I-dot_above
	// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
	// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
	// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
	// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
	// I→ı ;
	// ::Any-Lower();
	if c.hasPrefix("\u0130") { // İ
		return c.writeString("i")
	}
	if c.src[c.pSrc] != 'I' {
		return lower(c)
	}

	// We ignore the lower-case I for now, but insert it later when we know
	// which form we need.
	start := c.pSrc + c.sz

	i := 0
Loop:
	// We check for up to n ignorables before \u0307. As \u0307 is an
	// ignorable as well, n is maxIgnorable-1.
	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccAbove:
			if c.hasPrefix("\u0307") {
				return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
			}
			done = true
			break Loop
		case cccZero:
			c.unreadRune()
			done = true
			break Loop
		default:
			// We'll write this rune after we know which starter to use.
		}
	}
	if i == maxIgnorable {
		done = true
	}
	// No U+0307 was found: emit the dotless form followed by the source bytes
	// that were skipped over.
	return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
}

// nlTitle title-cases the current rune with the Dutch tailoring: an initial
// 'I' or 'i' is written as "I" and, if the next byte is 'j' or 'J', that is
// written as "J" as well. All other runes are handled by title. It returns
// false if "I" cannot be written or no rune follows the initial I.
func nlTitle(c *context) bool {
	// From CLDR:
	// # Special titlecasing for Dutch initial "ij".
	// ::Any-Title();
	// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
	// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
	if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
		return title(c)
	}

	if !c.writeString("I") || !c.next() {
		return false
	}
	if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
		return c.writeString("J")
	}
	// Not a "j": give the rune back so it is processed normally.
	c.unreadRune()
	return true
}

// afnlRewrite is the title-casing rewrite hook for Afrikaans and Dutch: when
// the current position starts with an apostrophe (' or ’) it marks the
// context as being mid-word.
//
// Not part of CLDR, but see http://unicode.org/cldr/trac/ticket/7078.
func afnlRewrite(c *context) {
	if c.hasPrefix("'") || c.hasPrefix("’") {
		c.isMidWord = true
	}
}