diff --git a/src/unicode/letter.go b/src/unicode/letter.go index 9e2cead631..3959314c97 100644 --- a/src/unicode/letter.go +++ b/src/unicode/letter.go @@ -206,34 +206,17 @@ func IsTitle(r rune) bool { return isExcludingLatin(Title, r) } -// to maps the rune using the specified case mapping. -// It additionally reports whether caseRange contained a mapping for r. -func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) { - if _case < 0 || MaxCase <= _case { - return ReplacementChar, false // as reasonable an error as any - } +// lookupCaseRange returns the CaseRange mapping for rune r or nil if no +// mapping exists for r. +func lookupCaseRange(r rune, caseRange []CaseRange) *CaseRange { // binary search over ranges lo := 0 hi := len(caseRange) for lo < hi { m := int(uint(lo+hi) >> 1) - cr := caseRange[m] + cr := &caseRange[m] if rune(cr.Lo) <= r && r <= rune(cr.Hi) { - delta := cr.Delta[_case] - if delta > MaxRune { - // In an Upper-Lower sequence, which always starts with - // an UpperCase letter, the real deltas always look like: - // {0, 1, 0} UpperCase (Lower is next) - // {-1, 0, -1} LowerCase (Upper, Title are previous) - // The characters at even offsets from the beginning of the - // sequence are upper case; the ones at odd offsets are lower. - // The correct mapping can be done by clearing or setting the low - // bit in the sequence offset. - // The constants UpperCase and TitleCase are even while LowerCase - // is odd so we take the low bit from _case. - return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)), true - } - return r + delta, true + return cr } if r < rune(cr.Lo) { hi = m @@ -241,6 +224,37 @@ func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping lo = m + 1 } } + return nil +} + +// convertCase converts r to _case using CaseRange cr. +func convertCase(_case int, r rune, cr *CaseRange) rune { + delta := cr.Delta[_case] + if delta > MaxRune { + // In an Upper-Lower sequence, which always starts with + // an UpperCase letter, the real deltas always look like: + // {0, 1, 0} UpperCase (Lower is next) + // {-1, 0, -1} LowerCase (Upper, Title are previous) + // The characters at even offsets from the beginning of the + // sequence are upper case; the ones at odd offsets are lower. + // The correct mapping can be done by clearing or setting the low + // bit in the sequence offset. + // The constants UpperCase and TitleCase are even while LowerCase + // is odd so we take the low bit from _case. + return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)) + } + return r + delta +} + +// to maps the rune using the specified case mapping. +// It additionally reports whether caseRange contained a mapping for r. +func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) { + if _case < 0 || MaxCase <= _case { + return ReplacementChar, false // as reasonable an error as any + } + if cr := lookupCaseRange(r, caseRange); cr != nil { + return convertCase(_case, r, cr), true + } return r, false } @@ -364,8 +378,11 @@ func SimpleFold(r rune) rune { // No folding specified. This is a one- or two-element // equivalence class containing rune and ToLower(rune) // and ToUpper(rune) if they are different from rune. - if l := ToLower(r); l != r { - return l + if cr := lookupCaseRange(r, CaseRanges); cr != nil { + if l := convertCase(LowerCase, r, cr); l != r { + return l + } + return convertCase(UpperCase, r, cr) } - return ToUpper(r) + return r } diff --git a/src/unicode/letter_test.go b/src/unicode/letter_test.go index 123f9a642e..75c8aeee90 100644 --- a/src/unicode/letter_test.go +++ b/src/unicode/letter_test.go @@ -642,3 +642,29 @@ func TestNegativeRune(t *testing.T) { } } } + +func BenchmarkToUpper(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = ToUpper('δ') + } +} + +func BenchmarkToLower(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = ToLower('Δ') + } +} + +func BenchmarkSimpleFold(b *testing.B) { + bench := func(name string, r rune) { + b.Run(name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = SimpleFold(r) + } + }) + } + bench("Upper", 'Δ') + bench("Lower", 'δ') + bench("Fold", '\u212A') + bench("NoFold", '習') +}