mirror of https://github.com/golang/go.git
regexp/syntax: recognize category aliases like \p{Letter}
The Unicode specification defines aliases for some of the general
category names. For example the category "L" has alias "Letter".
The regexp package supports \p{L} but not \p{Letter}, because there
was nothing in the Unicode tables that lets regexp know about Letter.
Now that package unicode provides CategoryAliases (see #70780),
we can use it to provide \p{Letter} as well.
This is the only feature missing from making package regexp suitable
for use in a JSON-API Schema implementation. (The official test suite
includes usage of aliases like \p{Letter} instead of \p{L}.)
For better conformity with Unicode TR18, also accept case-insensitive
matches for names and ignore underscores, hyphens, and spaces;
and add Any, ASCII, and Assigned.
Fixes #70781.
Change-Id: I50ff024d99255338fa8d92663881acb47f1e92a5
Reviewed-on: https://go-review.googlesource.com/c/go/+/641377
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
This commit is contained in:
parent
28fd9fa8a6
commit
930cf59ba8
|
|
@ -0,0 +1,4 @@
|
|||
The `\p{name}` and `\P{name}` character class syntaxes now accept the names
|
||||
Any, ASCII, Assigned, Cn, and LC, as well as Unicode category aliases like `\p{Letter}` for `\pL`.
|
||||
Following [Unicode TR18](https://unicode.org/reports/tr18/), they also now use
|
||||
case-insensitive name lookups, ignoring spaces, underscores, and hyphens.
|
||||
|
|
@ -137,6 +137,7 @@ ASCII character classes:
|
|||
[[:word:]] word characters (== [0-9A-Za-z_])
|
||||
[[:xdigit:]] hex digit (== [0-9A-Fa-f])
|
||||
|
||||
Unicode character classes are those in [unicode.Categories] and [unicode.Scripts].
|
||||
Unicode character classes are those in [unicode.Categories],
|
||||
[unicode.CategoryAliases], and [unicode.Scripts].
|
||||
*/
|
||||
package syntax
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ package syntax
|
|||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
|
@ -1639,20 +1640,109 @@ var anyTable = &unicode.RangeTable{
|
|||
R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
|
||||
}
|
||||
|
||||
var asciiTable = &unicode.RangeTable{
|
||||
R16: []unicode.Range16{{Lo: 0, Hi: 0x7F, Stride: 1}},
|
||||
}
|
||||
|
||||
var asciiFoldTable = &unicode.RangeTable{
|
||||
R16: []unicode.Range16{
|
||||
{Lo: 0, Hi: 0x7F, Stride: 1},
|
||||
{Lo: 0x017F, Hi: 0x017F, Stride: 1}, // Old English long s (ſ), folds to S/s.
|
||||
{Lo: 0x212A, Hi: 0x212A, Stride: 1}, // Kelvin K, folds to K/k.
|
||||
},
|
||||
}
|
||||
|
||||
// categoryAliases is a lazily constructed copy of unicode.CategoryAliases
|
||||
// but with the keys passed through canonicalName, to support inexact matches.
|
||||
var categoryAliases struct {
|
||||
once sync.Once
|
||||
m map[string]string
|
||||
}
|
||||
|
||||
// initCategoryAliases initializes categoryAliases by canonicalizing unicode.CategoryAliases.
|
||||
func initCategoryAliases() {
|
||||
categoryAliases.m = make(map[string]string)
|
||||
for name, actual := range unicode.CategoryAliases {
|
||||
categoryAliases.m[canonicalName(name)] = actual
|
||||
}
|
||||
}
|
||||
|
||||
// canonicalName returns the canonical lookup string for name.
|
||||
// The canonical name has a leading uppercase letter and then lowercase letters,
|
||||
// and it omits all underscores, spaces, and hyphens.
|
||||
// (We could have used all lowercase, but this way most package unicode
|
||||
// map keys are already canonical.)
|
||||
func canonicalName(name string) string {
|
||||
var b []byte
|
||||
first := true
|
||||
for i := range len(name) {
|
||||
c := name[i]
|
||||
switch {
|
||||
case c == '_' || c == '-' || c == ' ':
|
||||
c = ' '
|
||||
case first:
|
||||
if 'a' <= c && c <= 'z' {
|
||||
c -= 'a' - 'A'
|
||||
}
|
||||
first = false
|
||||
default:
|
||||
if 'A' <= c && c <= 'Z' {
|
||||
c += 'a' - 'A'
|
||||
}
|
||||
}
|
||||
if b == nil {
|
||||
if c == name[i] && c != ' ' {
|
||||
// No changes so far, avoid allocating b.
|
||||
continue
|
||||
}
|
||||
b = make([]byte, i, len(name))
|
||||
copy(b, name[:i])
|
||||
}
|
||||
if c == ' ' {
|
||||
continue
|
||||
}
|
||||
b = append(b, c)
|
||||
}
|
||||
if b == nil {
|
||||
return name
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
// unicodeTable returns the unicode.RangeTable identified by name
|
||||
// and the table of additional fold-equivalent code points.
|
||||
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
|
||||
// Special case: "Any" means any.
|
||||
if name == "Any" {
|
||||
return anyTable, anyTable
|
||||
// If sign < 0, the result should be inverted.
|
||||
func unicodeTable(name string) (tab, fold *unicode.RangeTable, sign int) {
|
||||
name = canonicalName(name)
|
||||
|
||||
// Special cases: Any, Assigned, and ASCII.
|
||||
// Also LC is the only non-canonical Categories key, so handle it here.
|
||||
switch name {
|
||||
case "Any":
|
||||
return anyTable, anyTable, +1
|
||||
case "Assigned":
|
||||
return unicode.Cn, unicode.Cn, -1 // invert Cn (unassigned)
|
||||
case "Ascii":
|
||||
return asciiTable, asciiFoldTable, +1
|
||||
case "Lc":
|
||||
return unicode.Categories["LC"], unicode.FoldCategory["LC"], +1
|
||||
}
|
||||
if t := unicode.Categories[name]; t != nil {
|
||||
return t, unicode.FoldCategory[name]
|
||||
return t, unicode.FoldCategory[name], +1
|
||||
}
|
||||
if t := unicode.Scripts[name]; t != nil {
|
||||
return t, unicode.FoldScript[name]
|
||||
return t, unicode.FoldScript[name], +1
|
||||
}
|
||||
return nil, nil
|
||||
|
||||
// unicode.CategoryAliases makes liberal use of underscores in its names
|
||||
// (they are defined that way by Unicode), but we want to match ignoring
|
||||
// the underscores, so make our own map with canonical names.
|
||||
categoryAliases.once.Do(initCategoryAliases)
|
||||
if actual := categoryAliases.m[name]; actual != "" {
|
||||
t := unicode.Categories[actual]
|
||||
return t, unicode.FoldCategory[actual], +1
|
||||
}
|
||||
return nil, nil, 0
|
||||
}
|
||||
|
||||
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
|
||||
|
|
@ -1700,10 +1790,13 @@ func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string,
|
|||
name = name[1:]
|
||||
}
|
||||
|
||||
tab, fold := unicodeTable(name)
|
||||
tab, fold, tsign := unicodeTable(name)
|
||||
if tab == nil {
|
||||
return nil, "", &Error{ErrInvalidCharRange, seq}
|
||||
}
|
||||
if tsign < 0 {
|
||||
sign = -sign
|
||||
}
|
||||
|
||||
if p.flags&FoldCase == 0 || fold == nil {
|
||||
if sign > 0 {
|
||||
|
|
|
|||
|
|
@ -107,10 +107,16 @@ var parseTests = []parseTest{
|
|||
{`[\P{^Braille}]`, `cc{0x2800-0x28ff}`},
|
||||
{`[\pZ]`, `cc{0x20 0xa0 0x1680 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}`},
|
||||
{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
|
||||
{`\p{Uppercase_Letter}`, mkCharClass(unicode.IsUpper)},
|
||||
{`\p{upper case-let ter}`, mkCharClass(unicode.IsUpper)},
|
||||
{`\p{__upper case-let ter}`, mkCharClass(unicode.IsUpper)},
|
||||
{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
|
||||
{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
|
||||
{`\p{Any}`, `dot{}`},
|
||||
{`\p{^Any}`, `cc{}`},
|
||||
{`(?i)\p{ascii}`, `cc{0x0-0x7f 0x17f 0x212a}`},
|
||||
{`\p{Assigned}`, mkCharClass(func(r rune) bool { return !unicode.In(r, unicode.Cn) })},
|
||||
{`\p{^Assigned}`, mkCharClass(func(r rune) bool { return unicode.In(r, unicode.Cn) })},
|
||||
|
||||
// Hex, octal.
|
||||
{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
|
||||
|
|
|
|||
Loading…
Reference in New Issue