mirror of https://github.com/golang/go.git
regexp: document and implement that invalid UTF-8 bytes are the same as U+FFFD
What should it mean to run a regexp match on invalid UTF-8 bytes?
The coherent behavior options are:
1. Invalid UTF-8 does not match any character classes,
nor a U+FFFD literal (nor \x{fffd}).
2. Each byte of invalid UTF-8 is treated identically to a U+FFFD in the input,
as a loop calling utf8.DecodeRune would see it.
RE2 uses Rule 1.
Because it works a byte at a time, it can also provide \C to match any
single byte of input, which matches invalid UTF-8 as well.
This provides the nice property that a match for a regexp without \C
is guaranteed to be valid UTF-8.
Unfortunately, today Go has an incoherent mix of these two, although it
mostly follows Rule 2. This is a deviation from RE2, and it gives up the
valid-UTF-8 guarantee above, but we probably can't correct that at this point.
In particular .* already matches entire inputs today, valid UTF-8 or
not, and I doubt we can break that.
This CL adopts Rule 2 officially, fixing the few places that deviate from it.
Fixes #48749.
Change-Id: I96402527c5dfb1146212f568ffa09dde91d71244
Reviewed-on: https://go-review.googlesource.com/c/go/+/354569
Trust: Russ Cox <rsc@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Rob Pike <r@golang.org>
This commit is contained in:
parent
34f7b1f841
commit
702e337174
|
|
@ -372,6 +372,9 @@ var literalPrefixTests = []MetaTest{
|
|||
{`^^0$$`, ``, ``, false},
|
||||
{`^$^$`, ``, ``, false},
|
||||
{`$$0^^`, ``, ``, false},
|
||||
{`a\x{fffd}b`, ``, `a`, false},
|
||||
{`\x{fffd}b`, ``, ``, false},
|
||||
{"\ufffd", ``, ``, false},
|
||||
}
|
||||
|
||||
func TestQuoteMeta(t *testing.T) {
|
||||
|
|
|
|||
|
|
@ -116,6 +116,13 @@ var findTests = []FindTest{
|
|||
{"\\`", "`", build(1, 0, 1)},
|
||||
{"[\\`]+", "`", build(1, 0, 1)},
|
||||
|
||||
{"\ufffd", "\xff", build(1, 0, 1)},
|
||||
{"\ufffd", "hello\xffworld", build(1, 5, 6)},
|
||||
{`.*`, "hello\xffworld", build(1, 0, 11)},
|
||||
{`\x{fffd}`, "\xc2\x00", build(1, 0, 1)},
|
||||
{"[\ufffd]", "\xff", build(1, 0, 1)},
|
||||
{`[\x{fffd}]`, "\xc2\x00", build(1, 0, 1)},
|
||||
|
||||
// long set of matches (longer than startSize)
|
||||
{
|
||||
".",
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import (
|
|||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// "One-pass" regexp execution.
|
||||
|
|
@ -55,7 +56,7 @@ func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
|
|||
|
||||
// Have prefix; gather characters.
|
||||
var buf strings.Builder
|
||||
for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
|
||||
for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
|
||||
buf.WriteRune(i.Rune[0])
|
||||
pc, i = i.Out, &p.Inst[i.Out]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@
|
|||
// or any book about automata theory.
|
||||
//
|
||||
// All characters are UTF-8-encoded code points.
|
||||
// Following utf8.DecodeRune, each byte of an invalid UTF-8 sequence
|
||||
// is treated as if it encoded utf8.RuneError (U+FFFD).
|
||||
//
|
||||
// There are 16 methods of Regexp that match a regular expression and identify
|
||||
// the matched text. Their names are matched by this regular expression:
|
||||
|
|
@ -276,7 +278,11 @@ func minInputLen(re *syntax.Regexp) int {
|
|||
case syntax.OpLiteral:
|
||||
l := 0
|
||||
for _, r := range re.Rune {
|
||||
l += utf8.RuneLen(r)
|
||||
if r == utf8.RuneError {
|
||||
l++
|
||||
} else {
|
||||
l += utf8.RuneLen(r)
|
||||
}
|
||||
}
|
||||
return l
|
||||
case syntax.OpCapture, syntax.OpPlus:
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import (
|
|||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Compiled program.
|
||||
|
|
@ -154,7 +155,7 @@ func (p *Prog) Prefix() (prefix string, complete bool) {
|
|||
|
||||
// Have prefix; gather characters.
|
||||
var buf strings.Builder
|
||||
for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
|
||||
for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
|
||||
buf.WriteRune(i.Rune[0])
|
||||
i = p.skipNop(i.Out)
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue