mirror of https://github.com/golang/go.git
exp/regexp: implement regexp API using exp/regexp/syntax
Still need to write tests for the new syntax and fix the bugs that those tests find, but this is a good checkpoint. All tests pass.

Compared against the existing regexp:

benchmark                                      old ns/op   new ns/op   delta
regexp.BenchmarkLiteral                             1869         620   -66.83%
regexp.BenchmarkNotLiteral                          9489        7823   -17.56%
regexp.BenchmarkMatchClass                         10372        8386   -19.15%
regexp.BenchmarkMatchClass_InRange                 10800        7750   -28.24%
regexp.BenchmarkReplaceAll                         13492        8519   -36.86%
regexp.BenchmarkAnchoredLiteralShortNonMatch         747         339   -54.62%
regexp.BenchmarkAnchoredLiteralLongNonMatch          599         335   -44.07%
regexp.BenchmarkAnchoredShortMatch                  2137         917   -57.09%
regexp.BenchmarkAnchoredLongMatch                   2029         917   -54.81%

R=r, r
CC=golang-dev, sam.thorogood
https://golang.org/cl/4820046
This commit is contained in:
parent
fc2480da3c
commit
a1e7cd97d5
|
|
@ -0,0 +1,12 @@
|
|||
# Copyright 2011 The Go Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
include ../../../Make.inc
|
||||
|
||||
TARG=exp/regexp
|
||||
GOFILES=\
|
||||
exec.go\
|
||||
regexp.go\
|
||||
|
||||
include ../../../Make.pkg
|
||||
|
|
@ -0,0 +1,429 @@
|
|||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package regexp
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// good_re lists patterns that are expected to compile without error.
var good_re = []string{
	``,
	`.`,
	`^.$`,
	`a`,
	`a*`,
	`a+`,
	`a?`,
	`a|b`,
	`a*|b*`,
	`(a*|b)(c*|d)`,
	`[a-z]`,
	`[a-abc-c\-\]\[]`,
	`[a-z]+`,
	`[abc]`,
	`[^1234]`,
	`[^\n]`,
	`\!\\`,
}
|
||||
|
||||
/*
|
||||
type stringError struct {
|
||||
re string
|
||||
err os.Error
|
||||
}
|
||||
|
||||
var bad_re = []stringError{
|
||||
{`*`, ErrBareClosure},
|
||||
{`+`, ErrBareClosure},
|
||||
{`?`, ErrBareClosure},
|
||||
{`(abc`, ErrUnmatchedLpar},
|
||||
{`abc)`, ErrUnmatchedRpar},
|
||||
{`x[a-z`, ErrUnmatchedLbkt},
|
||||
{`abc]`, ErrUnmatchedRbkt},
|
||||
{`[z-a]`, ErrBadRange},
|
||||
{`abc\`, ErrExtraneousBackslash},
|
||||
{`a**`, ErrBadClosure},
|
||||
{`a*+`, ErrBadClosure},
|
||||
{`a??`, ErrBadClosure},
|
||||
{`\x`, ErrBadBackslash},
|
||||
}
|
||||
*/
|
||||
|
||||
func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
|
||||
re, err := Compile(expr)
|
||||
if err != error {
|
||||
t.Error("compiling `", expr, "`; unexpected error: ", err.String())
|
||||
}
|
||||
return re
|
||||
}
|
||||
|
||||
func TestGoodCompile(t *testing.T) {
|
||||
for i := 0; i < len(good_re); i++ {
|
||||
compileTest(t, good_re[i], nil)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
func TestBadCompile(t *testing.T) {
|
||||
for i := 0; i < len(bad_re); i++ {
|
||||
compileTest(t, bad_re[i].re, bad_re[i].err)
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
func matchTest(t *testing.T, test *FindTest) {
|
||||
re := compileTest(t, test.pat, nil)
|
||||
if re == nil {
|
||||
return
|
||||
}
|
||||
m := re.MatchString(test.text)
|
||||
if m != (len(test.matches) > 0) {
|
||||
t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0)
|
||||
}
|
||||
// now try bytes
|
||||
m = re.Match([]byte(test.text))
|
||||
if m != (len(test.matches) > 0) {
|
||||
t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatch(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
matchTest(t, &test)
|
||||
}
|
||||
}
|
||||
|
||||
func matchFunctionTest(t *testing.T, test *FindTest) {
|
||||
m, err := MatchString(test.pat, test.text)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
if m != (len(test.matches) > 0) {
|
||||
t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchFunction(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
matchFunctionTest(t, &test)
|
||||
}
|
||||
}
|
||||
|
||||
// ReplaceTest is one replace-all case: replacing every match of pattern
// in input with replacement is expected to yield output.
type ReplaceTest struct {
	pattern     string
	replacement string
	input       string
	output      string
}
|
||||
|
||||
var replaceTests = []ReplaceTest{
|
||||
// Test empty input and/or replacement, with pattern that matches the empty string.
|
||||
{"", "", "", ""},
|
||||
{"", "x", "", "x"},
|
||||
{"", "", "abc", "abc"},
|
||||
{"", "x", "abc", "xaxbxcx"},
|
||||
|
||||
// Test empty input and/or replacement, with pattern that does not match the empty string.
|
||||
{"b", "", "", ""},
|
||||
{"b", "x", "", ""},
|
||||
{"b", "", "abc", "ac"},
|
||||
{"b", "x", "abc", "axc"},
|
||||
{"y", "", "", ""},
|
||||
{"y", "x", "", ""},
|
||||
{"y", "", "abc", "abc"},
|
||||
{"y", "x", "abc", "abc"},
|
||||
|
||||
// Multibyte characters -- verify that we don't try to match in the middle
|
||||
// of a character.
|
||||
{"[a-c]*", "x", "\u65e5", "x\u65e5x"},
|
||||
{"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"},
|
||||
|
||||
// Start and end of a string.
|
||||
{"^[a-c]*", "x", "abcdabc", "xdabc"},
|
||||
{"[a-c]*$", "x", "abcdabc", "abcdx"},
|
||||
{"^[a-c]*$", "x", "abcdabc", "abcdabc"},
|
||||
{"^[a-c]*", "x", "abc", "x"},
|
||||
{"[a-c]*$", "x", "abc", "x"},
|
||||
{"^[a-c]*$", "x", "abc", "x"},
|
||||
{"^[a-c]*", "x", "dabce", "xdabce"},
|
||||
{"[a-c]*$", "x", "dabce", "dabcex"},
|
||||
{"^[a-c]*$", "x", "dabce", "dabce"},
|
||||
{"^[a-c]*", "x", "", "x"},
|
||||
{"[a-c]*$", "x", "", "x"},
|
||||
{"^[a-c]*$", "x", "", "x"},
|
||||
|
||||
{"^[a-c]+", "x", "abcdabc", "xdabc"},
|
||||
{"[a-c]+$", "x", "abcdabc", "abcdx"},
|
||||
{"^[a-c]+$", "x", "abcdabc", "abcdabc"},
|
||||
{"^[a-c]+", "x", "abc", "x"},
|
||||
{"[a-c]+$", "x", "abc", "x"},
|
||||
{"^[a-c]+$", "x", "abc", "x"},
|
||||
{"^[a-c]+", "x", "dabce", "dabce"},
|
||||
{"[a-c]+$", "x", "dabce", "dabce"},
|
||||
{"^[a-c]+$", "x", "dabce", "dabce"},
|
||||
{"^[a-c]+", "x", "", ""},
|
||||
{"[a-c]+$", "x", "", ""},
|
||||
{"^[a-c]+$", "x", "", ""},
|
||||
|
||||
// Other cases.
|
||||
{"abc", "def", "abcdefg", "defdefg"},
|
||||
{"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"},
|
||||
{"abc", "", "abcdabc", "d"},
|
||||
{"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"},
|
||||
{"abc", "d", "", ""},
|
||||
{"abc", "d", "abc", "d"},
|
||||
{".+", "x", "abc", "x"},
|
||||
{"[a-c]*", "x", "def", "xdxexfx"},
|
||||
{"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"},
|
||||
{"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"},
|
||||
}
|
||||
|
||||
// ReplaceFuncTest is one function-based replace-all case: replacing
// every match of pattern in input with the result of calling
// replacement on the matched text is expected to yield output.
type ReplaceFuncTest struct {
	pattern     string
	replacement func(string) string
	input       string
	output      string
}
|
||||
|
||||
var replaceFuncTests = []ReplaceFuncTest{
|
||||
{"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"},
|
||||
{"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"},
|
||||
{"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"},
|
||||
}
|
||||
|
||||
func TestReplaceAll(t *testing.T) {
|
||||
for _, tc := range replaceTests {
|
||||
re, err := Compile(tc.pattern)
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
|
||||
continue
|
||||
}
|
||||
actual := re.ReplaceAllString(tc.input, tc.replacement)
|
||||
if actual != tc.output {
|
||||
t.Errorf("%q.Replace(%q,%q) = %q; want %q",
|
||||
tc.pattern, tc.input, tc.replacement, actual, tc.output)
|
||||
}
|
||||
// now try bytes
|
||||
actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement)))
|
||||
if actual != tc.output {
|
||||
t.Errorf("%q.Replace(%q,%q) = %q; want %q",
|
||||
tc.pattern, tc.input, tc.replacement, actual, tc.output)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestReplaceAllFunc(t *testing.T) {
|
||||
for _, tc := range replaceFuncTests {
|
||||
re, err := Compile(tc.pattern)
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
|
||||
continue
|
||||
}
|
||||
actual := re.ReplaceAllStringFunc(tc.input, tc.replacement)
|
||||
if actual != tc.output {
|
||||
t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q",
|
||||
tc.pattern, tc.input, tc.replacement, actual, tc.output)
|
||||
}
|
||||
// now try bytes
|
||||
actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) }))
|
||||
if actual != tc.output {
|
||||
t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q",
|
||||
tc.pattern, tc.input, tc.replacement, actual, tc.output)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MetaTest is one case shared by TestQuoteMeta and TestLiteralPrefix.
// output is the expected QuoteMeta(pattern); literal and isLiteral are
// the expected results of LiteralPrefix on the compiled pattern.
type MetaTest struct {
	pattern   string
	output    string
	literal   string
	isLiteral bool
}
|
||||
|
||||
var metaTests = []MetaTest{
|
||||
{``, ``, ``, true},
|
||||
{`foo`, `foo`, `foo`, true},
|
||||
{`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator
|
||||
{`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators
|
||||
{`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false},
|
||||
}
|
||||
|
||||
func TestQuoteMeta(t *testing.T) {
|
||||
for _, tc := range metaTests {
|
||||
// Verify that QuoteMeta returns the expected string.
|
||||
quoted := QuoteMeta(tc.pattern)
|
||||
if quoted != tc.output {
|
||||
t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`",
|
||||
tc.pattern, quoted, tc.output)
|
||||
continue
|
||||
}
|
||||
|
||||
// Verify that the quoted string is in fact treated as expected
|
||||
// by Compile -- i.e. that it matches the original, unquoted string.
|
||||
if tc.pattern != "" {
|
||||
re, err := Compile(quoted)
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err)
|
||||
continue
|
||||
}
|
||||
src := "abc" + tc.pattern + "def"
|
||||
repl := "xyz"
|
||||
replaced := re.ReplaceAllString(src, repl)
|
||||
expected := "abcxyzdef"
|
||||
if replaced != expected {
|
||||
t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`",
|
||||
tc.pattern, src, repl, replaced, expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLiteralPrefix(t *testing.T) {
|
||||
for _, tc := range metaTests {
|
||||
// Literal method needs to scan the pattern.
|
||||
re := MustCompile(tc.pattern)
|
||||
str, complete := re.LiteralPrefix()
|
||||
if complete != tc.isLiteral {
|
||||
t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral)
|
||||
}
|
||||
if str != tc.literal {
|
||||
t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// numSubexpCase pairs a pattern with the value NumSubexp is expected to
// return for it.
type numSubexpCase struct {
	input    string // pattern to compile
	expected int    // expected NumSubexp result
}
|
||||
|
||||
var numSubexpCases = []numSubexpCase{
|
||||
{``, 0},
|
||||
{`.*`, 0},
|
||||
{`abba`, 0},
|
||||
{`ab(b)a`, 1},
|
||||
{`ab(.*)a`, 1},
|
||||
{`(.*)ab(.*)a`, 2},
|
||||
{`(.*)(ab)(.*)a`, 3},
|
||||
{`(.*)((a)b)(.*)a`, 4},
|
||||
{`(.*)(\(ab)(.*)a`, 3},
|
||||
{`(.*)(\(a\)b)(.*)a`, 3},
|
||||
}
|
||||
|
||||
func TestNumSubexp(t *testing.T) {
|
||||
for _, c := range numSubexpCases {
|
||||
re := MustCompile(c.input)
|
||||
n := re.NumSubexp()
|
||||
if n != c.expected {
|
||||
t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkLiteral(b *testing.B) {
|
||||
x := strings.Repeat("x", 50) + "y"
|
||||
b.StopTimer()
|
||||
re := MustCompile("y")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
if !re.MatchString(x) {
|
||||
println("no match!")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNotLiteral(b *testing.B) {
|
||||
x := strings.Repeat("x", 50) + "y"
|
||||
b.StopTimer()
|
||||
re := MustCompile(".y")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
if !re.MatchString(x) {
|
||||
println("no match!")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMatchClass(b *testing.B) {
|
||||
b.StopTimer()
|
||||
x := strings.Repeat("xxxx", 20) + "w"
|
||||
re := MustCompile("[abcdw]")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
if !re.MatchString(x) {
|
||||
println("no match!")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMatchClass_InRange(b *testing.B) {
|
||||
b.StopTimer()
|
||||
// 'b' is between 'a' and 'c', so the charclass
|
||||
// range checking is no help here.
|
||||
x := strings.Repeat("bbbb", 20) + "c"
|
||||
re := MustCompile("[ac]")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
if !re.MatchString(x) {
|
||||
println("no match!")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkReplaceAll(b *testing.B) {
|
||||
x := "abcdefghijklmnopqrstuvwxyz"
|
||||
b.StopTimer()
|
||||
re := MustCompile("[cjrw]")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
re.ReplaceAllString(x, "")
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
|
||||
b.StopTimer()
|
||||
x := []byte("abcdefghijklmnopqrstuvwxyz")
|
||||
re := MustCompile("^zbc(d|e)")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
re.Match(x)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
|
||||
b.StopTimer()
|
||||
x := []byte("abcdefghijklmnopqrstuvwxyz")
|
||||
for i := 0; i < 15; i++ {
|
||||
x = append(x, x...)
|
||||
}
|
||||
re := MustCompile("^zbc(d|e)")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
re.Match(x)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkAnchoredShortMatch(b *testing.B) {
|
||||
b.StopTimer()
|
||||
x := []byte("abcdefghijklmnopqrstuvwxyz")
|
||||
re := MustCompile("^.bc(d|e)")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
re.Match(x)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkAnchoredLongMatch(b *testing.B) {
|
||||
b.StopTimer()
|
||||
x := []byte("abcdefghijklmnopqrstuvwxyz")
|
||||
for i := 0; i < 15; i++ {
|
||||
x = append(x, x...)
|
||||
}
|
||||
re := MustCompile("^.bc(d|e)")
|
||||
b.StartTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
re.Match(x)
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,295 @@
|
|||
package regexp
|
||||
|
||||
import "exp/regexp/syntax"
|
||||
|
||||
// A queue is a 'sparse array' holding pending threads of execution.
|
||||
// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
|
||||
type queue struct {
|
||||
sparse []uint32
|
||||
dense []entry
|
||||
}
|
||||
|
||||
// A entry is an entry on a queue.
|
||||
// It holds both the instruction pc and the actual thread.
|
||||
// Some queue entries are just place holders so that the machine
|
||||
// knows it has considered that pc. Such entries have t == nil.
|
||||
type entry struct {
|
||||
pc uint32
|
||||
t *thread
|
||||
}
|
||||
|
||||
// A thread is the state of a single path through the machine:
|
||||
// an instruction and a corresponding capture array.
|
||||
// See http://swtch.com/~rsc/regexp/regexp2.html
|
||||
type thread struct {
|
||||
inst *syntax.Inst
|
||||
cap []int
|
||||
}
|
||||
|
||||
// A machine holds all the state during an NFA simulation for p.
|
||||
type machine struct {
|
||||
re *Regexp // corresponding Regexp
|
||||
p *syntax.Prog // compiled program
|
||||
q0, q1 queue // two queues for runq, nextq
|
||||
pool []*thread // pool of available threads
|
||||
matched bool // whether a match was found
|
||||
matchcap []int // capture information for the match
|
||||
}
|
||||
|
||||
// progMachine returns a new machine running the prog p.
|
||||
func progMachine(p *syntax.Prog) *machine {
|
||||
m := &machine{p: p}
|
||||
n := len(m.p.Inst)
|
||||
m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
|
||||
m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
|
||||
ncap := p.NumCap
|
||||
if ncap < 2 {
|
||||
ncap = 2
|
||||
}
|
||||
m.matchcap = make([]int, ncap)
|
||||
return m
|
||||
}
|
||||
|
||||
// alloc allocates a new thread with the given instruction.
|
||||
// It uses the free pool if possible.
|
||||
func (m *machine) alloc(i *syntax.Inst) *thread {
|
||||
var t *thread
|
||||
if n := len(m.pool); n > 0 {
|
||||
t = m.pool[n-1]
|
||||
m.pool = m.pool[:n-1]
|
||||
} else {
|
||||
t = new(thread)
|
||||
t.cap = make([]int, cap(m.matchcap))
|
||||
}
|
||||
t.cap = t.cap[:len(m.matchcap)]
|
||||
t.inst = i
|
||||
return t
|
||||
}
|
||||
|
||||
// free returns t to the free pool.
|
||||
func (m *machine) free(t *thread) {
|
||||
m.pool = append(m.pool, t)
|
||||
}
|
||||
|
||||
// match runs the machine over the input starting at pos.
|
||||
// It reports whether a match was found.
|
||||
// If so, m.matchcap holds the submatch information.
|
||||
func (m *machine) match(i input, pos int) bool {
|
||||
startCond := m.re.cond
|
||||
if startCond == ^syntax.EmptyOp(0) { // impossible
|
||||
return false
|
||||
}
|
||||
m.matched = false
|
||||
for i := range m.matchcap {
|
||||
m.matchcap[i] = -1
|
||||
}
|
||||
runq, nextq := &m.q0, &m.q1
|
||||
rune, rune1 := endOfText, endOfText
|
||||
width, width1 := 0, 0
|
||||
rune, width = i.step(pos)
|
||||
if rune != endOfText {
|
||||
rune1, width1 = i.step(pos + width)
|
||||
}
|
||||
// TODO: Let caller specify the initial flag setting.
|
||||
// For now assume pos == 0 is beginning of text and
|
||||
// pos != 0 is not even beginning of line.
|
||||
// TODO: Word boundary.
|
||||
var flag syntax.EmptyOp
|
||||
if pos == 0 {
|
||||
flag = syntax.EmptyBeginText | syntax.EmptyBeginLine
|
||||
}
|
||||
|
||||
// Update flag using lookahead rune.
|
||||
if rune1 == '\n' {
|
||||
flag |= syntax.EmptyEndLine
|
||||
}
|
||||
if rune1 == endOfText {
|
||||
flag |= syntax.EmptyEndText
|
||||
}
|
||||
|
||||
for {
|
||||
if len(runq.dense) == 0 {
|
||||
if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
|
||||
// Anchored match, past beginning of text.
|
||||
break
|
||||
}
|
||||
if m.matched {
|
||||
// Have match; finished exploring alternatives.
|
||||
break
|
||||
}
|
||||
if len(m.re.prefix) > 0 && rune1 != m.re.prefixRune && i.canCheckPrefix() {
|
||||
// Match requires literal prefix; fast search for it.
|
||||
advance := i.index(m.re, pos)
|
||||
if advance < 0 {
|
||||
break
|
||||
}
|
||||
pos += advance
|
||||
rune, width = i.step(pos)
|
||||
rune1, width1 = i.step(pos + width)
|
||||
}
|
||||
}
|
||||
if !m.matched {
|
||||
if len(m.matchcap) > 0 {
|
||||
m.matchcap[0] = pos
|
||||
}
|
||||
m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag)
|
||||
}
|
||||
// TODO: word boundary
|
||||
flag = 0
|
||||
if rune == '\n' {
|
||||
flag |= syntax.EmptyBeginLine
|
||||
}
|
||||
if rune1 == '\n' {
|
||||
flag |= syntax.EmptyEndLine
|
||||
}
|
||||
if rune1 == endOfText {
|
||||
flag |= syntax.EmptyEndText
|
||||
}
|
||||
m.step(runq, nextq, pos, pos+width, rune, flag)
|
||||
if width == 0 {
|
||||
break
|
||||
}
|
||||
pos += width
|
||||
rune, width = rune1, width1
|
||||
if rune != endOfText {
|
||||
rune1, width1 = i.step(pos + width)
|
||||
}
|
||||
runq, nextq = nextq, runq
|
||||
}
|
||||
m.clear(nextq)
|
||||
return m.matched
|
||||
}
|
||||
|
||||
// clear frees all threads on the thread queue.
|
||||
func (m *machine) clear(q *queue) {
|
||||
for _, d := range q.dense {
|
||||
if d.t != nil {
|
||||
m.free(d.t)
|
||||
}
|
||||
}
|
||||
q.dense = q.dense[:0]
|
||||
}
|
||||
|
||||
// step executes one step of the machine, running each of the threads
|
||||
// on runq and appending new threads to nextq.
|
||||
// The step processes the rune c (which may be endOfText),
|
||||
// which starts at position pos and ends at nextPos.
|
||||
// nextCond gives the setting for the empty-width flags after c.
|
||||
func (m *machine) step(runq, nextq *queue, pos, nextPos, c int, nextCond syntax.EmptyOp) {
|
||||
for j := 0; j < len(runq.dense); j++ {
|
||||
d := &runq.dense[j]
|
||||
t := d.t
|
||||
if t == nil {
|
||||
continue
|
||||
}
|
||||
/*
|
||||
* If we support leftmost-longest matching:
|
||||
if longest && matched && match[0] < t.cap[0] {
|
||||
m.free(t)
|
||||
continue
|
||||
}
|
||||
*/
|
||||
|
||||
i := t.inst
|
||||
switch i.Op {
|
||||
default:
|
||||
panic("bad inst")
|
||||
|
||||
case syntax.InstMatch:
|
||||
if len(t.cap) > 0 {
|
||||
t.cap[1] = pos
|
||||
copy(m.matchcap, t.cap)
|
||||
}
|
||||
m.matched = true
|
||||
for _, d := range runq.dense[j+1:] {
|
||||
if d.t != nil {
|
||||
m.free(d.t)
|
||||
}
|
||||
}
|
||||
runq.dense = runq.dense[:0]
|
||||
|
||||
case syntax.InstRune:
|
||||
if i.MatchRune(c) {
|
||||
m.add(nextq, i.Out, nextPos, t.cap, nextCond)
|
||||
}
|
||||
}
|
||||
m.free(t)
|
||||
}
|
||||
runq.dense = runq.dense[:0]
|
||||
}
|
||||
|
||||
// add adds an entry to q for pc, unless the q already has such an entry.
|
||||
// It also recursively adds an entry for all instructions reachable from pc by following
|
||||
// empty-width conditions satisfied by cond. pos gives the current position
|
||||
// in the input.
|
||||
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp) {
|
||||
if pc == 0 {
|
||||
return
|
||||
}
|
||||
if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
|
||||
return
|
||||
}
|
||||
|
||||
j := len(q.dense)
|
||||
q.dense = q.dense[:j+1]
|
||||
d := &q.dense[j]
|
||||
d.t = nil
|
||||
d.pc = pc
|
||||
q.sparse[pc] = uint32(j)
|
||||
|
||||
i := &m.p.Inst[pc]
|
||||
switch i.Op {
|
||||
default:
|
||||
panic("unhandled")
|
||||
case syntax.InstFail:
|
||||
// nothing
|
||||
case syntax.InstAlt, syntax.InstAltMatch:
|
||||
m.add(q, i.Out, pos, cap, cond)
|
||||
m.add(q, i.Arg, pos, cap, cond)
|
||||
case syntax.InstEmptyWidth:
|
||||
if syntax.EmptyOp(i.Arg)&^cond == 0 {
|
||||
m.add(q, i.Out, pos, cap, cond)
|
||||
}
|
||||
case syntax.InstNop:
|
||||
m.add(q, i.Out, pos, cap, cond)
|
||||
case syntax.InstCapture:
|
||||
if int(i.Arg) < len(cap) {
|
||||
opos := cap[i.Arg]
|
||||
cap[i.Arg] = pos
|
||||
m.add(q, i.Out, pos, cap, cond)
|
||||
cap[i.Arg] = opos
|
||||
} else {
|
||||
m.add(q, i.Out, pos, cap, cond)
|
||||
}
|
||||
case syntax.InstMatch, syntax.InstRune:
|
||||
t := m.alloc(i)
|
||||
if len(t.cap) > 0 {
|
||||
copy(t.cap, cap)
|
||||
}
|
||||
d.t = t
|
||||
}
|
||||
}
|
||||
|
||||
// empty is a zero-length slice that is not nil. doExecute returns it
// for a successful match when no captures were requested, which avoids
// allocating a fresh slice.
var empty = []int{}
|
||||
|
||||
// doExecute finds the leftmost match in the input and returns
|
||||
// the position of its subexpressions.
|
||||
func (re *Regexp) doExecute(i input, pos int, ncap int) []int {
|
||||
m := re.get()
|
||||
m.matchcap = m.matchcap[:ncap]
|
||||
if !m.match(i, pos) {
|
||||
re.put(m)
|
||||
return nil
|
||||
}
|
||||
if ncap == 0 {
|
||||
re.put(m)
|
||||
return empty // empty but not nil
|
||||
}
|
||||
cap := make([]int, ncap)
|
||||
copy(cap, m.matchcap)
|
||||
re.put(m)
|
||||
return cap
|
||||
}
|
||||
|
|
@ -0,0 +1,472 @@
|
|||
// Copyright 2010 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package regexp
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// FindTest records, for one pattern/text pair, the expected result of
// FindAllStringSubmatchIndex. Nothing else is stored because the other
// expectations can be derived from it: textual results from the indexed
// ones, non-submatch results from the submatch ones, single results
// from the 'all' results, and byte results from the string results.
type FindTest struct {
	pat     string  // pattern
	text    string  // text to search
	matches [][]int // expected FindAllStringSubmatchIndex result
}
|
||||
|
||||
func (t FindTest) String() string {
|
||||
return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text)
|
||||
}
|
||||
|
||||
var findTests = []FindTest{
|
||||
{``, ``, build(1, 0, 0)},
|
||||
{`^abcdefg`, "abcdefg", build(1, 0, 7)},
|
||||
{`a+`, "baaab", build(1, 1, 4)},
|
||||
{"abcd..", "abcdef", build(1, 0, 6)},
|
||||
{`a`, "a", build(1, 0, 1)},
|
||||
{`x`, "y", nil},
|
||||
{`b`, "abc", build(1, 1, 2)},
|
||||
{`.`, "a", build(1, 0, 1)},
|
||||
{`.*`, "abcdef", build(1, 0, 6)},
|
||||
{`^`, "abcde", build(1, 0, 0)},
|
||||
{`$`, "abcde", build(1, 5, 5)},
|
||||
{`^abcd$`, "abcd", build(1, 0, 4)},
|
||||
{`^bcd'`, "abcdef", nil},
|
||||
{`^abcd$`, "abcde", nil},
|
||||
{`a+`, "baaab", build(1, 1, 4)},
|
||||
{`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)},
|
||||
{`[a-z]+`, "abcd", build(1, 0, 4)},
|
||||
{`[^a-z]+`, "ab1234cd", build(1, 2, 6)},
|
||||
{`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)},
|
||||
{`[^\n]+`, "abcd\n", build(1, 0, 4)},
|
||||
{`[日本語]+`, "日本語日本語", build(1, 0, 18)},
|
||||
{`日本語+`, "日本語", build(1, 0, 9)},
|
||||
{`日本語+`, "日本語語語語", build(1, 0, 18)},
|
||||
{`()`, "", build(1, 0, 0, 0, 0)},
|
||||
{`(a)`, "a", build(1, 0, 1, 0, 1)},
|
||||
{`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)},
|
||||
{`(.*)`, "", build(1, 0, 0, 0, 0)},
|
||||
{`(.*)`, "abcd", build(1, 0, 4, 0, 4)},
|
||||
{`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)},
|
||||
{`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)},
|
||||
{`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)},
|
||||
{`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)},
|
||||
{`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)},
|
||||
{`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)},
|
||||
|
||||
{`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)},
|
||||
{`(.*).*`, "ab", build(1, 0, 2, 0, 2)},
|
||||
{`[.]`, ".", build(1, 0, 1)},
|
||||
{`/$`, "/abc/", build(1, 4, 5)},
|
||||
{`/$`, "/abc", nil},
|
||||
|
||||
// multiple matches
|
||||
{`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)},
|
||||
{`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)},
|
||||
{`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)},
|
||||
{`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)},
|
||||
{`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)},
|
||||
|
||||
// fixed bugs
|
||||
{`ab$`, "cab", build(1, 1, 3)},
|
||||
{`axxb$`, "axxcb", nil},
|
||||
{`data`, "daXY data", build(1, 5, 9)},
|
||||
{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
|
||||
{`zx+`, "zzx", build(1, 1, 3)},
|
||||
|
||||
// can backslash-escape any punctuation
|
||||
{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
|
||||
`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
|
||||
{`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`,
|
||||
`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
|
||||
{"\\`", "`", build(1, 0, 1)},
|
||||
{"[\\`]+", "`", build(1, 0, 1)},
|
||||
|
||||
// long set of matches (longer than startSize)
|
||||
{
|
||||
".",
|
||||
"qwertyuiopasdfghjklzxcvbnm1234567890",
|
||||
build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
|
||||
10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20,
|
||||
20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30,
|
||||
30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36),
|
||||
},
|
||||
}
|
||||
|
||||
// build is a helper to construct a [][]int by cutting x into n
// consecutive rows of len(x)/n values each. In the tables each row is
// one match.
//
// It panics with "invalid build entry" when n is not positive or when
// len(x) is not a multiple of n, so a miscounted table entry fails
// loudly instead of being silently truncated.
func build(n int, x ...int) [][]int {
	if n <= 0 || len(x)%n != 0 {
		panic("invalid build entry")
	}
	runLength := len(x) / n
	ret := make([][]int, n)
	for i := range ret {
		ret[i] = make([]int, runLength)
		// copy stops at len(ret[i]), so only this row's values are taken.
		copy(ret[i], x[i*runLength:])
	}
	return ret
}
|
||||
|
||||
// First the simple cases.
|
||||
|
||||
func TestFind(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
re := MustCompile(test.pat)
|
||||
if re.String() != test.pat {
|
||||
t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat)
|
||||
}
|
||||
result := re.Find([]byte(test.text))
|
||||
switch {
|
||||
case len(test.matches) == 0 && len(result) == 0:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case test.matches != nil && result != nil:
|
||||
expect := test.text[test.matches[0][0]:test.matches[0][1]]
|
||||
if expect != string(result) {
|
||||
t.Errorf("expected %q got %q: %s", expect, result, test)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindString(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
result := MustCompile(test.pat).FindString(test.text)
|
||||
switch {
|
||||
case len(test.matches) == 0 && len(result) == 0:
|
||||
// ok
|
||||
case test.matches == nil && result != "":
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == "":
|
||||
// Tricky because an empty result has two meanings: no match or empty match.
|
||||
if test.matches[0][0] != test.matches[0][1] {
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
}
|
||||
case test.matches != nil && result != "":
|
||||
expect := test.text[test.matches[0][0]:test.matches[0][1]]
|
||||
if expect != result {
|
||||
t.Errorf("expected %q got %q: %s", expect, result, test)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testFindIndex(test *FindTest, result []int, t *testing.T) {
|
||||
switch {
|
||||
case len(test.matches) == 0 && len(result) == 0:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case test.matches != nil && result != nil:
|
||||
expect := test.matches[0]
|
||||
if expect[0] != result[0] || expect[1] != result[1] {
|
||||
t.Errorf("expected %v got %v: %s", expect, result, test)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindStringIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindReaderIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t)
|
||||
}
|
||||
}
|
||||
|
||||
// Now come the simple All cases.
|
||||
|
||||
func TestFindAll(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
result := MustCompile(test.pat).FindAll([]byte(test.text), -1)
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Fatalf("expected match; got none: %s", test)
|
||||
case test.matches != nil && result != nil:
|
||||
if len(test.matches) != len(result) {
|
||||
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
|
||||
continue
|
||||
}
|
||||
for k, e := range test.matches {
|
||||
expect := test.text[e[0]:e[1]]
|
||||
if expect != string(result[k]) {
|
||||
t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllString(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
result := MustCompile(test.pat).FindAllString(test.text, -1)
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case test.matches != nil && result != nil:
|
||||
if len(test.matches) != len(result) {
|
||||
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
|
||||
continue
|
||||
}
|
||||
for k, e := range test.matches {
|
||||
expect := test.text[e[0]:e[1]]
|
||||
if expect != result[k] {
|
||||
t.Errorf("expected %q got %q: %s", expect, result, test)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) {
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case test.matches != nil && result != nil:
|
||||
if len(test.matches) != len(result) {
|
||||
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
|
||||
return
|
||||
}
|
||||
for k, e := range test.matches {
|
||||
if e[0] != result[k][0] || e[1] != result[k][1] {
|
||||
t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllStringIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t)
|
||||
}
|
||||
}
|
||||
|
||||
// Now come the Submatch cases.
|
||||
|
||||
func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) {
|
||||
if len(submatches) != len(result)*2 {
|
||||
t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
|
||||
return
|
||||
}
|
||||
for k := 0; k < len(submatches); k += 2 {
|
||||
if submatches[k] == -1 {
|
||||
if result[k/2] != nil {
|
||||
t.Errorf("match %d: expected nil got %q: %s", n, result, test)
|
||||
}
|
||||
continue
|
||||
}
|
||||
expect := test.text[submatches[k]:submatches[k+1]]
|
||||
if expect != string(result[k/2]) {
|
||||
t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindSubmatch(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
result := MustCompile(test.pat).FindSubmatch([]byte(test.text))
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case test.matches != nil && result != nil:
|
||||
testSubmatchBytes(&test, 0, test.matches[0], result, t)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) {
|
||||
if len(submatches) != len(result)*2 {
|
||||
t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
|
||||
return
|
||||
}
|
||||
for k := 0; k < len(submatches); k += 2 {
|
||||
if submatches[k] == -1 {
|
||||
if result[k/2] != "" {
|
||||
t.Errorf("match %d: expected nil got %q: %s", n, result, test)
|
||||
}
|
||||
continue
|
||||
}
|
||||
expect := test.text[submatches[k]:submatches[k+1]]
|
||||
if expect != result[k/2] {
|
||||
t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindStringSubmatch(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
result := MustCompile(test.pat).FindStringSubmatch(test.text)
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case test.matches != nil && result != nil:
|
||||
testSubmatchString(&test, 0, test.matches[0], result, t)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) {
|
||||
if len(expect) != len(result) {
|
||||
t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test)
|
||||
return
|
||||
}
|
||||
for k, e := range expect {
|
||||
if e != result[k] {
|
||||
t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) {
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case test.matches != nil && result != nil:
|
||||
testSubmatchIndices(test, 0, test.matches[0], result, t)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindSubmatchIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindStringSubmatchIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindReaderSubmatchIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
|
||||
}
|
||||
}
|
||||
|
||||
// Now come the monster AllSubmatch cases.
|
||||
|
||||
func TestFindAllSubmatch(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1)
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case len(test.matches) != len(result):
|
||||
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
|
||||
case test.matches != nil && result != nil:
|
||||
for k, match := range test.matches {
|
||||
testSubmatchBytes(&test, k, match, result[k], t)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllStringSubmatch(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1)
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case len(test.matches) != len(result):
|
||||
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
|
||||
case test.matches != nil && result != nil:
|
||||
for k, match := range test.matches {
|
||||
testSubmatchString(&test, k, match, result[k], t)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) {
|
||||
switch {
|
||||
case test.matches == nil && result == nil:
|
||||
// ok
|
||||
case test.matches == nil && result != nil:
|
||||
t.Errorf("expected no match; got one: %s", test)
|
||||
case test.matches != nil && result == nil:
|
||||
t.Errorf("expected match; got none: %s", test)
|
||||
case len(test.matches) != len(result):
|
||||
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
|
||||
case test.matches != nil && result != nil:
|
||||
for k, match := range test.matches {
|
||||
testSubmatchIndices(test, k, match, result[k], t)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllSubmatchIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindAllStringSubmatchIndex(t *testing.T) {
|
||||
for _, test := range findTests {
|
||||
testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t)
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,795 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package regexp implements a simple regular expression library.
|
||||
//
|
||||
// The syntax of the regular expressions accepted is the same
|
||||
// general syntax used by Perl, Python, and other languages.
|
||||
// More precisely, it is the syntax accepted by RE2 and described at
|
||||
// http://code.google.com/p/re2/wiki/Syntax, except for \C.
|
||||
//
|
||||
// All characters are UTF-8-encoded code points.
|
||||
//
|
||||
// There are 16 methods of Regexp that match a regular expression and identify
|
||||
// the matched text. Their names are matched by this regular expression:
|
||||
//
|
||||
// Find(All)?(String)?(Submatch)?(Index)?
|
||||
//
|
||||
// If 'All' is present, the routine matches successive non-overlapping
|
||||
// matches of the entire expression. Empty matches abutting a preceding
|
||||
// match are ignored. The return value is a slice containing the successive
|
||||
// return values of the corresponding non-'All' routine. These routines take
|
||||
// an extra integer argument, n; if n >= 0, the function returns at most n
|
||||
// matches/submatches.
|
||||
//
|
||||
// If 'String' is present, the argument is a string; otherwise it is a slice
|
||||
// of bytes; return values are adjusted as appropriate.
|
||||
//
|
||||
// If 'Submatch' is present, the return value is a slice identifying the
|
||||
// successive submatches of the expression. Submatches are matches of
|
||||
// parenthesized subexpressions within the regular expression, numbered from
|
||||
// left to right in order of opening parenthesis. Submatch 0 is the match of
|
||||
// the entire expression, submatch 1 the match of the first parenthesized
|
||||
// subexpression, and so on.
|
||||
//
|
||||
// If 'Index' is present, matches and submatches are identified by byte index
|
||||
// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
|
||||
// the nth submatch. The pair for n==0 identifies the match of the entire
|
||||
// expression. If 'Index' is not present, the match is identified by the
|
||||
// text of the match/submatch. If an index is negative, it means that
|
||||
// subexpression did not match any string in the input.
|
||||
//
|
||||
// There is also a subset of the methods that can be applied to text read
|
||||
// from a RuneReader:
|
||||
//
|
||||
// MatchReader, FindReaderIndex, FindReaderSubmatchIndex
|
||||
//
|
||||
// This set may grow. Note that regular expression matches may need to
|
||||
// examine text beyond the text returned by a match, so the methods that
|
||||
// match text from a RuneReader may read arbitrarily far into the input
|
||||
// before returning.
|
||||
//
|
||||
// (There are a few other methods that do not match this pattern.)
|
||||
//
|
||||
package regexp
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"exp/regexp/syntax"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"utf8"
|
||||
)
|
||||
|
||||
// debug is a package-level switch that is off by default.
// NOTE(review): presumably gates diagnostic output in the matcher; no use
// of it is visible in this part of the file — confirm before relying on it.
var debug = false
|
||||
|
||||
// Error is the package-local type used to describe a parsing error.
// Its underlying string is the error message.
type Error string

// String returns the error message unchanged.
func (e Error) String() string { return string(e) }
|
||||
|
||||
// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// read-only after Compile
	expr           string         // as passed to Compile
	prog           *syntax.Prog   // compiled program
	prefix         string         // required prefix in unanchored matches
	prefixBytes    []byte         // prefix, as a []byte
	prefixComplete bool           // prefix is the entire regexp
	prefixRune     int            // first rune in prefix
	cond           syntax.EmptyOp // empty-width conditions required at start of match

	// cache of machines for running regexp;
	// get and put hold mu while touching machine.
	mu      sync.Mutex
	machine []*machine
}
|
||||
|
||||
// String returns the source text used to compile the regular expression.
|
||||
func (re *Regexp) String() string {
|
||||
return re.expr
|
||||
}
|
||||
|
||||
// Compile parses a regular expression and returns, if successful, a Regexp
|
||||
// object that can be used to match against text.
|
||||
func Compile(expr string) (*Regexp, os.Error) {
|
||||
re, err := syntax.Parse(expr, syntax.Perl)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
prog, err := syntax.Compile(re)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
regexp := &Regexp{
|
||||
expr: expr,
|
||||
prog: prog,
|
||||
}
|
||||
regexp.prefix, regexp.prefixComplete = prog.Prefix()
|
||||
if regexp.prefix != "" {
|
||||
// TODO(rsc): Remove this allocation by adding
|
||||
// IndexString to package bytes.
|
||||
regexp.prefixBytes = []byte(regexp.prefix)
|
||||
regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
|
||||
}
|
||||
regexp.cond = prog.StartCond()
|
||||
return regexp, nil
|
||||
}
|
||||
|
||||
// get returns a machine to use for matching re.
|
||||
// It uses the re's machine cache if possible, to avoid
|
||||
// unnecessary allocation.
|
||||
func (re *Regexp) get() *machine {
|
||||
re.mu.Lock()
|
||||
if n := len(re.machine); n > 0 {
|
||||
z := re.machine[n-1]
|
||||
re.machine = re.machine[:n-1]
|
||||
re.mu.Unlock()
|
||||
return z
|
||||
}
|
||||
re.mu.Unlock()
|
||||
z := progMachine(re.prog)
|
||||
z.re = re
|
||||
return z
|
||||
}
|
||||
|
||||
// put returns a machine to the re's machine cache.
|
||||
// There is no attempt to limit the size of the cache, so it will
|
||||
// grow to the maximum number of simultaneous matches
|
||||
// run using re. (The cache empties when re gets garbage collected.)
|
||||
func (re *Regexp) put(z *machine) {
|
||||
re.mu.Lock()
|
||||
re.machine = append(re.machine, z)
|
||||
re.mu.Unlock()
|
||||
}
|
||||
|
||||
// MustCompile is like Compile but panics if the expression cannot be parsed.
|
||||
// It simplifies safe initialization of global variables holding compiled regular
|
||||
// expressions.
|
||||
func MustCompile(str string) *Regexp {
|
||||
regexp, error := Compile(str)
|
||||
if error != nil {
|
||||
panic(`regexp: compiling "` + str + `": ` + error.String())
|
||||
}
|
||||
return regexp
|
||||
}
|
||||
|
||||
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
|
||||
func (re *Regexp) NumSubexp() int {
|
||||
// NumCap/2 because captures count ( and ) separately.
|
||||
// -1 because NumCap counts $0 but NumSubexp does not.
|
||||
return re.prog.NumCap/2 - 1
|
||||
}
|
||||
|
||||
// endOfText is the rune value the input implementations' step methods
// return (with width 0) when there is no rune to deliver at the position.
const endOfText = -1
|
||||
|
||||
// input abstracts different representations of the input text. It provides
// one-character lookahead.
//
// The implementations in this file are inputString, inputBytes and
// inputReader.
type input interface {
	// step returns the rune at pos and its width, or (endOfText, 0)
	// when no rune is available there.
	step(pos int) (rune int, width int) // advance one rune
	// canCheckPrefix is true for the string and byte-slice inputs and
	// false for the reader input.
	canCheckPrefix() bool // can we look ahead without losing info?
	// hasPrefix reports whether the input begins with re's literal prefix.
	hasPrefix(re *Regexp) bool
	// index returns the offset, relative to pos, of the next occurrence
	// of re's literal prefix at or after pos, or -1 if there is none.
	index(re *Regexp, pos int) int
}
|
||||
|
||||
// inputString scans a string.
|
||||
type inputString struct {
|
||||
str string
|
||||
}
|
||||
|
||||
func newInputString(str string) *inputString {
|
||||
return &inputString{str: str}
|
||||
}
|
||||
|
||||
func (i *inputString) step(pos int) (int, int) {
|
||||
if pos < len(i.str) {
|
||||
return utf8.DecodeRuneInString(i.str[pos:len(i.str)])
|
||||
}
|
||||
return endOfText, 0
|
||||
}
|
||||
|
||||
func (i *inputString) canCheckPrefix() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (i *inputString) hasPrefix(re *Regexp) bool {
|
||||
return strings.HasPrefix(i.str, re.prefix)
|
||||
}
|
||||
|
||||
func (i *inputString) index(re *Regexp, pos int) int {
|
||||
return strings.Index(i.str[pos:], re.prefix)
|
||||
}
|
||||
|
||||
// inputBytes scans a byte slice.
|
||||
type inputBytes struct {
|
||||
str []byte
|
||||
}
|
||||
|
||||
func newInputBytes(str []byte) *inputBytes {
|
||||
return &inputBytes{str: str}
|
||||
}
|
||||
|
||||
func (i *inputBytes) step(pos int) (int, int) {
|
||||
if pos < len(i.str) {
|
||||
return utf8.DecodeRune(i.str[pos:len(i.str)])
|
||||
}
|
||||
return endOfText, 0
|
||||
}
|
||||
|
||||
func (i *inputBytes) canCheckPrefix() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (i *inputBytes) hasPrefix(re *Regexp) bool {
|
||||
return bytes.HasPrefix(i.str, re.prefixBytes)
|
||||
}
|
||||
|
||||
func (i *inputBytes) index(re *Regexp, pos int) int {
|
||||
return bytes.Index(i.str[pos:], re.prefixBytes)
|
||||
}
|
||||
|
||||
// inputReader scans a RuneReader.
|
||||
type inputReader struct {
|
||||
r io.RuneReader
|
||||
atEOT bool
|
||||
pos int
|
||||
}
|
||||
|
||||
func newInputReader(r io.RuneReader) *inputReader {
|
||||
return &inputReader{r: r}
|
||||
}
|
||||
|
||||
func (i *inputReader) step(pos int) (int, int) {
|
||||
if !i.atEOT && pos != i.pos {
|
||||
return endOfText, 0
|
||||
|
||||
}
|
||||
r, w, err := i.r.ReadRune()
|
||||
if err != nil {
|
||||
i.atEOT = true
|
||||
return endOfText, 0
|
||||
}
|
||||
i.pos += w
|
||||
return r, w
|
||||
}
|
||||
|
||||
func (i *inputReader) canCheckPrefix() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (i *inputReader) hasPrefix(re *Regexp) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (i *inputReader) index(re *Regexp, pos int) int {
|
||||
return -1
|
||||
}
|
||||
|
||||
// LiteralPrefix returns a literal string that must begin any match
|
||||
// of the regular expression re. It returns the boolean true if the
|
||||
// literal string comprises the entire regular expression.
|
||||
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
|
||||
return re.prefix, re.prefixComplete
|
||||
}
|
||||
|
||||
// MatchReader returns whether the Regexp matches the text read by the
|
||||
// RuneReader. The return value is a boolean: true for match, false for no
|
||||
// match.
|
||||
func (re *Regexp) MatchReader(r io.RuneReader) bool {
|
||||
return re.doExecute(newInputReader(r), 0, 0) != nil
|
||||
}
|
||||
|
||||
// MatchString returns whether the Regexp matches the string s.
|
||||
// The return value is a boolean: true for match, false for no match.
|
||||
func (re *Regexp) MatchString(s string) bool {
|
||||
return re.doExecute(newInputString(s), 0, 0) != nil
|
||||
}
|
||||
|
||||
// Match returns whether the Regexp matches the byte slice b.
|
||||
// The return value is a boolean: true for match, false for no match.
|
||||
func (re *Regexp) Match(b []byte) bool {
|
||||
return re.doExecute(newInputBytes(b), 0, 0) != nil
|
||||
}
|
||||
|
||||
// MatchReader checks whether a textual regular expression matches the text
|
||||
// read by the RuneReader. More complicated queries need to use Compile and
|
||||
// the full Regexp interface.
|
||||
func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) {
|
||||
re, err := Compile(pattern)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return re.MatchReader(r), nil
|
||||
}
|
||||
|
||||
// MatchString checks whether a textual regular expression
|
||||
// matches a string. More complicated queries need
|
||||
// to use Compile and the full Regexp interface.
|
||||
func MatchString(pattern string, s string) (matched bool, error os.Error) {
|
||||
re, err := Compile(pattern)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return re.MatchString(s), nil
|
||||
}
|
||||
|
||||
// Match checks whether a textual regular expression
|
||||
// matches a byte slice. More complicated queries need
|
||||
// to use Compile and the full Regexp interface.
|
||||
func Match(pattern string, b []byte) (matched bool, error os.Error) {
|
||||
re, err := Compile(pattern)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return re.Match(b), nil
|
||||
}
|
||||
|
||||
// ReplaceAllString returns a copy of src in which all matches for the Regexp
|
||||
// have been replaced by repl. No support is provided for expressions
|
||||
// (e.g. \1 or $1) in the replacement string.
|
||||
func (re *Regexp) ReplaceAllString(src, repl string) string {
|
||||
return re.ReplaceAllStringFunc(src, func(string) string { return repl })
|
||||
}
|
||||
|
||||
// ReplaceAllStringFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of function repl (whose
// first argument is the matched string). No support is provided for
// expressions (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
	lastMatchEnd := 0 // end position of the most recent match
	searchPos := 0    // position where we next look for a match
	buf := new(bytes.Buffer)
	// searchPos may equal len(src) so that an empty match at the very
	// end of the input is still considered.
	for searchPos <= len(src) {
		// Only the overall match bounds (a[0], a[1]) are requested.
		a := re.doExecute(newInputString(src), searchPos, 2)
		if len(a) == 0 {
			break // no more matches
		}

		// Copy the unmatched characters before this match.
		io.WriteString(buf, src[lastMatchEnd:a[0]])

		// Now insert a copy of the replacement string, but not for a
		// match of the empty string immediately after another match.
		// (Otherwise, we get double replacement for patterns that
		// match both empty and nonempty strings.)
		if a[1] > lastMatchEnd || a[0] == 0 {
			io.WriteString(buf, repl(src[a[0]:a[1]]))
		}
		lastMatchEnd = a[1]

		// Advance past this match; always advance at least one character.
		_, width := utf8.DecodeRuneInString(src[searchPos:])
		if searchPos+width > a[1] {
			searchPos += width
		} else if searchPos+1 > a[1] {
			// This clause is only needed at the end of the input
			// string. In that case, DecodeRuneInString returns width=0.
			searchPos++
		} else {
			searchPos = a[1]
		}
	}

	// Copy the unmatched characters after the last match.
	io.WriteString(buf, src[lastMatchEnd:])

	return buf.String()
}
|
||||
|
||||
// ReplaceAll returns a copy of src in which all matches for the Regexp
|
||||
// have been replaced by repl. No support is provided for expressions
|
||||
// (e.g. \1 or $1) in the replacement text.
|
||||
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
|
||||
return re.ReplaceAllFunc(src, func([]byte) []byte { return repl })
|
||||
}
|
||||
|
||||
// ReplaceAllFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of function repl (whose
// first argument is the matched []byte). No support is provided for
// expressions (e.g. \1 or $1) in the replacement text.
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
	lastMatchEnd := 0 // end position of the most recent match
	searchPos := 0    // position where we next look for a match
	buf := new(bytes.Buffer)
	// searchPos may equal len(src) so that an empty match at the very
	// end of the input is still considered.
	for searchPos <= len(src) {
		// Only the overall match bounds (a[0], a[1]) are requested.
		a := re.doExecute(newInputBytes(src), searchPos, 2)
		if len(a) == 0 {
			break // no more matches
		}

		// Copy the unmatched characters before this match.
		buf.Write(src[lastMatchEnd:a[0]])

		// Now insert a copy of the replacement text, but not for a
		// match of the empty string immediately after another match.
		// (Otherwise, we get double replacement for patterns that
		// match both empty and nonempty strings.)
		if a[1] > lastMatchEnd || a[0] == 0 {
			buf.Write(repl(src[a[0]:a[1]]))
		}
		lastMatchEnd = a[1]

		// Advance past this match; always advance at least one character.
		_, width := utf8.DecodeRune(src[searchPos:])
		if searchPos+width > a[1] {
			searchPos += width
		} else if searchPos+1 > a[1] {
			// This clause is only needed at the end of the input
			// slice. In that case, DecodeRune returns width=0.
			searchPos++
		} else {
			searchPos = a[1]
		}
	}

	// Copy the unmatched characters after the last match.
	buf.Write(src[lastMatchEnd:])

	return buf.Bytes()
}
|
||||
|
||||
// specialBytes lists every byte that QuoteMeta escapes.
var specialBytes = []byte(`\.+*?()|[]{}^$`)

// special reports whether b is one of the bytes in specialBytes.
func special(b byte) bool {
	return bytes.IndexByte(specialBytes, b) != -1
}

// QuoteMeta returns a string that quotes all regular expression metacharacters
// inside the argument text; the returned string is a regular expression matching
// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
func QuoteMeta(s string) string {
	// In the worst case every byte gets a backslash in front of it.
	out := make([]byte, 0, 2*len(s))

	// Working byte by byte is correct because all metacharacters are
	// ASCII; the bytes of multi-byte runes are copied through untouched.
	for i := 0; i < len(s); i++ {
		c := s[i]
		if special(c) {
			out = append(out, '\\')
		}
		out = append(out, c)
	}
	return string(out)
}
|
||||
|
||||
// allMatches finds successive matches of re and passes the capture
// indexes of each accepted match to deliver, stopping after n accepted
// matches or when the input is exhausted.
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
	var end int
	if b == nil {
		end = len(s)
	} else {
		end = len(b)
	}

	// pos is where the next search starts, i counts accepted matches and
	// prevMatchEnd is the end of the previous match (-1 before the first).
	// pos may equal end so that an empty match at the end is considered.
	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
		var in input
		if b == nil {
			in = newInputString(s)
		} else {
			in = newInputBytes(b)
		}
		matches := re.doExecute(in, pos, re.prog.NumCap)
		if len(matches) == 0 {
			break
		}

		accept := true
		if matches[1] == pos {
			// We've found an empty match.
			if matches[0] == prevMatchEnd {
				// We don't allow an empty match right
				// after a previous match, so ignore it.
				accept = false
			}
			// Step over one rune so the next search makes progress.
			var width int
			// TODO: use step()
			if b == nil {
				_, width = utf8.DecodeRuneInString(s[pos:end])
			} else {
				_, width = utf8.DecodeRune(b[pos:end])
			}
			if width > 0 {
				pos += width
			} else {
				// Nothing left to decode: move past end to stop the loop.
				pos = end + 1
			}
		} else {
			pos = matches[1]
		}
		prevMatchEnd = matches[1]

		if accept {
			deliver(matches)
			i++
		}
	}
}
|
||||
|
||||
// Find returns a slice holding the text of the leftmost match in b of the regular expression.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) Find(b []byte) []byte {
|
||||
a := re.doExecute(newInputBytes(b), 0, 2)
|
||||
if a == nil {
|
||||
return nil
|
||||
}
|
||||
return b[a[0]:a[1]]
|
||||
}
|
||||
|
||||
// FindIndex returns a two-element slice of integers defining the location of
|
||||
// the leftmost match in b of the regular expression. The match itself is at
|
||||
// b[loc[0]:loc[1]].
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindIndex(b []byte) (loc []int) {
|
||||
a := re.doExecute(newInputBytes(b), 0, 2)
|
||||
if a == nil {
|
||||
return nil
|
||||
}
|
||||
return a[0:2]
|
||||
}
|
||||
|
||||
// FindString returns a string holding the text of the leftmost match in s of the regular
|
||||
// expression. If there is no match, the return value is an empty string,
|
||||
// but it will also be empty if the regular expression successfully matches
|
||||
// an empty string. Use FindStringIndex or FindStringSubmatch if it is
|
||||
// necessary to distinguish these cases.
|
||||
func (re *Regexp) FindString(s string) string {
|
||||
a := re.doExecute(newInputString(s), 0, 2)
|
||||
if a == nil {
|
||||
return ""
|
||||
}
|
||||
return s[a[0]:a[1]]
|
||||
}
|
||||
|
||||
// FindStringIndex returns a two-element slice of integers defining the
|
||||
// location of the leftmost match in s of the regular expression. The match
|
||||
// itself is at s[loc[0]:loc[1]].
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindStringIndex(s string) []int {
|
||||
a := re.doExecute(newInputString(s), 0, 2)
|
||||
if a == nil {
|
||||
return nil
|
||||
}
|
||||
return a[0:2]
|
||||
}
|
||||
|
||||
// FindReaderIndex returns a two-element slice of integers defining the
|
||||
// location of the leftmost match of the regular expression in text read from
|
||||
// the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return
|
||||
// value of nil indicates no match.
|
||||
func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
|
||||
a := re.doExecute(newInputReader(r), 0, 2)
|
||||
if a == nil {
|
||||
return nil
|
||||
}
|
||||
return a[0:2]
|
||||
}
|
||||
|
||||
// FindSubmatch returns a slice of slices holding the text of the leftmost
|
||||
// match of the regular expression in b and the matches, if any, of its
|
||||
// subexpressions, as defined by the 'Submatch' descriptions in the package
|
||||
// comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
|
||||
a := re.doExecute(newInputBytes(b), 0, re.prog.NumCap)
|
||||
if a == nil {
|
||||
return nil
|
||||
}
|
||||
ret := make([][]byte, len(a)/2)
|
||||
for i := range ret {
|
||||
if a[2*i] >= 0 {
|
||||
ret[i] = b[a[2*i]:a[2*i+1]]
|
||||
}
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
// FindSubmatchIndex returns a slice holding the index pairs identifying the
|
||||
// leftmost match of the regular expression in b and the matches, if any, of
|
||||
// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
|
||||
// in the package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
|
||||
return re.doExecute(newInputBytes(b), 0, re.prog.NumCap)
|
||||
}
|
||||
|
||||
// FindStringSubmatch returns a slice of strings holding the text of the
|
||||
// leftmost match of the regular expression in s and the matches, if any, of
|
||||
// its subexpressions, as defined by the 'Submatch' description in the
|
||||
// package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindStringSubmatch(s string) []string {
|
||||
a := re.doExecute(newInputString(s), 0, re.prog.NumCap)
|
||||
if a == nil {
|
||||
return nil
|
||||
}
|
||||
ret := make([]string, len(a)/2)
|
||||
for i := range ret {
|
||||
if a[2*i] >= 0 {
|
||||
ret[i] = s[a[2*i]:a[2*i+1]]
|
||||
}
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
// FindStringSubmatchIndex returns a slice holding the index pairs
|
||||
// identifying the leftmost match of the regular expression in s and the
|
||||
// matches, if any, of its subexpressions, as defined by the 'Submatch' and
|
||||
// 'Index' descriptions in the package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
|
||||
return re.doExecute(newInputString(s), 0, re.prog.NumCap)
|
||||
}
|
||||
|
||||
// FindReaderSubmatchIndex returns a slice holding the index pairs
|
||||
// identifying the leftmost match of the regular expression of text read by
|
||||
// the RuneReader, and the matches, if any, of its subexpressions, as defined
|
||||
// by the 'Submatch' and 'Index' descriptions in the package comment. A
|
||||
// return value of nil indicates no match.
|
||||
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
|
||||
return re.doExecute(newInputReader(r), 0, re.prog.NumCap)
|
||||
}
|
||||
|
||||
const startSize = 10 // Initial capacity of the result slices allocated by the 'All' routines.
|
||||
|
||||
// FindAll is the 'All' version of Find; it returns a slice of all successive
|
||||
// matches of the expression, as defined by the 'All' description in the
|
||||
// package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindAll(b []byte, n int) [][]byte {
|
||||
if n < 0 {
|
||||
n = len(b) + 1
|
||||
}
|
||||
result := make([][]byte, 0, startSize)
|
||||
re.allMatches("", b, n, func(match []int) {
|
||||
result = append(result, b[match[0]:match[1]])
|
||||
})
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
|
||||
// successive matches of the expression, as defined by the 'All' description
|
||||
// in the package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
|
||||
if n < 0 {
|
||||
n = len(b) + 1
|
||||
}
|
||||
result := make([][]int, 0, startSize)
|
||||
re.allMatches("", b, n, func(match []int) {
|
||||
result = append(result, match[0:2])
|
||||
})
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// FindAllString is the 'All' version of FindString; it returns a slice of all
|
||||
// successive matches of the expression, as defined by the 'All' description
|
||||
// in the package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindAllString(s string, n int) []string {
|
||||
if n < 0 {
|
||||
n = len(s) + 1
|
||||
}
|
||||
result := make([]string, 0, startSize)
|
||||
re.allMatches(s, nil, n, func(match []int) {
|
||||
result = append(result, s[match[0]:match[1]])
|
||||
})
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
|
||||
// slice of all successive matches of the expression, as defined by the 'All'
|
||||
// description in the package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
|
||||
if n < 0 {
|
||||
n = len(s) + 1
|
||||
}
|
||||
result := make([][]int, 0, startSize)
|
||||
re.allMatches(s, nil, n, func(match []int) {
|
||||
result = append(result, match[0:2])
|
||||
})
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
|
||||
// of all successive matches of the expression, as defined by the 'All'
|
||||
// description in the package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
|
||||
if n < 0 {
|
||||
n = len(b) + 1
|
||||
}
|
||||
result := make([][][]byte, 0, startSize)
|
||||
re.allMatches("", b, n, func(match []int) {
|
||||
slice := make([][]byte, len(match)/2)
|
||||
for j := range slice {
|
||||
if match[2*j] >= 0 {
|
||||
slice[j] = b[match[2*j]:match[2*j+1]]
|
||||
}
|
||||
}
|
||||
result = append(result, slice)
|
||||
})
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
|
||||
// a slice of all successive matches of the expression, as defined by the
|
||||
// 'All' description in the package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
|
||||
if n < 0 {
|
||||
n = len(b) + 1
|
||||
}
|
||||
result := make([][]int, 0, startSize)
|
||||
re.allMatches("", b, n, func(match []int) {
|
||||
result = append(result, match)
|
||||
})
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
|
||||
// returns a slice of all successive matches of the expression, as defined by
|
||||
// the 'All' description in the package comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
|
||||
if n < 0 {
|
||||
n = len(s) + 1
|
||||
}
|
||||
result := make([][]string, 0, startSize)
|
||||
re.allMatches(s, nil, n, func(match []int) {
|
||||
slice := make([]string, len(match)/2)
|
||||
for j := range slice {
|
||||
if match[2*j] >= 0 {
|
||||
slice[j] = s[match[2*j]:match[2*j+1]]
|
||||
}
|
||||
}
|
||||
result = append(result, slice)
|
||||
})
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// FindAllStringSubmatchIndex is the 'All' version of
|
||||
// FindStringSubmatchIndex; it returns a slice of all successive matches of
|
||||
// the expression, as defined by the 'All' description in the package
|
||||
// comment.
|
||||
// A return value of nil indicates no match.
|
||||
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
|
||||
if n < 0 {
|
||||
n = len(s) + 1
|
||||
}
|
||||
result := make([][]int, 0, startSize)
|
||||
re.allMatches(s, nil, n, func(match []int) {
|
||||
result = append(result, match)
|
||||
})
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
|
@@ -86,6 +86,7 @@ func Compile(re *Regexp) (*Prog, os.Error) {
|
|||
|
||||
func (c *compiler) init() {
|
||||
c.p = new(Prog)
|
||||
c.p.NumCap = 2 // implicit ( and ) for whole match $0
|
||||
c.inst(InstFail)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@@ -55,6 +55,61 @@ func (p *Prog) String() string {
|
|||
return b.String()
|
||||
}
|
||||
|
||||
// skipNop follows any no-op or capturing instructions
|
||||
// and returns the resulting pc.
|
||||
func (p *Prog) skipNop(pc uint32) *Inst {
|
||||
i := &p.Inst[pc]
|
||||
for i.Op == InstNop || i.Op == InstCapture {
|
||||
pc = i.Out
|
||||
i = &p.Inst[pc]
|
||||
}
|
||||
return i
|
||||
}
|
||||
|
||||
// Prefix returns a literal string that all matches for the
|
||||
// regexp must start with. Complete is true if the prefix
|
||||
// is the entire match.
|
||||
func (p *Prog) Prefix() (prefix string, complete bool) {
|
||||
i := p.skipNop(uint32(p.Start))
|
||||
|
||||
// Avoid allocation of buffer if prefix is empty.
|
||||
if i.Op != InstRune || len(i.Rune) != 1 {
|
||||
return "", i.Op == InstMatch
|
||||
}
|
||||
|
||||
// Have prefix; gather characters.
|
||||
var buf bytes.Buffer
|
||||
for i.Op == InstRune && len(i.Rune) == 1 {
|
||||
buf.WriteRune(i.Rune[0])
|
||||
i = p.skipNop(i.Out)
|
||||
}
|
||||
return buf.String(), i.Op == InstMatch
|
||||
}
|
||||
|
||||
// StartCond returns the leading empty-width conditions that must
|
||||
// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
|
||||
func (p *Prog) StartCond() EmptyOp {
|
||||
var flag EmptyOp
|
||||
pc := uint32(p.Start)
|
||||
i := &p.Inst[pc]
|
||||
Loop:
|
||||
for {
|
||||
switch i.Op {
|
||||
case InstEmptyWidth:
|
||||
flag |= EmptyOp(i.Arg)
|
||||
case InstFail:
|
||||
return ^EmptyOp(0)
|
||||
case InstCapture, InstNop:
|
||||
// skip
|
||||
default:
|
||||
break Loop
|
||||
}
|
||||
pc = i.Out
|
||||
i = &p.Inst[pc]
|
||||
}
|
||||
return flag
|
||||
}
|
||||
|
||||
// MatchRune returns true if the instruction matches (and consumes) r.
|
||||
// It should only be called when i.Op == InstRune.
|
||||
func (i *Inst) MatchRune(r int) bool {
|
||||
|
|
|
|||
Loading…
Reference in New Issue