text/scanner: provide facility for custom identifiers

LGTM=r R=golang-codereviews, r CC=golang-codereviews https://golang.org/cl/108030044
2014-06-16 16:32:47 -07:00 · 2014-06-16 16:32:47 -07:00 · 60c0b3b5cf
parent 54bc760ad7
commit 60c0b3b5cf
2 changed files with 41 additions and 6 deletions
--- a/src/pkg/text/scanner/scanner.go
+++ b/src/pkg/text/scanner/scanner.go
@ -11,7 +11,7 @@
 // By default, a Scanner skips white space and Go comments and recognizes all
 // literals as defined by the Go language specification.  It may be
 // customized to recognize only a subset of those literals and to recognize
-// different white space characters.
+// different identifier and white space characters.
 //
 // Basic usage pattern:
 //
@ -34,8 +34,6 @@ import (
 	"unicode/utf8"
 )

-// TODO(gri): Consider changing this to use the new (token) Position package.
-
 // A source position is represented by a Position value.
 // A position is valid if Line > 0.
 type Position struct {
@ -164,6 +162,13 @@ type Scanner struct {
 	// for values ch > ' '). The field may be changed at any time.
 	Whitespace uint64

+	// IsIdentRune is a predicate controlling the characters accepted
+	// as the ith rune in an identifier. The set of valid characters
+	// must not intersect with the set of white space characters.
+	// If no IsIdentRune function is set, regular Go identifiers are
+	// accepted instead. The field may be changed at any time.
+	IsIdentRune func(ch rune, i int) bool
+
 	// Start position of most recently scanned token; set by Scan.
 	// Calling Init or Next invalidates the position (Line == 0).
 	// The Filename field is always left untouched by the Scanner.
@ -334,9 +339,17 @@ func (s *Scanner) error(msg string) {
 	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
 }

+func (s *Scanner) isIdentRune(ch rune, i int) bool {
+	if s.IsIdentRune != nil {
+		return s.IsIdentRune(ch, i)
+	}
+	return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0
+}
+
 func (s *Scanner) scanIdentifier() rune {
-	ch := s.next() // read character after first '_' or letter
-	for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) {
+	// we know the zero'th rune is OK; start with 2nd one
+	ch := s.next()
+	for i := 1; s.isIdentRune(ch, i); i++ {
 		ch = s.next()
 	}
 	return ch
@ -563,7 +576,7 @@ redo:
 	// determine token value
 	tok := ch
 	switch {
-	case unicode.IsLetter(ch) || ch == '_':
+	case s.isIdentRune(ch, 0):
 		if s.Mode&ScanIdents != 0 {
 			tok = Ident
 			ch = s.scanIdentifier()
--- a/src/pkg/text/scanner/scanner_test.go
+++ b/src/pkg/text/scanner/scanner_test.go
@ -357,6 +357,28 @@ func TestScanSelectedMask(t *testing.T) {
 	testScanSelectedMode(t, ScanComments, Comment)
 }

+func TestScanCustomIdent(t *testing.T) {
+	const src = "faab12345 a12b123 a12 3b"
+	s := new(Scanner).Init(strings.NewReader(src))
+	// ident = ( 'a' | 'b' ) { digit } .
+	// digit = '0' .. '3' .
+	// with a maximum length of 4
+	s.IsIdentRune = func(ch rune, i int) bool {
+		return i == 0 && (ch == 'a' || ch == 'b') || 0 < i && i < 4 && '0' <= ch && ch <= '3'
+	}
+	checkTok(t, s, 1, s.Scan(), 'f', "f")
+	checkTok(t, s, 1, s.Scan(), Ident, "a")
+	checkTok(t, s, 1, s.Scan(), Ident, "a")
+	checkTok(t, s, 1, s.Scan(), Ident, "b123")
+	checkTok(t, s, 1, s.Scan(), Int, "45")
+	checkTok(t, s, 1, s.Scan(), Ident, "a12")
+	checkTok(t, s, 1, s.Scan(), Ident, "b123")
+	checkTok(t, s, 1, s.Scan(), Ident, "a12")
+	checkTok(t, s, 1, s.Scan(), Int, "3")
+	checkTok(t, s, 1, s.Scan(), Ident, "b")
+	checkTok(t, s, 1, s.Scan(), EOF, "")
+}
+
 func TestScanNext(t *testing.T) {
 	const BOM = '\uFEFF'
 	BOMs := string(BOM)