go/scanner: reject BOMs that are not at the beginning

For compliance with gc. See also issue 5265. Not critical for Go 1.1, but harmless. R=r CC=golang-dev https://golang.org/cl/8736043
2013-04-12 21:28:38 -07:00 · 2013-04-12 21:28:38 -07:00 · 968732b677
parent d4d063580f
commit 968732b677
2 changed files with 14 additions and 4 deletions
--- a/src/pkg/go/scanner/scanner.go
+++ b/src/pkg/go/scanner/scanner.go
@@ -48,6 +48,8 @@ type Scanner struct {
 	ErrorCount int // number of errors encountered
 }

+const bom = 0xFEFF // byte order mark, only permitted as very first character
+
 // Read the next Unicode char into s.ch.
 // s.ch < 0 means end-of-file.
 //
@@ -67,6 +69,8 @@ func (s *Scanner) next() {
 			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
 			if r == utf8.RuneError && w == 1 {
 				s.error(s.offset, "illegal UTF-8 encoding")
+			} else if r == bom && s.offset > 0 {
+				s.error(s.offset, "illegal byte order mark")
 			}
 		}
 		s.rdOffset += w
@@ -125,8 +129,8 @@ func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode
 	s.ErrorCount = 0

 	s.next()
-	if s.ch == '\uFEFF' {
-		s.next() // ignore BOM
+	if s.ch == bom {
+		s.next() // ignore BOM at file beginning
 	}
 }

@@ -713,7 +717,10 @@ scanAgain:
 		case '|':
 			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
 		default:
-			s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
+			// next reports unexpected BOMs - don't repeat
+			if ch != bom {
+				s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
+			}
 			insertSemi = s.insertSemi // preserve insertSemi info
 			tok = token.ILLEGAL
 			lit = string(ch)
--- a/src/pkg/go/scanner/scanner_test.go
+++ b/src/pkg/go/scanner/scanner_test.go
@@ -695,7 +695,10 @@ var errors = []struct {
 	{"0X", token.INT, 0, "illegal hexadecimal number"},
 	{"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"},
 	{"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"},
-	{"\ufeff\ufeff", token.ILLEGAL, 3, "illegal character U+FEFF"}, // only first BOM is ignored
+	{"\ufeff\ufeff", token.ILLEGAL, 3, "illegal byte order mark"},            // only first BOM is ignored
+	{"//\ufeff", token.COMMENT, 2, "illegal byte order mark"},                // only first BOM is ignored
+	{"'\ufeff" + `'`, token.CHAR, 1, "illegal byte order mark"},              // only first BOM is ignored
+	{`"` + "abc\ufeffdef" + `"`, token.STRING, 4, "illegal byte order mark"}, // only first BOM is ignored
 }

 func TestScanErrors(t *testing.T) {