diff --git a/src/go/scanner/scanner.go b/src/go/scanner/scanner.go index e78abf12a2..9e85d4898a 100644 --- a/src/go/scanner/scanner.go +++ b/src/go/scanner/scanner.go @@ -150,6 +150,10 @@ func (s *Scanner) error(offs int, msg string) { s.ErrorCount++ } +func (s *Scanner) errorf(offs int, format string, args ...interface{}) { + s.error(offs, fmt.Sprintf(format, args...)) +} + func (s *Scanner) scanComment() string { // initial '/' already consumed; s.ch == '/' || s.ch == '*' offs := s.offset - 1 // position of initial '/' @@ -336,11 +340,11 @@ func (s *Scanner) findLineEnd() bool { } func isLetter(ch rune) bool { - return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) + return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) } func isDigit(ch rune) bool { - return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) + return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch) } func (s *Scanner) scanIdentifier() string { @@ -355,95 +359,188 @@ func digitVal(ch rune) int { switch { case '0' <= ch && ch <= '9': return int(ch - '0') - case 'a' <= ch && ch <= 'f': - return int(ch - 'a' + 10) - case 'A' <= ch && ch <= 'F': - return int(ch - 'A' + 10) + case 'a' <= lower(ch) && lower(ch) <= 'f': + return int(lower(ch) - 'a' + 10) } return 16 // larger than any legal digit val } -func (s *Scanner) scanMantissa(base int) { - for digitVal(s.ch) < base { - s.next() +func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter +func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } +func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' } + +// digits accepts the sequence { digit | '_' }. +// If base <= 10, digits accepts any decimal digit but records +// the offset (relative to the source start) of a digit >= base +// in *invalid, if *invalid < 0. +// digits returns a bitset describing whether the sequence contained +// digits (bit 0 is set), or separators '_' (bit 1 is set). +func (s *Scanner) digits(base int, invalid *int) (digsep int) { + if base <= 10 { + max := rune('0' + base) + for isDecimal(s.ch) || s.ch == '_' { + ds := 1 + if s.ch == '_' { + ds = 2 + } else if s.ch >= max && *invalid < 0 { + *invalid = int(s.offset) // record invalid rune offset + } + digsep |= ds + s.next() + } + } else { + for isHex(s.ch) || s.ch == '_' { + ds := 1 + if s.ch == '_' { + ds = 2 + } + digsep |= ds + s.next() + } } + return } -func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { - // digitVal(s.ch) < 10 +func (s *Scanner) scanNumber() (token.Token, string) { offs := s.offset - tok := token.INT + tok := token.ILLEGAL - if seenDecimalPoint { - offs-- - tok = token.FLOAT - s.scanMantissa(10) - goto exponent - } + base := 10 // number base + prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' + digsep := 0 // bit 0: digit present, bit 1: '_' present + invalid := -1 // index of invalid digit in literal, or < 0 - if s.ch == '0' { - // int or float - offs := s.offset - s.next() - if s.ch == 'x' || s.ch == 'X' { - // hexadecimal int + // integer part + if s.ch != '.' { + tok = token.INT + if s.ch == '0' { s.next() - s.scanMantissa(16) - if s.offset-offs <= 2 { - // only scanned "0x" or "0X" - s.error(offs, "illegal hexadecimal number") - } - } else { - // octal int or float - seenDecimalDigit := false - s.scanMantissa(8) - if s.ch == '8' || s.ch == '9' { - // illegal octal int or float - seenDecimalDigit = true - s.scanMantissa(10) - } - if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' { - goto fraction - } - // octal int - if seenDecimalDigit { - s.error(offs, "illegal octal number") + switch lower(s.ch) { + case 'x': + s.next() + base, prefix = 16, 'x' + case 'o': + s.next() + base, prefix = 8, 'o' + case 'b': + s.next() + base, prefix = 2, 'b' + default: + base, prefix = 8, '0' + digsep = 1 // leading 0 } } - goto exit + digsep |= s.digits(base, &invalid) } - // decimal int or float - s.scanMantissa(10) - -fraction: + // fractional part if s.ch == '.' { tok = token.FLOAT + if prefix == 'o' || prefix == 'b' { + s.error(s.offset, "invalid radix point in "+litname(prefix)) + } s.next() - s.scanMantissa(10) + digsep |= s.digits(base, &invalid) } -exponent: - if s.ch == 'e' || s.ch == 'E' { - tok = token.FLOAT + if digsep&1 == 0 { + s.error(s.offset, litname(prefix)+" has no digits") + } + + // exponent + if e := lower(s.ch); e == 'e' || e == 'p' { + switch { + case e == 'e' && prefix != 0 && prefix != '0': + s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch) + case e == 'p' && prefix != 'x': + s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch) + } s.next() - if s.ch == '-' || s.ch == '+' { + tok = token.FLOAT + if s.ch == '+' || s.ch == '-' { s.next() } - if digitVal(s.ch) < 10 { - s.scanMantissa(10) - } else { - s.error(offs, "illegal floating-point exponent") + ds := s.digits(10, nil) + digsep |= ds + if ds&1 == 0 { + s.error(s.offset, "exponent has no digits") } + } else if prefix == 'x' && tok == token.FLOAT { + s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent") } + // suffix 'i' if s.ch == 'i' { tok = token.IMAG + if prefix != 0 && prefix != '0' { + s.error(s.offset, "invalid suffix 'i' on "+litname(prefix)) + } s.next() } -exit: - return tok, string(s.src[offs:s.offset]) + lit := string(s.src[offs:s.offset]) + if tok == token.INT && invalid >= 0 { + s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix)) + } + if digsep&2 != 0 { + if i := invalidSep(lit); i >= 0 { + s.error(offs+i, "'_' must separate successive digits") + } + } + + return tok, lit +} + +func litname(prefix rune) string { + switch prefix { + case 'x': + return "hexadecimal literal" + case 'o', '0': + return "octal literal" + case 'b': + return "binary literal" + } + return "decimal literal" +} + +// invalidSep returns the index of the first invalid separator in x, or -1. +func invalidSep(x string) int { + x1 := ' ' // prefix char, we only care if it's 'x' + d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else) + i := 0 + + // a prefix counts as a digit + if len(x) >= 2 && x[0] == '0' { + x1 = lower(rune(x[1])) + if x1 == 'x' || x1 == 'o' || x1 == 'b' { + d = '0' + i = 2 + } + } + + // mantissa and exponent + for ; i < len(x); i++ { + p := d // previous digit + d = rune(x[i]) + switch { + case d == '_': + if p != '0' { + return i + } + case isDecimal(d) || x1 == 'x' && isHex(d): + d = '0' + default: + if p == '_' { + return i - 1 + } + d = '.' + } + } + if d == '_' { + return len(x) - 1 + } + + return -1 } // scanEscape parses an escape sequence where rune is the accepted @@ -708,9 +805,9 @@ scanAgain: insertSemi = true tok = token.IDENT } - case '0' <= ch && ch <= '9': + case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())): insertSemi = true - tok, lit = s.scanNumber(false) + tok, lit = s.scanNumber() default: s.next() // always make progress switch ch { @@ -741,16 +838,12 @@ scanAgain: case ':': tok = s.switch2(token.COLON, token.DEFINE) case '.': - if '0' <= s.ch && s.ch <= '9' { - insertSemi = true - tok, lit = s.scanNumber(true) - } else { - tok = token.PERIOD - if s.ch == '.' && s.peek() == '.' { - s.next() - s.next() // consume last '.' - tok = token.ELLIPSIS - } + // fractions starting with a '.' are handled by outer switch + tok = token.PERIOD + if s.ch == '.' && s.peek() == '.' { + s.next() + s.next() // consume last '.' + tok = token.ELLIPSIS } case ',': tok = token.COMMA @@ -835,7 +928,7 @@ scanAgain: default: // next reports unexpected BOMs - don't repeat if ch != bom { - s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) + s.errorf(s.file.Offset(pos), "illegal character %#U", ch) } insertSemi = s.insertSemi // preserve insertSemi info tok = token.ILLEGAL diff --git a/src/go/scanner/scanner_test.go b/src/go/scanner/scanner_test.go index 36c962209c..1d6865f198 100644 --- a/src/go/scanner/scanner_test.go +++ b/src/go/scanner/scanner_test.go @@ -10,6 +10,7 @@ import ( "os" "path/filepath" "runtime" + "strings" "testing" ) @@ -802,11 +803,10 @@ var errors = []struct { {"078.", token.FLOAT, 0, "078.", ""}, {"07801234567.", token.FLOAT, 0, "07801234567.", ""}, {"078e0", token.FLOAT, 0, "078e0", ""}, - {"0E", token.FLOAT, 0, "0E", "illegal floating-point exponent"}, // issue 17621 - {"078", token.INT, 0, "078", "illegal octal number"}, - {"07800000009", token.INT, 0, "07800000009", "illegal octal number"}, - {"0x", token.INT, 0, "0x", "illegal hexadecimal number"}, - {"0X", token.INT, 0, "0X", "illegal hexadecimal number"}, + {"0E", token.FLOAT, 2, "0E", "exponent has no digits"}, // issue 17621 + {"078", token.INT, 2, "078", "invalid digit '8' in octal literal"}, + {"07090000008", token.INT, 3, "07090000008", "invalid digit '9' in octal literal"}, + {"0x", token.INT, 2, "0x", "hexadecimal literal has no digits"}, {"\"abc\x00def\"", token.STRING, 4, "\"abc\x00def\"", "illegal character NUL"}, {"\"abc\x80def\"", token.STRING, 4, "\"abc\x80def\"", "illegal UTF-8 encoding"}, {"\ufeff\ufeff", token.ILLEGAL, 3, "\ufeff\ufeff", "illegal byte order mark"}, // only first BOM is ignored @@ -912,3 +912,199 @@ func BenchmarkScanFile(b *testing.B) { } } } + +func TestNumbers(t *testing.T) { + for _, test := range []struct { + tok token.Token + src, tokens, err string + }{ + // binaries + {token.INT, "0b0", "0b0", ""}, + {token.INT, "0b1010", "0b1010", ""}, + {token.INT, "0B1110", "0B1110", ""}, + + {token.INT, "0b", "0b", "binary literal has no digits"}, + {token.INT, "0b0190", "0b0190", "invalid digit '9' in binary literal"}, + {token.INT, "0b01a0", "0b01 a0", ""}, // only accept 0-9 + + // binary floats and imaginaries (invalid) + {token.FLOAT, "0b.", "0b.", "invalid radix point in binary literal"}, + {token.FLOAT, "0b.1", "0b.1", "invalid radix point in binary literal"}, + {token.FLOAT, "0b1.0", "0b1.0", "invalid radix point in binary literal"}, + {token.FLOAT, "0b1e10", "0b1e10", "'e' exponent requires decimal mantissa"}, + {token.FLOAT, "0b1P-1", "0b1P-1", "'P' exponent requires hexadecimal mantissa"}, + {token.IMAG, "0b10i", "0b10i", "invalid suffix 'i' on binary literal"}, + + // octals + {token.INT, "0o0", "0o0", ""}, + {token.INT, "0o1234", "0o1234", ""}, + {token.INT, "0O1234", "0O1234", ""}, + + {token.INT, "0o", "0o", "octal literal has no digits"}, + {token.INT, "0o8123", "0o8123", "invalid digit '8' in octal literal"}, + {token.INT, "0o1293", "0o1293", "invalid digit '9' in octal literal"}, + {token.INT, "0o12a3", "0o12 a3", ""}, // only accept 0-9 + + // octal floats and imaginaries (invalid) + {token.FLOAT, "0o.", "0o.", "invalid radix point in octal literal"}, + {token.FLOAT, "0o.2", "0o.2", "invalid radix point in octal literal"}, + {token.FLOAT, "0o1.2", "0o1.2", "invalid radix point in octal literal"}, + {token.FLOAT, "0o1E+2", "0o1E+2", "'E' exponent requires decimal mantissa"}, + {token.FLOAT, "0o1p10", "0o1p10", "'p' exponent requires hexadecimal mantissa"}, + {token.IMAG, "0o10i", "0o10i", "invalid suffix 'i' on octal literal"}, + + // 0-octals + {token.INT, "0", "0", ""}, + {token.INT, "0123", "0123", ""}, + + {token.INT, "08123", "08123", "invalid digit '8' in octal literal"}, + {token.INT, "01293", "01293", "invalid digit '9' in octal literal"}, + {token.INT, "0F.", "0 F .", ""}, // only accept 0-9 + {token.INT, "0123F.", "0123 F .", ""}, + {token.INT, "0123456x", "0123456 x", ""}, + + // decimals + {token.INT, "1", "1", ""}, + {token.INT, "1234", "1234", ""}, + + {token.INT, "1f", "1 f", ""}, // only accept 0-9 + + // decimal floats + {token.FLOAT, "0.", "0.", ""}, + {token.FLOAT, "123.", "123.", ""}, + {token.FLOAT, "0123.", "0123.", ""}, + + {token.FLOAT, ".0", ".0", ""}, + {token.FLOAT, ".123", ".123", ""}, + {token.FLOAT, ".0123", ".0123", ""}, + + {token.FLOAT, "0.0", "0.0", ""}, + {token.FLOAT, "123.123", "123.123", ""}, + {token.FLOAT, "0123.0123", "0123.0123", ""}, + + {token.FLOAT, "0e0", "0e0", ""}, + {token.FLOAT, "123e+0", "123e+0", ""}, + {token.FLOAT, "0123E-1", "0123E-1", ""}, + + {token.FLOAT, "0.e+1", "0.e+1", ""}, + {token.FLOAT, "123.E-10", "123.E-10", ""}, + {token.FLOAT, "0123.e123", "0123.e123", ""}, + + {token.FLOAT, ".0e-1", ".0e-1", ""}, + {token.FLOAT, ".123E+10", ".123E+10", ""}, + {token.FLOAT, ".0123E123", ".0123E123", ""}, + + {token.FLOAT, "0.0e1", "0.0e1", ""}, + {token.FLOAT, "123.123E-10", "123.123E-10", ""}, + {token.FLOAT, "0123.0123e+456", "0123.0123e+456", ""}, + + {token.FLOAT, "0e", "0e", "exponent has no digits"}, + {token.FLOAT, "0E+", "0E+", "exponent has no digits"}, + {token.FLOAT, "1e+f", "1e+ f", "exponent has no digits"}, + {token.FLOAT, "0p0", "0p0", "'p' exponent requires hexadecimal mantissa"}, + {token.FLOAT, "1.0P-1", "1.0P-1", "'P' exponent requires hexadecimal mantissa"}, + + // decimal imaginaries + {token.IMAG, "0.i", "0.i", ""}, + {token.IMAG, ".123i", ".123i", ""}, + {token.IMAG, "123.123i", "123.123i", ""}, + {token.IMAG, "123e+0i", "123e+0i", ""}, + {token.IMAG, "123.E-10i", "123.E-10i", ""}, + {token.IMAG, ".123E+10i", ".123E+10i", ""}, + + // hexadecimals + {token.INT, "0x0", "0x0", ""}, + {token.INT, "0x1234", "0x1234", ""}, + {token.INT, "0xcafef00d", "0xcafef00d", ""}, + {token.INT, "0XCAFEF00D", "0XCAFEF00D", ""}, + + {token.INT, "0x", "0x", "hexadecimal literal has no digits"}, + {token.INT, "0x1g", "0x1 g", ""}, + + // hexadecimal floats + {token.FLOAT, "0x0p0", "0x0p0", ""}, + {token.FLOAT, "0x12efp-123", "0x12efp-123", ""}, + {token.FLOAT, "0xABCD.p+0", "0xABCD.p+0", ""}, + {token.FLOAT, "0x.0189P-0", "0x.0189P-0", ""}, + {token.FLOAT, "0x1.ffffp+1023", "0x1.ffffp+1023", ""}, + + {token.FLOAT, "0x.", "0x.", "hexadecimal literal has no digits"}, + {token.FLOAT, "0x0.", "0x0.", "hexadecimal mantissa requires a 'p' exponent"}, + {token.FLOAT, "0x.0", "0x.0", "hexadecimal mantissa requires a 'p' exponent"}, + {token.FLOAT, "0x1.1", "0x1.1", "hexadecimal mantissa requires a 'p' exponent"}, + {token.FLOAT, "0x1.1e0", "0x1.1e0", "hexadecimal mantissa requires a 'p' exponent"}, + {token.FLOAT, "0x1.2gp1a", "0x1.2 gp1a", "hexadecimal mantissa requires a 'p' exponent"}, + {token.FLOAT, "0x0p", "0x0p", "exponent has no digits"}, + {token.FLOAT, "0xeP-", "0xeP-", "exponent has no digits"}, + {token.FLOAT, "0x1234PAB", "0x1234P AB", "exponent has no digits"}, + {token.FLOAT, "0x1.2p1a", "0x1.2p1 a", ""}, + + // hexadecimal imaginaries (invalid) + {token.IMAG, "0xf00i", "0xf00i", "invalid suffix 'i' on hexadecimal literal"}, + {token.IMAG, "0xf00.bap+12i", "0xf00.bap+12i", "invalid suffix 'i' on hexadecimal literal"}, + + // separators + {token.INT, "0b_1000_0001", "0b_1000_0001", ""}, + {token.INT, "0o_600", "0o_600", ""}, + {token.INT, "0_466", "0_466", ""}, + {token.INT, "1_000", "1_000", ""}, + {token.FLOAT, "1_000.000_1", "1_000.000_1", ""}, + {token.IMAG, "10e+1_2_3i", "10e+1_2_3i", ""}, + {token.INT, "0x_f00d", "0x_f00d", ""}, + {token.FLOAT, "0x_f00d.0p1_2", "0x_f00d.0p1_2", ""}, + + {token.INT, "0b__1000", "0b__1000", "'_' must separate successive digits"}, + {token.INT, "0o60___0", "0o60___0", "'_' must separate successive digits"}, + {token.INT, "0466_", "0466_", "'_' must separate successive digits"}, + {token.FLOAT, "1_.", "1_.", "'_' must separate successive digits"}, + {token.FLOAT, "0._1", "0._1", "'_' must separate successive digits"}, + {token.FLOAT, "2.7_e0", "2.7_e0", "'_' must separate successive digits"}, + {token.IMAG, "10e+12_i", "10e+12_i", "'_' must separate successive digits"}, + {token.INT, "0x___0", "0x___0", "'_' must separate successive digits"}, + {token.FLOAT, "0x1.0_p0", "0x1.0_p0", "'_' must separate successive digits"}, + } { + var s Scanner + var err string + s.Init(fset.AddFile("", fset.Base(), len(test.src)), []byte(test.src), func(_ token.Position, msg string) { + if err == "" { + err = msg + } + }, 0) + for i, want := range strings.Split(test.tokens, " ") { + err = "" + _, tok, lit := s.Scan() + + // compute lit where for tokens where lit is not defined + switch tok { + case token.PERIOD: + lit = "." + case token.ADD: + lit = "+" + case token.SUB: + lit = "-" + } + + if i == 0 { + if tok != test.tok { + t.Errorf("%q: got token %s; want %s", test.src, tok, test.tok) + } + if err != test.err { + t.Errorf("%q: got error %q; want %q", test.src, err, test.err) + } + } + + if lit != want { + t.Errorf("%q: got literal %q (%s); want %s", test.src, lit, tok, want) + } + } + + // make sure we read all + _, tok, _ := s.Scan() + if tok == token.SEMICOLON { + _, tok, _ = s.Scan() + } + if tok != token.EOF { + t.Errorf("%q: got %s; want EOF", test.src, tok) + } + } +}