diff options
author | Robert Griesemer <gri@golang.org> | 2019-02-05 12:42:46 -0800 |
---|---|---|
committer | Robert Griesemer <gri@golang.org> | 2019-02-11 23:26:17 +0000 |
commit | 710417bc92af19379101acbcd4e0f79dba38c891 (patch) | |
tree | eff06388438b6f7e98bb18bcd4fe6a09f171d399 /src/text | |
parent | 33ac854481c49632a4b924b184f83e068014b486 (diff) | |
download | go-git-710417bc92af19379101acbcd4e0f79dba38c891.tar.gz |
text/scanner: accept new Go2 number literals
This CL introduces text/scanner support for the new binary and octal integer
literals, hexadecimal floats, and digit separators for all number literals.
The new code is closely mirroring the respective code for number literals in
cmd/compile/internal/syntax/scanner.go.
Uniformly use the term "invalid" rather than "illegal" in error messages
to match the respective error messages in the other scanners directly.
R=Go1.13
Updates #12711.
Updates #19308.
Updates #28493.
Updates #29008.
Change-Id: I2f291de13ba5afc0e530cd8326e6bf4c3858ebac
Reviewed-on: https://go-review.googlesource.com/c/161199
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Diffstat (limited to 'src/text')
-rw-r--r-- | src/text/scanner/scanner.go | 252 | ||||
-rw-r--r-- | src/text/scanner/scanner_test.go | 228 |
2 files changed, 371 insertions, 109 deletions
diff --git a/src/text/scanner/scanner.go b/src/text/scanner/scanner.go index 62b3231e5e..38c27f6a08 100644 --- a/src/text/scanner/scanner.go +++ b/src/text/scanner/scanner.go @@ -266,7 +266,7 @@ func (s *Scanner) next() rune { s.srcPos += width s.lastCharLen = width s.column++ - s.error("illegal UTF-8 encoding") + s.error("invalid UTF-8 encoding") return ch } } @@ -281,7 +281,7 @@ func (s *Scanner) next() rune { switch ch { case 0: // for compatibility with other tools - s.error("illegal character NUL") + s.error("invalid character NUL") case '\n': s.line++ s.lastLineLen = s.column @@ -335,6 +335,10 @@ func (s *Scanner) error(msg string) { fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) } +func (s *Scanner) errorf(format string, args ...interface{}) { + s.error(fmt.Sprintf(format, args...)) +} + func (s *Scanner) isIdentRune(ch rune, i int) bool { if s.IsIdentRune != nil { return s.IsIdentRune(ch, i) @@ -351,95 +355,189 @@ func (s *Scanner) scanIdentifier() rune { return ch } -func digitVal(ch rune) int { - switch { - case '0' <= ch && ch <= '9': - return int(ch - '0') - case 'a' <= ch && ch <= 'f': - return int(ch - 'a' + 10) - case 'A' <= ch && ch <= 'F': - return int(ch - 'A' + 10) +func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter +func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } +func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' } + +// digits accepts the sequence { digit | '_' } starting with ch0. +// If base <= 10, digits accepts any decimal digit but records +// the first invalid digit >= base in *invalid if *invalid == 0. +// digits returns the first rune that is not part of the sequence +// anymore, and a bitset describing whether the sequence contained +// digits (bit 0 is set), or separators '_' (bit 1 is set). +func (s *Scanner) digits(ch0 rune, base int, invalid *rune) (ch rune, digsep int) { + ch = ch0 + if base <= 10 { + max := rune('0' + base) + for isDecimal(ch) || ch == '_' { + ds := 1 + if ch == '_' { + ds = 2 + } else if ch >= max && *invalid == 0 { + *invalid = ch + } + digsep |= ds + ch = s.next() + } + } else { + for isHex(ch) || ch == '_' { + ds := 1 + if ch == '_' { + ds = 2 + } + digsep |= ds + ch = s.next() + } } - return 16 // larger than any legal digit val + return } -func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } +func (s *Scanner) scanNumber(ch rune, integerPart bool) (rune, rune) { + base := 10 // number base + prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b' + digsep := 0 // bit 0: digit present, bit 1: '_' present + invalid := rune(0) // invalid digit in literal, or 0 + + // integer part + var tok rune + var ds int + if integerPart { + tok = Int + if ch == '0' { + ch = s.next() + switch lower(ch) { + case 'x': + ch = s.next() + base, prefix = 16, 'x' + case 'o': + ch = s.next() + base, prefix = 8, 'o' + case 'b': + ch = s.next() + base, prefix = 2, 'b' + default: + base, prefix = 8, '0' + digsep = 1 // leading 0 + } + } + ch, ds = s.digits(ch, base, &invalid) + digsep |= ds + } -func (s *Scanner) scanMantissa(ch rune) rune { - for isDecimal(ch) { - ch = s.next() + // fractional part + if !integerPart || ch == '.' { + tok = Float + if prefix == 'o' || prefix == 'b' { + s.error("invalid radix point in " + litname(prefix)) + } + if ch == '.' { + ch = s.next() + } + ch, ds = s.digits(ch, base, &invalid) + digsep |= ds } - return ch -} -func (s *Scanner) scanFraction(ch rune) rune { - if ch == '.' { - ch = s.scanMantissa(s.next()) + if digsep&1 == 0 { + s.error(litname(prefix) + " has no digits") } - return ch -} -func (s *Scanner) scanExponent(ch rune) rune { - if ch == 'e' || ch == 'E' { + // exponent + if e := lower(ch); e == 'e' || e == 'p' { + switch { + case e == 'e' && prefix != 0 && prefix != '0': + s.errorf("%q exponent requires decimal mantissa", ch) + case e == 'p' && prefix != 'x': + s.errorf("%q exponent requires hexadecimal mantissa", ch) + } ch = s.next() - if ch == '-' || ch == '+' { + tok = Float + if ch == '+' || ch == '-' { ch = s.next() } - if !isDecimal(ch) { - s.error("illegal exponent") + ch, ds = s.digits(ch, 10, nil) + digsep |= ds + if ds&1 == 0 { + s.error("exponent has no digits") } - ch = s.scanMantissa(ch) + } else if prefix == 'x' && tok == Float { + s.error("hexadecimal mantissa requires a 'p' exponent") } - return ch + + if tok == Int && invalid != 0 { + s.errorf("invalid digit %q in %s", invalid, litname(prefix)) + } + + if digsep&2 != 0 { + s.tokEnd = s.srcPos - s.lastCharLen // make sure token text is terminated + if i := invalidSep(s.TokenText()); i >= 0 { + s.error("'_' must separate successive digits") + } + } + + return tok, ch } -func (s *Scanner) scanNumber(ch rune) (rune, rune) { - // isDecimal(ch) - if ch == '0' { - // int or float - ch = s.next() - if ch == 'x' || ch == 'X' { - // hexadecimal int - ch = s.next() - hasMantissa := false - for digitVal(ch) < 16 { - ch = s.next() - hasMantissa = true - } - if !hasMantissa { - s.error("illegal hexadecimal number") - } - } else { - // octal int or float - has8or9 := false - for isDecimal(ch) { - if ch > '7' { - has8or9 = true - } - ch = s.next() - } - if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { - // float - ch = s.scanFraction(ch) - ch = s.scanExponent(ch) - return Float, ch +func litname(prefix rune) string { + switch prefix { + default: + return "decimal literal" + case 'x': + return "hexadecimal literal" + case 'o', '0': + return "octal literal" + case 'b': + return "binary literal" + } +} + +// invalidSep returns the index of the first invalid separator in x, or -1. +func invalidSep(x string) int { + x1 := ' ' // prefix char, we only care if it's 'x' + d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else) + i := 0 + + // a prefix counts as a digit + if len(x) >= 2 && x[0] == '0' { + x1 = lower(rune(x[1])) + if x1 == 'x' || x1 == 'o' || x1 == 'b' { + d = '0' + i = 2 + } + } + + // mantissa and exponent + for ; i < len(x); i++ { + p := d // previous digit + d = rune(x[i]) + switch { + case d == '_': + if p != '0' { + return i } - // octal int - if has8or9 { - s.error("illegal octal number") + case isDecimal(d) || x1 == 'x' && isHex(d): + d = '0' + default: + if p == '_' { + return i - 1 } + d = '.' } - return Int, ch } - // decimal int or float - ch = s.scanMantissa(ch) - if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { - // float - ch = s.scanFraction(ch) - ch = s.scanExponent(ch) - return Float, ch + if d == '_' { + return len(x) - 1 + } + + return -1 +} + +func digitVal(ch rune) int { + switch { + case '0' <= ch && ch <= '9': + return int(ch - '0') + case 'a' <= lower(ch) && lower(ch) <= 'f': + return int(lower(ch) - 'a' + 10) } - return Int, ch + return 16 // larger than any legal digit val } func (s *Scanner) scanDigits(ch rune, base, n int) rune { @@ -448,7 +546,7 @@ func (s *Scanner) scanDigits(ch rune, base, n int) rune { n-- } if n > 0 { - s.error("illegal char escape") + s.error("invalid char escape") } return ch } @@ -468,7 +566,7 @@ func (s *Scanner) scanEscape(quote rune) rune { case 'U': ch = s.scanDigits(s.next(), 16, 8) default: - s.error("illegal char escape") + s.error("invalid char escape") } return ch } @@ -503,7 +601,7 @@ func (s *Scanner) scanRawString() { func (s *Scanner) scanChar() { if s.scanString('\'') != 1 { - s.error("illegal char literal") + s.error("invalid char literal") } } @@ -584,7 +682,7 @@ redo: } case isDecimal(ch): if s.Mode&(ScanInts|ScanFloats) != 0 { - tok, ch = s.scanNumber(ch) + tok, ch = s.scanNumber(ch, true) } else { ch = s.next() } @@ -607,9 +705,7 @@ redo: case '.': ch = s.next() if isDecimal(ch) && s.Mode&ScanFloats != 0 { - tok = Float - ch = s.scanMantissa(ch) - ch = s.scanExponent(ch) + tok, ch = s.scanNumber(ch, false) } case '/': ch = s.next() diff --git a/src/text/scanner/scanner_test.go b/src/text/scanner/scanner_test.go index e7539a058b..58db8e1971 100644 --- a/src/text/scanner/scanner_test.go +++ b/src/text/scanner/scanner_test.go @@ -290,11 +290,11 @@ func TestScan(t *testing.T) { testScan(t, GoTokens&^SkipComments) } -func TestIllegalExponent(t *testing.T) { +func TestInvalidExponent(t *testing.T) { const src = "1.5e 1.5E 1e+ 1e- 1.5z" s := new(Scanner).Init(strings.NewReader(src)) s.Error = func(s *Scanner, msg string) { - const want = "illegal exponent" + const want = "exponent has no digits" if msg != want { t.Errorf("%s: got error %q; want %q", s.TokenText(), msg, want) } @@ -378,7 +378,7 @@ func TestScanSelectedMask(t *testing.T) { testScanSelectedMode(t, 0, 0) testScanSelectedMode(t, ScanIdents, Ident) // Don't test ScanInts and ScanNumbers since some parts of - // the floats in the source look like (illegal) octal ints + // the floats in the source look like (invalid) octal ints // and ScanNumbers may return either Int or Float. testScanSelectedMode(t, ScanChars, Char) testScanSelectedMode(t, ScanStrings, String) @@ -480,34 +480,34 @@ func testError(t *testing.T, src, pos, msg string, tok rune) { } func TestError(t *testing.T) { - testError(t, "\x00", "<input>:1:1", "illegal character NUL", 0) - testError(t, "\x80", "<input>:1:1", "illegal UTF-8 encoding", utf8.RuneError) - testError(t, "\xff", "<input>:1:1", "illegal UTF-8 encoding", utf8.RuneError) - - testError(t, "a\x00", "<input>:1:2", "illegal character NUL", Ident) - testError(t, "ab\x80", "<input>:1:3", "illegal UTF-8 encoding", Ident) - testError(t, "abc\xff", "<input>:1:4", "illegal UTF-8 encoding", Ident) - - testError(t, `"a`+"\x00", "<input>:1:3", "illegal character NUL", String) - testError(t, `"ab`+"\x80", "<input>:1:4", "illegal UTF-8 encoding", String) - testError(t, `"abc`+"\xff", "<input>:1:5", "illegal UTF-8 encoding", String) - - testError(t, "`a"+"\x00", "<input>:1:3", "illegal character NUL", RawString) - testError(t, "`ab"+"\x80", "<input>:1:4", "illegal UTF-8 encoding", RawString) - testError(t, "`abc"+"\xff", "<input>:1:5", "illegal UTF-8 encoding", RawString) - - testError(t, `'\"'`, "<input>:1:3", "illegal char escape", Char) - testError(t, `"\'"`, "<input>:1:3", "illegal char escape", String) - - testError(t, `01238`, "<input>:1:6", "illegal octal number", Int) - testError(t, `01238123`, "<input>:1:9", "illegal octal number", Int) - testError(t, `0x`, "<input>:1:3", "illegal hexadecimal number", Int) - testError(t, `0xg`, "<input>:1:3", "illegal hexadecimal number", Int) - testError(t, `'aa'`, "<input>:1:4", "illegal char literal", Char) - testError(t, `1.5e`, "<input>:1:5", "illegal exponent", Float) - testError(t, `1.5E`, "<input>:1:5", "illegal exponent", Float) - testError(t, `1.5e+`, "<input>:1:6", "illegal exponent", Float) - testError(t, `1.5e-`, "<input>:1:6", "illegal exponent", Float) + testError(t, "\x00", "<input>:1:1", "invalid character NUL", 0) + testError(t, "\x80", "<input>:1:1", "invalid UTF-8 encoding", utf8.RuneError) + testError(t, "\xff", "<input>:1:1", "invalid UTF-8 encoding", utf8.RuneError) + + testError(t, "a\x00", "<input>:1:2", "invalid character NUL", Ident) + testError(t, "ab\x80", "<input>:1:3", "invalid UTF-8 encoding", Ident) + testError(t, "abc\xff", "<input>:1:4", "invalid UTF-8 encoding", Ident) + + testError(t, `"a`+"\x00", "<input>:1:3", "invalid character NUL", String) + testError(t, `"ab`+"\x80", "<input>:1:4", "invalid UTF-8 encoding", String) + testError(t, `"abc`+"\xff", "<input>:1:5", "invalid UTF-8 encoding", String) + + testError(t, "`a"+"\x00", "<input>:1:3", "invalid character NUL", RawString) + testError(t, "`ab"+"\x80", "<input>:1:4", "invalid UTF-8 encoding", RawString) + testError(t, "`abc"+"\xff", "<input>:1:5", "invalid UTF-8 encoding", RawString) + + testError(t, `'\"'`, "<input>:1:3", "invalid char escape", Char) + testError(t, `"\'"`, "<input>:1:3", "invalid char escape", String) + + testError(t, `01238`, "<input>:1:6", "invalid digit '8' in octal literal", Int) + testError(t, `01238123`, "<input>:1:9", "invalid digit '8' in octal literal", Int) + testError(t, `0x`, "<input>:1:3", "hexadecimal literal has no digits", Int) + testError(t, `0xg`, "<input>:1:3", "hexadecimal literal has no digits", Int) + testError(t, `'aa'`, "<input>:1:4", "invalid char literal", Char) + testError(t, `1.5e`, "<input>:1:5", "exponent has no digits", Float) + testError(t, `1.5E`, "<input>:1:5", "exponent has no digits", Float) + testError(t, `1.5e+`, "<input>:1:6", "exponent has no digits", Float) + testError(t, `1.5e-`, "<input>:1:6", "exponent has no digits", Float) testError(t, `'`, "<input>:1:2", "literal not terminated", Char) testError(t, `'`+"\n", "<input>:1:2", "literal not terminated", Char) @@ -711,3 +711,169 @@ func TestIssue29723(t *testing.T) { for r := s.Scan(); r != EOF; r = s.Scan() { } } + +func TestNumbers(t *testing.T) { + for _, test := range []struct { + tok rune + src, tokens, err string + }{ + // binaries + {Int, "0b0", "0b0", ""}, + {Int, "0b1010", "0b1010", ""}, + {Int, "0B1110", "0B1110", ""}, + + {Int, "0b", "0b", "binary literal has no digits"}, + {Int, "0b0190", "0b0190", "invalid digit '9' in binary literal"}, + {Int, "0b01a0", "0b01 a0", ""}, // only accept 0-9 + + // binary floats (invalid) + {Float, "0b.", "0b.", "invalid radix point in binary literal"}, + {Float, "0b.1", "0b.1", "invalid radix point in binary literal"}, + {Float, "0b1.0", "0b1.0", "invalid radix point in binary literal"}, + {Float, "0b1e10", "0b1e10", "'e' exponent requires decimal mantissa"}, + {Float, "0b1P-1", "0b1P-1", "'P' exponent requires hexadecimal mantissa"}, + + // octals + {Int, "0o0", "0o0", ""}, + {Int, "0o1234", "0o1234", ""}, + {Int, "0O1234", "0O1234", ""}, + + {Int, "0o", "0o", "octal literal has no digits"}, + {Int, "0o8123", "0o8123", "invalid digit '8' in octal literal"}, + {Int, "0o1293", "0o1293", "invalid digit '9' in octal literal"}, + {Int, "0o12a3", "0o12 a3", ""}, // only accept 0-9 + + // octal floats (invalid) + {Float, "0o.", "0o.", "invalid radix point in octal literal"}, + {Float, "0o.2", "0o.2", "invalid radix point in octal literal"}, + {Float, "0o1.2", "0o1.2", "invalid radix point in octal literal"}, + {Float, "0o1E+2", "0o1E+2", "'E' exponent requires decimal mantissa"}, + {Float, "0o1p10", "0o1p10", "'p' exponent requires hexadecimal mantissa"}, + + // 0-octals + {Int, "0", "0", ""}, + {Int, "0123", "0123", ""}, + + {Int, "08123", "08123", "invalid digit '8' in octal literal"}, + {Int, "01293", "01293", "invalid digit '9' in octal literal"}, + {Int, "0F.", "0 F .", ""}, // only accept 0-9 + {Int, "0123F.", "0123 F .", ""}, + {Int, "0123456x", "0123456 x", ""}, + + // decimals + {Int, "1", "1", ""}, + {Int, "1234", "1234", ""}, + + {Int, "1f", "1 f", ""}, // only accept 0-9 + + // decimal floats + {Float, "0.", "0.", ""}, + {Float, "123.", "123.", ""}, + {Float, "0123.", "0123.", ""}, + + {Float, ".0", ".0", ""}, + {Float, ".123", ".123", ""}, + {Float, ".0123", ".0123", ""}, + + {Float, "0.0", "0.0", ""}, + {Float, "123.123", "123.123", ""}, + {Float, "0123.0123", "0123.0123", ""}, + + {Float, "0e0", "0e0", ""}, + {Float, "123e+0", "123e+0", ""}, + {Float, "0123E-1", "0123E-1", ""}, + + {Float, "0.e+1", "0.e+1", ""}, + {Float, "123.E-10", "123.E-10", ""}, + {Float, "0123.e123", "0123.e123", ""}, + + {Float, ".0e-1", ".0e-1", ""}, + {Float, ".123E+10", ".123E+10", ""}, + {Float, ".0123E123", ".0123E123", ""}, + + {Float, "0.0e1", "0.0e1", ""}, + {Float, "123.123E-10", "123.123E-10", ""}, + {Float, "0123.0123e+456", "0123.0123e+456", ""}, + + {Float, "0e", "0e", "exponent has no digits"}, + {Float, "0E+", "0E+", "exponent has no digits"}, + {Float, "1e+f", "1e+ f", "exponent has no digits"}, + {Float, "0p0", "0p0", "'p' exponent requires hexadecimal mantissa"}, + {Float, "1.0P-1", "1.0P-1", "'P' exponent requires hexadecimal mantissa"}, + + // hexadecimals + {Int, "0x0", "0x0", ""}, + {Int, "0x1234", "0x1234", ""}, + {Int, "0xcafef00d", "0xcafef00d", ""}, + {Int, "0XCAFEF00D", "0XCAFEF00D", ""}, + + {Int, "0x", "0x", "hexadecimal literal has no digits"}, + {Int, "0x1g", "0x1 g", ""}, + + // hexadecimal floats + {Float, "0x0p0", "0x0p0", ""}, + {Float, "0x12efp-123", "0x12efp-123", ""}, + {Float, "0xABCD.p+0", "0xABCD.p+0", ""}, + {Float, "0x.0189P-0", "0x.0189P-0", ""}, + {Float, "0x1.ffffp+1023", "0x1.ffffp+1023", ""}, + + {Float, "0x.", "0x.", "hexadecimal literal has no digits"}, + {Float, "0x0.", "0x0.", "hexadecimal mantissa requires a 'p' exponent"}, + {Float, "0x.0", "0x.0", "hexadecimal mantissa requires a 'p' exponent"}, + {Float, "0x1.1", "0x1.1", "hexadecimal mantissa requires a 'p' exponent"}, + {Float, "0x1.1e0", "0x1.1e0", "hexadecimal mantissa requires a 'p' exponent"}, + {Float, "0x1.2gp1a", "0x1.2 gp1a", "hexadecimal mantissa requires a 'p' exponent"}, + {Float, "0x0p", "0x0p", "exponent has no digits"}, + {Float, "0xeP-", "0xeP-", "exponent has no digits"}, + {Float, "0x1234PAB", "0x1234P AB", "exponent has no digits"}, + {Float, "0x1.2p1a", "0x1.2p1 a", ""}, + + // separators + {Int, "0b_1000_0001", "0b_1000_0001", ""}, + {Int, "0o_600", "0o_600", ""}, + {Int, "0_466", "0_466", ""}, + {Int, "1_000", "1_000", ""}, + {Float, "1_000.000_1", "1_000.000_1", ""}, + {Int, "0x_f00d", "0x_f00d", ""}, + {Float, "0x_f00d.0p1_2", "0x_f00d.0p1_2", ""}, + + {Int, "0b__1000", "0b__1000", "'_' must separate successive digits"}, + {Int, "0o60___0", "0o60___0", "'_' must separate successive digits"}, + {Int, "0466_", "0466_", "'_' must separate successive digits"}, + {Float, "1_.", "1_.", "'_' must separate successive digits"}, + {Float, "0._1", "0._1", "'_' must separate successive digits"}, + {Float, "2.7_e0", "2.7_e0", "'_' must separate successive digits"}, + {Int, "0x___0", "0x___0", "'_' must separate successive digits"}, + {Float, "0x1.0_p0", "0x1.0_p0", "'_' must separate successive digits"}, + } { + s := new(Scanner).Init(strings.NewReader(test.src)) + var err string + s.Error = func(s *Scanner, msg string) { + if err == "" { + err = msg + } + } + + for i, want := range strings.Split(test.tokens, " ") { + err = "" + tok := s.Scan() + lit := s.TokenText() + if i == 0 { + if tok != test.tok { + t.Errorf("%q: got token %s; want %s", test.src, TokenString(tok), TokenString(test.tok)) + } + if err != test.err { + t.Errorf("%q: got error %q; want %q", test.src, err, test.err) + } + } + if lit != want { + t.Errorf("%q: got literal %q (%s); want %s", test.src, lit, TokenString(tok), want) + } + } + + // make sure we read all + if tok := s.Scan(); tok != EOF { + t.Errorf("%q: got %s; want EOF", test.src, TokenString(tok)) + } + } +} |