cmd/compile/internal/syntax: implement buffered reading from io.Reader

No performance impact: $ go test -run StdLib -fast parsed 1073074 lines (2823 files) in 575.606804ms (1864248 lines/s) allocated 263.956Mb (458.570Mb/s) PASS
author: Robert Griesemer <gri@golang.org> 2016-03-10 13:25:09 -0800
committer: Matthew Dempsky <mdempsky@google.com> 2016-08-16 10:48:01 -0700
commit: c7cc983097d21f1ed3ad07672052f37a431a1192 (patch)
tree: 5362f29f02783f79ceb43b14b5c156d90654a00e
parent: a85b9c5467b778f765504df330b7dec1ee1a1c97 (diff)
download: go-git-c7cc983097d21f1ed3ad07672052f37a431a1192.tar.gz
5 files changed, 147 insertions, 295 deletions
diff --git a/src/cmd/compile/internal/syntax/nodes.go b/src/cmd/compile/internal/syntax/nodes.go
index 7632f95348..25eee38428 100644
--- a/src/cmd/compile/internal/syntax/nodes.go
+++ b/src/cmd/compile/internal/syntax/nodes.go
@@ -89,7 +89,7 @@ func (decl) aDecl() {}
 
 // All declarations belonging to the same group point to the same Group node.
 type Group struct {
-	dummy int // make sure it's not an empty node
+	dummy int // not empty so we are guaranteed different Group instances
 }
 
 // ----------------------------------------------------------------------------
diff --git a/src/cmd/compile/internal/syntax/scanner.go b/src/cmd/compile/internal/syntax/scanner.go
index 75bf1db303..35e9b2cd1a 100644
--- a/src/cmd/compile/internal/syntax/scanner.go
+++ b/src/cmd/compile/internal/syntax/scanner.go
@@ -6,6 +6,7 @@ package syntax
 
 import (
 	"fmt"
+	"io"
 	"unicode"
 	"unicode/utf8"
 )
@@ -22,7 +23,7 @@ type scanner struct {
 	prec      int      // valid if tok is _Operator
 }
 
-func (s *scanner) init(src []byte) {
+func (s *scanner) init(src io.Reader) {
 	s.source.init(src)
 	s.nlsemi = false
 }
@@ -39,7 +40,7 @@ redo:
 	}
 
 	// token start
-	s.pos, s.line = s.source.pos, s.source.line
+	s.pos, s.line = s.source.pos(), s.source.line
 
 	if isLetter(c) || c >= utf8.RuneSelf && unicode.IsLetter(c) {
 		s.ident()
@@ -127,7 +128,7 @@ redo:
 				break
 			}
 			s.ungetr()
-			s.oldpos-- // make next ungetr work (line cannot have changed)
+			s.source.r0-- // make next ungetr work (line cannot have changed)
 		}
 		s.ungetr()
 		s.tok = _Dot
@@ -274,7 +275,9 @@ redo:
 
 	default:
 		s.tok = 0
-		panic(0)
+		fmt.Printf("invalid rune %q\n", c)
+		panic("invalid rune")
+		goto redo
 	}
 
 	return
@@ -357,11 +360,12 @@ func (s *scanner) number(c rune) {
 			if c == 'x' || c == 'X' {
 				// hex
 				c = s.getr()
-				pos := s.source.pos
+				hasDigit := false
 				for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 					c = s.getr()
+					hasDigit = true
 				}
-				if pos == s.source.pos {
+				if !hasDigit {
 					panic("malformed hex constant")
 				}
 				s.ungetr()
@@ -500,17 +504,16 @@ var pragmas = map[string]bool{
 
 func (s *scanner) lineComment() {
 	// recognize pragmas
-	start := s.source.pos
 	r := s.getr()
 	switch r {
 	case 'g':
 		r = s.match(r, "go:")
 		if r < 0 {
-			m := string(s.buf[start+3 : s.source.pos])
-			if pragmas[m] {
-				// TODO(gri) record pragma
-				//println(m)
-			}
+			// m := string(s.buf[start+3 : s.source.pos()])
+			// if pragmas[m] {
+			// 	// TODO(gri) record pragma
+			// 	//println(m)
+			// }
 			return
 		}
 	case 'l':
diff --git a/src/cmd/compile/internal/syntax/scanner_test.go b/src/cmd/compile/internal/syntax/scanner_test.go
index 0e90812d4b..9268cb35e0 100644
--- a/src/cmd/compile/internal/syntax/scanner_test.go
+++ b/src/cmd/compile/internal/syntax/scanner_test.go
@@ -6,15 +6,17 @@ package syntax
 
 import (
 	"fmt"
-	"io/ioutil"
+	"os"
 	"testing"
 )
 
 func TestScanner(t *testing.T) {
-	src, err := ioutil.ReadFile("parser.go")
+	src, err := os.Open("parser.go")
 	if err != nil {
 		t.Fatal(err)
 	}
+	defer src.Close()
+
 	var s scanner
 	s.init(src)
 	for {
diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go
index 8fd7687da2..07b59ab298 100644
--- a/src/cmd/compile/internal/syntax/source.go
+++ b/src/cmd/compile/internal/syntax/source.go
@@ -9,304 +9,141 @@ import (
 	"unicode/utf8"
 )
 
+// buf [...read...|...|...unread...|s|...free...]
+//         ^      ^   ^            ^
+//         |      |   |            |
+//        suf     r0  r            w
+
 type source struct {
 	src io.Reader
-	end int
 
-	buf             []byte
-	litbuf          []byte
-	pos, line       int
-	oldpos, oldline int
-	pin             int
-}
+	// source buffer
+	buf         [4 << 10]byte
+	offs        int   // source offset of buf
+	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
+	line0, line int   // previous/current line
+	err         error // pending io error
 
-func (s *source) init(src []byte) {
-	s.buf = append(src, utf8.RuneSelf) // terminate with sentinel
-	s.pos = 0
-	s.line = 1
-	s.oldline = 1
-}
-
-func (s *source) ungetr() {
-	s.pos, s.line = s.oldpos, s.oldline
+	// literal buffer
+	lit []byte // literal prefix
+	suf int    // literal suffix; suf >= 0 means we are scanning a literal
 }
 
-func (s *source) getr() rune {
-redo:
-	s.oldpos, s.oldline = s.pos, s.line
-
-	// common case: 7bit ASCII
-	if b := s.buf[s.pos]; b < utf8.RuneSelf {
-		s.pos++
-		if b == 0 {
-			panic("invalid NUL byte")
-			goto redo // (or return 0?)
-		}
-		if b == '\n' {
-			s.line++
-		}
-		return rune(b)
-	}
-
-	// uncommon case: not ASCII or not enough bytes
-	r, w := utf8.DecodeRune(s.buf[s.pos:])
-	s.pos += w
-	if r == utf8.RuneError && w == 1 {
-		if s.pos >= len(s.buf) {
-			s.ungetr() // so next getr also returns EOF
-			return -1  // EOF
-		}
-		panic("invalid Unicode character")
-		goto redo
-	}
+func (s *source) init(src io.Reader) {
+	s.src = src
+	s.buf[0] = utf8.RuneSelf // terminate with sentinel
+	s.offs = 0
+	s.r0, s.r, s.w = 0, 0, 0
+	s.line0, s.line = 1, 1
+	s.err = nil
 
-	// BOM's are only allowed as the first character in a file
-	const BOM = 0xfeff
-	if r == BOM && s.oldpos > 0 {
-		panic("invalid BOM in the middle of the file")
-		goto redo
-	}
-
-	return r
+	s.lit = s.lit[:0]
+	s.suf = -1
 }
 
-// TODO(gri) enable this one
-func (s *source) getr_() rune {
-redo:
-	s.oldpos, s.oldline = s.pos, s.line
-
-	// common case: 7bit ASCII
-	if b := s.buf[s.pos]; b < utf8.RuneSelf {
-		s.pos++
-		if b == 0 {
-			panic("invalid NUL byte")
-			goto redo // (or return 0?)
-		}
-		if b == '\n' {
-			s.line++
-		}
-		return rune(b)
-	}
-
-	// uncommon case: not ASCII or not enough bytes
-	r, w := utf8.DecodeRune(s.buf[s.pos:s.end])
-	if r == utf8.RuneError && w == 1 {
-		if s.refill() {
-			goto redo
-		}
-		// TODO(gri) carefull: this depends on whether s.end includes sentinel or not
-		if s.pos < s.end {
-			panic("invalid Unicode character")
-			goto redo
-		}
-		// EOF
-		return -1
-	}
-
-	s.pos += w
-
-	// BOM's are only allowed as the first character in a file
-	const BOM = 0xfeff
-	if r == BOM && s.oldpos > 0 {
-		panic("invalid BOM in the middle of the file")
-		goto redo
-	}
-
-	return r
+func (s *source) pos() int {
+	return s.offs + s.r
 }
 
-func (s *source) refill() bool {
-	for s.pos+utf8.UTFMax > s.end && !utf8.FullRune(s.buf[s.pos:s.end]) {
-		// not enough bytes
-
-		// save literal prefix if any
-		if s.pin >= 0 {
-			s.litbuf = append(s.litbuf, s.buf[s.pin:s.oldpos]...)
-			s.pin = 0
-		}
+func (s *source) ungetr() {
+	s.r, s.line = s.r0, s.line0
+}
 
-		// move unread bytes to beginning of buffer
-		copy(s.buf[0:], s.buf[s.oldpos:s.end])
-		// read more bytes
-		// (an io.Reader must return io.EOF when it reaches
-		// the end of what it is reading - simply returning
-		// n == 0 will make this loop retry forever; but the
-		// error is in the reader implementation in that case)
-		// TODO(gri) check for it and return io.ErrNoProgress?
-		// (see also bufio.go:666)
-		i := s.end - s.oldpos
-		n, err := s.src.Read(s.buf[i : len(s.buf)-1])
-		s.pos -= s.oldpos
-		s.oldpos = 0
-		s.end = i + n
-		s.buf[s.end] = utf8.RuneSelf // sentinel
-		if err != nil {
-			if s.pos == s.end {
-				return false // EOF
+func (s *source) getr() rune {
+	for {
+		s.r0, s.line0 = s.r, s.line
+
+		// common case: ASCII and enough bytes
+		if b := s.buf[s.r]; b < utf8.RuneSelf {
+			s.r++
+			if b == 0 {
+				panic("invalid NUL character")
+				continue
 			}
-			if err != io.EOF {
-				panic(err) // TODO(gri) fix this
+			if b == '\n' {
+				s.line++
 			}
-			// If err == EOF, we won't be getting more
-			// bytes; break to avoid infinite loop. If
-			// err is something else, we don't know if
-			// we can get more bytes; thus also break.
-			break
-		}
-	}
-	return true
-}
-
-func (s *source) startLit() {
-	s.litbuf = s.litbuf[:0]
-	s.pin = s.oldpos
-}
-
-func (s *source) stopLit() string {
-	return string(s.buf[s.pin:s.pos])
-
-	lit := s.buf[s.pin:s.pos]
-	s.pin = -1
-	if len(s.litbuf) > 0 {
-		s.litbuf = append(s.litbuf, lit...)
-		lit = s.litbuf
-	}
-
-	return string(lit)
-}
-
-/*
-// getr reads and returns the next Unicode character. It is designed such
-// that only a minimal amount of work needs to be done in the common ASCII
-// case (a single test to check for both ASCII and end-of-buffer, and one
-// test each to check for NUL and to count newlines).
-func (s *scanner) getr() rune {
-	// unread rune != 0 available
-	if r := s.peekr1; r != 0 {
-		s.peekr1 = s.peekr2
-		s.peekr2 = 0
-		if r == '\n' && importpkg == nil {
-			lexlineno++
-		}
-		return r
-	}
-
-redo:
-	// common case: 7bit ASCII
-	if b := s.buf[s.pos]; b < utf8.RuneSelf {
-		s.pos++
-		if b == 0 {
-			// TODO(gri) do we need lineno = lexlineno here?
-			Yyerror("illegal NUL byte")
-			return 0
-		}
-		if b == '\n' && importpkg == nil {
-			lexlineno++
+			return rune(b)
+		}
+
+		// uncommon case: not ASCII or not enough bytes
+		r, w := utf8.DecodeRune(s.buf[s.r:s.w]) // optimistically assume valid rune
+		if r != utf8.RuneError || w > 1 {
+			s.r += w
+			// BOM's are only allowed as the first character in a file
+			const BOM = 0xfeff
+			if r == BOM && s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
+				panic("invalid BOM in the middle of the file")
+				continue
+			}
+			return r
 		}
-		return rune(b)
-	}
 
-	// uncommon case: not ASCII or not enough bytes
-	for s.pos+utf8.UTFMax > s.end && !utf8.FullRune(s.buf[s.pos:s.end]) {
-		// not enough bytes: read some more, but first
-		// move unread bytes to beginning of buffer
-		copy(s.buf[0:], s.buf[s.pos:s.end])
-		// read more bytes
-		// (an io.Reader must return io.EOF when it reaches
-		// the end of what it is reading - simply returning
-		// n == 0 will make this loop retry forever; but the
-		// error is in the reader implementation in that case)
-		// TODO(gri) check for it an return io.ErrNoProgress?
-		// (see also bufio.go:666)
-		i := s.end - s.pos
-		n, err := s.src.Read(s.buf[i : len(s.buf)-1])
-		s.pos = 0
-		s.end = i + n
-		s.buf[s.end] = utf8.RuneSelf // sentinel
-		if err != nil {
-			if s.end == 0 {
-				return EOF
+		if w == 0 && s.err != nil {
+			if s.err != io.EOF {
+				panic(s.err)
 			}
-			if err != io.EOF {
-				panic(err) // TODO(gri) fix this
-			}
-			// If err == EOF, we won't be getting more
-			// bytes; break to avoid infinite loop. If
-			// err is something else, we don't know if
-			// we can get more bytes; thus also break.
-			break
+			return -1
 		}
-	}
 
-	// we have at least one byte (excluding sentinel)
-	// common case: 7bit ASCII
-	if b := s.buf[s.pos]; b < utf8.RuneSelf {
-		s.pos++
-		if b == 0 {
-			// TODO(gri) do we need lineno = lexlineno here?
-			Yyerror("illegal NUL byte")
-			return 0
-		}
-		if b == '\n' && importpkg == nil {
-			lexlineno++
+		if w == 1 && (s.r+utf8.UTFMax <= s.w || utf8.FullRune(s.buf[s.r:s.w])) {
+			s.r++
+			panic("invalid UTF-8 encoding")
+			continue
 		}
-		return rune(b)
-	}
 
-	// uncommon case: not ASCII
-	r, w := utf8.DecodeRune(s.buf[s.pos:s.end])
-	s.pos += w
-	if r == utf8.RuneError && w == 1 {
-		lineno = lexlineno
-		// The string conversion here makes a copy for passing
-		// to fmt.Printf, so that buf itself does not escape and
-		// can be allocated on the stack.
-		Yyerror("illegal UTF-8 sequence %x", r)
+		s.fill()
 	}
+}
 
-	if r == BOM {
-		yyerrorl(int(lexlineno), "Unicode (UTF-8) BOM in middle of file")
-		goto redo
+func (s *source) fill() {
+	// Slide unread bytes to beginning but preserve last read char
+	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
+	// call, only for ".." character sequence).
+	if s.r0 > 1 {
+		// save literal prefix, if any
+		// (We see at most one ungetr call while reading
+		// a literal, so make sure s.r0 remains in buf.)
+		if s.suf >= 0 {
+			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
+			s.suf = 1 // == s.r0 after slide below
+		}
+		s.offs += s.r0 - 1
+		r := s.r - s.r0 + 1 // last read char plus one byte
+		s.w = r + copy(s.buf[r:], s.buf[s.r:s.w])
+		s.r = r
+		s.r0 = 1
+	}
+
+	// read more data: try a limited number of times
+	for i := 100; i > 0; i-- {
+		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
+		if n < 0 {
+			panic("negative read")
+		}
+		s.w += n
+		if n > 0 || err != nil {
+			s.buf[s.w] = utf8.RuneSelf // sentinel
+			if err != nil {
+				s.err = err
+			}
+			return
+		}
 	}
 
-	return r
-}
-
-// pos returns the position of the most recently read character s.ch.
-func (s *Scanner) pos() Offset {
-	// TODO(gri) consider replacing lastCharLen with chPos or equivalent
-	return Offset(s.srcBufOffset + s.srcPos - s.chLen)
+	panic("no progress")
 }
 
-func (s *Scanner) startLiteral() {
-	s.symBuf = s.symBuf[:0]
-	s.symPos = s.srcPos - s.chLen
+func (s *source) startLit() {
+	s.suf = s.r0
+	s.lit = s.lit[:0] // reuse lit
 }
 
-func (s *Scanner) stopLiteral(stripCR bool) string {
-	symEnd := s.srcPos - s.chLen
-
-	lit := s.srcBuf[s.symPos:symEnd]
-	s.symPos = -1
-	if len(s.symBuf) > 0 {
-		// part of the symbol text was saved in symBuf: save the rest in
-		// symBuf as well and return its content
-		s.symBuf = append(s.symBuf, lit...)
-		lit = s.symBuf
-	}
-
-	if stripCR {
-		c := make([]byte, len(lit))
-		i := 0
-		for _, ch := range lit {
-			if ch != '\r' {
-				c[i] = ch
-				i++
-			}
-		}
-		lit = c[:i]
+func (s *source) stopLit() string {
+	lit := s.buf[s.suf:s.r]
+	if len(s.lit) > 0 {
+		lit = append(s.lit, lit...)
 	}
-
+	s.suf = -1 // no pending literal
 	return string(lit)
 }
-*/
diff --git a/src/cmd/compile/internal/syntax/syntax.go b/src/cmd/compile/internal/syntax/syntax.go
index 8dde0703a3..4df2eb5fe2 100644
--- a/src/cmd/compile/internal/syntax/syntax.go
+++ b/src/cmd/compile/internal/syntax/syntax.go
@@ -7,7 +7,7 @@ package syntax
 import (
 	"fmt"
 	"io"
-	"io/ioutil"
+	"os"
 )
 
 type Mode uint
@@ -15,14 +15,32 @@ type Mode uint
 // TODO(gri) These need a lot more work.
 
 func ReadFile(filename string, mode Mode) (*File, error) {
-	src, err := ioutil.ReadFile(filename)
+	src, err := os.Open(filename)
 	if err != nil {
 		return nil, err
 	}
-	return ReadBytes(src, mode)
+	defer src.Close()
+	return Read(src, mode)
+}
+
+type bytesReader struct {
+	data []byte
+}
+
+func (r *bytesReader) Read(p []byte) (int, error) {
+	if len(r.data) > 0 {
+		n := copy(p, r.data)
+		r.data = r.data[n:]
+		return n, nil
+	}
+	return 0, io.EOF
 }
 
 func ReadBytes(src []byte, mode Mode) (*File, error) {
+	return Read(&bytesReader{src}, mode)
+}
+
+func Read(src io.Reader, mode Mode) (*File, error) {
 	var p parser
 	p.init(src)
 
@@ -41,14 +59,6 @@ func ReadBytes(src []byte, mode Mode) (*File, error) {
 	return ast, nil
 }
 
-func Read(r io.Reader, mode Mode) (*File, error) {
-	src, err := ioutil.ReadAll(r)
-	if err != nil {
-		return nil, err
-	}
-	return ReadBytes(src, mode)
-}
-
 func Write(w io.Writer, n *File) error {
 	panic("unimplemented")
 }
author	Robert Griesemer <gri@golang.org>	2016-03-10 13:25:09 -0800
committer	Matthew Dempsky <mdempsky@google.com>	2016-08-16 10:48:01 -0700
commit	c7cc983097d21f1ed3ad07672052f37a431a1192 (patch)
tree	5362f29f02783f79ceb43b14b5c156d90654a00e
parent	a85b9c5467b778f765504df330b7dec1ee1a1c97 (diff)
download	go-git-c7cc983097d21f1ed3ad07672052f37a431a1192.tar.gz