From c7cc983097d21f1ed3ad07672052f37a431a1192 Mon Sep 17 00:00:00 2001 From: Robert Griesemer Date: Thu, 10 Mar 2016 13:25:09 -0800 Subject: cmd/compile/internal/syntax: implement buffered reading from io.Reader No performance impact: $ go test -run StdLib -fast parsed 1073074 lines (2823 files) in 575.606804ms (1864248 lines/s) allocated 263.956Mb (458.570Mb/s) PASS --- src/cmd/compile/internal/syntax/nodes.go | 2 +- src/cmd/compile/internal/syntax/scanner.go | 27 +- src/cmd/compile/internal/syntax/scanner_test.go | 6 +- src/cmd/compile/internal/syntax/source.go | 375 +++++++----------------- src/cmd/compile/internal/syntax/syntax.go | 32 +- 5 files changed, 147 insertions(+), 295 deletions(-) diff --git a/src/cmd/compile/internal/syntax/nodes.go b/src/cmd/compile/internal/syntax/nodes.go index 7632f95348..25eee38428 100644 --- a/src/cmd/compile/internal/syntax/nodes.go +++ b/src/cmd/compile/internal/syntax/nodes.go @@ -89,7 +89,7 @@ func (decl) aDecl() {} // All declarations belonging to the same group point to the same Group node. type Group struct { - dummy int // make sure it's not an empty node + dummy int // not empty so we are guaranteed different Group instances } // ---------------------------------------------------------------------------- diff --git a/src/cmd/compile/internal/syntax/scanner.go b/src/cmd/compile/internal/syntax/scanner.go index 75bf1db303..35e9b2cd1a 100644 --- a/src/cmd/compile/internal/syntax/scanner.go +++ b/src/cmd/compile/internal/syntax/scanner.go @@ -6,6 +6,7 @@ package syntax import ( "fmt" + "io" "unicode" "unicode/utf8" ) @@ -22,7 +23,7 @@ type scanner struct { prec int // valid if tok is _Operator } -func (s *scanner) init(src []byte) { +func (s *scanner) init(src io.Reader) { s.source.init(src) s.nlsemi = false } @@ -39,7 +40,7 @@ redo: } // token start - s.pos, s.line = s.source.pos, s.source.line + s.pos, s.line = s.source.pos(), s.source.line if isLetter(c) || c >= utf8.RuneSelf && unicode.IsLetter(c) { s.ident() @@ -127,7 +128,7 @@ redo: break } s.ungetr() - s.oldpos-- // make next ungetr work (line cannot have changed) + s.source.r0-- // make next ungetr work (line cannot have changed) } s.ungetr() s.tok = _Dot @@ -274,7 +275,9 @@ redo: default: s.tok = 0 - panic(0) + fmt.Printf("invalid rune %q\n", c) + panic("invalid rune") + goto redo } return @@ -357,11 +360,12 @@ func (s *scanner) number(c rune) { if c == 'x' || c == 'X' { // hex c = s.getr() - pos := s.source.pos + hasDigit := false for isDigit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' { c = s.getr() + hasDigit = true } - if pos == s.source.pos { + if !hasDigit { panic("malformed hex constant") } s.ungetr() @@ -500,17 +504,16 @@ var pragmas = map[string]bool{ func (s *scanner) lineComment() { // recognize pragmas - start := s.source.pos r := s.getr() switch r { case 'g': r = s.match(r, "go:") if r < 0 { - m := string(s.buf[start+3 : s.source.pos]) - if pragmas[m] { - // TODO(gri) record pragma - //println(m) - } + // m := string(s.buf[start+3 : s.source.pos()]) + // if pragmas[m] { + // // TODO(gri) record pragma + // //println(m) + // } return } case 'l': diff --git a/src/cmd/compile/internal/syntax/scanner_test.go b/src/cmd/compile/internal/syntax/scanner_test.go index 0e90812d4b..9268cb35e0 100644 --- a/src/cmd/compile/internal/syntax/scanner_test.go +++ b/src/cmd/compile/internal/syntax/scanner_test.go @@ -6,15 +6,17 @@ package syntax import ( "fmt" - "io/ioutil" + "os" "testing" ) func TestScanner(t *testing.T) { - src, err := ioutil.ReadFile("parser.go") + src, err := os.Open("parser.go") if err != nil { t.Fatal(err) } + defer src.Close() + var s scanner s.init(src) for { diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go index 8fd7687da2..07b59ab298 100644 --- a/src/cmd/compile/internal/syntax/source.go +++ b/src/cmd/compile/internal/syntax/source.go @@ -9,304 +9,141 @@ import ( "unicode/utf8" ) +// buf [...read...|...|...unread...|s|...free...] +// ^ ^ ^ ^ +// | | | | +// suf r0 r w + type source struct { src io.Reader - end int - buf []byte - litbuf []byte - pos, line int - oldpos, oldline int - pin int -} + // source buffer + buf [4 << 10]byte + offs int // source offset of buf + r0, r, w int // previous/current read and write buf positions, excluding sentinel + line0, line int // previous/current line + err error // pending io error -func (s *source) init(src []byte) { - s.buf = append(src, utf8.RuneSelf) // terminate with sentinel - s.pos = 0 - s.line = 1 - s.oldline = 1 -} - -func (s *source) ungetr() { - s.pos, s.line = s.oldpos, s.oldline + // literal buffer + lit []byte // literal prefix + suf int // literal suffix; suf >= 0 means we are scanning a literal } -func (s *source) getr() rune { -redo: - s.oldpos, s.oldline = s.pos, s.line - - // common case: 7bit ASCII - if b := s.buf[s.pos]; b < utf8.RuneSelf { - s.pos++ - if b == 0 { - panic("invalid NUL byte") - goto redo // (or return 0?) - } - if b == '\n' { - s.line++ - } - return rune(b) - } - - // uncommon case: not ASCII or not enough bytes - r, w := utf8.DecodeRune(s.buf[s.pos:]) - s.pos += w - if r == utf8.RuneError && w == 1 { - if s.pos >= len(s.buf) { - s.ungetr() // so next getr also returns EOF - return -1 // EOF - } - panic("invalid Unicode character") - goto redo - } +func (s *source) init(src io.Reader) { + s.src = src + s.buf[0] = utf8.RuneSelf // terminate with sentinel + s.offs = 0 + s.r0, s.r, s.w = 0, 0, 0 + s.line0, s.line = 1, 1 + s.err = nil - // BOM's are only allowed as the first character in a file - const BOM = 0xfeff - if r == BOM && s.oldpos > 0 { - panic("invalid BOM in the middle of the file") - goto redo - } - - return r + s.lit = s.lit[:0] + s.suf = -1 } -// TODO(gri) enable this one -func (s *source) getr_() rune { -redo: - s.oldpos, s.oldline = s.pos, s.line - - // common case: 7bit ASCII - if b := s.buf[s.pos]; b < utf8.RuneSelf { - s.pos++ - if b == 0 { - panic("invalid NUL byte") - goto redo // (or return 0?) - } - if b == '\n' { - s.line++ - } - return rune(b) - } - - // uncommon case: not ASCII or not enough bytes - r, w := utf8.DecodeRune(s.buf[s.pos:s.end]) - if r == utf8.RuneError && w == 1 { - if s.refill() { - goto redo - } - // TODO(gri) carefull: this depends on whether s.end includes sentinel or not - if s.pos < s.end { - panic("invalid Unicode character") - goto redo - } - // EOF - return -1 - } - - s.pos += w - - // BOM's are only allowed as the first character in a file - const BOM = 0xfeff - if r == BOM && s.oldpos > 0 { - panic("invalid BOM in the middle of the file") - goto redo - } - - return r +func (s *source) pos() int { + return s.offs + s.r } -func (s *source) refill() bool { - for s.pos+utf8.UTFMax > s.end && !utf8.FullRune(s.buf[s.pos:s.end]) { - // not enough bytes - - // save literal prefix if any - if s.pin >= 0 { - s.litbuf = append(s.litbuf, s.buf[s.pin:s.oldpos]...) - s.pin = 0 - } +func (s *source) ungetr() { + s.r, s.line = s.r0, s.line0 +} - // move unread bytes to beginning of buffer - copy(s.buf[0:], s.buf[s.oldpos:s.end]) - // read more bytes - // (an io.Reader must return io.EOF when it reaches - // the end of what it is reading - simply returning - // n == 0 will make this loop retry forever; but the - // error is in the reader implementation in that case) - // TODO(gri) check for it and return io.ErrNoProgress? - // (see also bufio.go:666) - i := s.end - s.oldpos - n, err := s.src.Read(s.buf[i : len(s.buf)-1]) - s.pos -= s.oldpos - s.oldpos = 0 - s.end = i + n - s.buf[s.end] = utf8.RuneSelf // sentinel - if err != nil { - if s.pos == s.end { - return false // EOF +func (s *source) getr() rune { + for { + s.r0, s.line0 = s.r, s.line + + // common case: ASCII and enough bytes + if b := s.buf[s.r]; b < utf8.RuneSelf { + s.r++ + if b == 0 { + panic("invalid NUL character") + continue } - if err != io.EOF { - panic(err) // TODO(gri) fix this + if b == '\n' { + s.line++ } - // If err == EOF, we won't be getting more - // bytes; break to avoid infinite loop. If - // err is something else, we don't know if - // we can get more bytes; thus also break. - break - } - } - return true -} - -func (s *source) startLit() { - s.litbuf = s.litbuf[:0] - s.pin = s.oldpos -} - -func (s *source) stopLit() string { - return string(s.buf[s.pin:s.pos]) - - lit := s.buf[s.pin:s.pos] - s.pin = -1 - if len(s.litbuf) > 0 { - s.litbuf = append(s.litbuf, lit...) - lit = s.litbuf - } - - return string(lit) -} - -/* -// getr reads and returns the next Unicode character. It is designed such -// that only a minimal amount of work needs to be done in the common ASCII -// case (a single test to check for both ASCII and end-of-buffer, and one -// test each to check for NUL and to count newlines). -func (s *scanner) getr() rune { - // unread rune != 0 available - if r := s.peekr1; r != 0 { - s.peekr1 = s.peekr2 - s.peekr2 = 0 - if r == '\n' && importpkg == nil { - lexlineno++ - } - return r - } - -redo: - // common case: 7bit ASCII - if b := s.buf[s.pos]; b < utf8.RuneSelf { - s.pos++ - if b == 0 { - // TODO(gri) do we need lineno = lexlineno here? - Yyerror("illegal NUL byte") - return 0 - } - if b == '\n' && importpkg == nil { - lexlineno++ + return rune(b) + } + + // uncommon case: not ASCII or not enough bytes + r, w := utf8.DecodeRune(s.buf[s.r:s.w]) // optimistically assume valid rune + if r != utf8.RuneError || w > 1 { + s.r += w + // BOM's are only allowed as the first character in a file + const BOM = 0xfeff + if r == BOM && s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1) + panic("invalid BOM in the middle of the file") + continue + } + return r } - return rune(b) - } - // uncommon case: not ASCII or not enough bytes - for s.pos+utf8.UTFMax > s.end && !utf8.FullRune(s.buf[s.pos:s.end]) { - // not enough bytes: read some more, but first - // move unread bytes to beginning of buffer - copy(s.buf[0:], s.buf[s.pos:s.end]) - // read more bytes - // (an io.Reader must return io.EOF when it reaches - // the end of what it is reading - simply returning - // n == 0 will make this loop retry forever; but the - // error is in the reader implementation in that case) - // TODO(gri) check for it an return io.ErrNoProgress? - // (see also bufio.go:666) - i := s.end - s.pos - n, err := s.src.Read(s.buf[i : len(s.buf)-1]) - s.pos = 0 - s.end = i + n - s.buf[s.end] = utf8.RuneSelf // sentinel - if err != nil { - if s.end == 0 { - return EOF + if w == 0 && s.err != nil { + if s.err != io.EOF { + panic(s.err) } - if err != io.EOF { - panic(err) // TODO(gri) fix this - } - // If err == EOF, we won't be getting more - // bytes; break to avoid infinite loop. If - // err is something else, we don't know if - // we can get more bytes; thus also break. - break + return -1 } - } - // we have at least one byte (excluding sentinel) - // common case: 7bit ASCII - if b := s.buf[s.pos]; b < utf8.RuneSelf { - s.pos++ - if b == 0 { - // TODO(gri) do we need lineno = lexlineno here? - Yyerror("illegal NUL byte") - return 0 - } - if b == '\n' && importpkg == nil { - lexlineno++ + if w == 1 && (s.r+utf8.UTFMax <= s.w || utf8.FullRune(s.buf[s.r:s.w])) { + s.r++ + panic("invalid UTF-8 encoding") + continue } - return rune(b) - } - // uncommon case: not ASCII - r, w := utf8.DecodeRune(s.buf[s.pos:s.end]) - s.pos += w - if r == utf8.RuneError && w == 1 { - lineno = lexlineno - // The string conversion here makes a copy for passing - // to fmt.Printf, so that buf itself does not escape and - // can be allocated on the stack. - Yyerror("illegal UTF-8 sequence %x", r) + s.fill() } +} - if r == BOM { - yyerrorl(int(lexlineno), "Unicode (UTF-8) BOM in middle of file") - goto redo +func (s *source) fill() { + // Slide unread bytes to beginning but preserve last read char + // (for one ungetr call) plus one extra byte (for a 2nd ungetr + // call, only for ".." character sequence). + if s.r0 > 1 { + // save literal prefix, if any + // (We see at most one ungetr call while reading + // a literal, so make sure s.r0 remains in buf.) + if s.suf >= 0 { + s.lit = append(s.lit, s.buf[s.suf:s.r0]...) + s.suf = 1 // == s.r0 after slide below + } + s.offs += s.r0 - 1 + r := s.r - s.r0 + 1 // last read char plus one byte + s.w = r + copy(s.buf[r:], s.buf[s.r:s.w]) + s.r = r + s.r0 = 1 + } + + // read more data: try a limited number of times + for i := 100; i > 0; i-- { + n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel + if n < 0 { + panic("negative read") + } + s.w += n + if n > 0 || err != nil { + s.buf[s.w] = utf8.RuneSelf // sentinel + if err != nil { + s.err = err + } + return + } } - return r -} - -// pos returns the position of the most recently read character s.ch. -func (s *Scanner) pos() Offset { - // TODO(gri) consider replacing lastCharLen with chPos or equivalent - return Offset(s.srcBufOffset + s.srcPos - s.chLen) + panic("no progress") } -func (s *Scanner) startLiteral() { - s.symBuf = s.symBuf[:0] - s.symPos = s.srcPos - s.chLen +func (s *source) startLit() { + s.suf = s.r0 + s.lit = s.lit[:0] // reuse lit } -func (s *Scanner) stopLiteral(stripCR bool) string { - symEnd := s.srcPos - s.chLen - - lit := s.srcBuf[s.symPos:symEnd] - s.symPos = -1 - if len(s.symBuf) > 0 { - // part of the symbol text was saved in symBuf: save the rest in - // symBuf as well and return its content - s.symBuf = append(s.symBuf, lit...) - lit = s.symBuf - } - - if stripCR { - c := make([]byte, len(lit)) - i := 0 - for _, ch := range lit { - if ch != '\r' { - c[i] = ch - i++ - } - } - lit = c[:i] +func (s *source) stopLit() string { + lit := s.buf[s.suf:s.r] + if len(s.lit) > 0 { + lit = append(s.lit, lit...) } - + s.suf = -1 // no pending literal return string(lit) } -*/ diff --git a/src/cmd/compile/internal/syntax/syntax.go b/src/cmd/compile/internal/syntax/syntax.go index 8dde0703a3..4df2eb5fe2 100644 --- a/src/cmd/compile/internal/syntax/syntax.go +++ b/src/cmd/compile/internal/syntax/syntax.go @@ -7,7 +7,7 @@ package syntax import ( "fmt" "io" - "io/ioutil" + "os" ) type Mode uint @@ -15,14 +15,32 @@ type Mode uint // TODO(gri) These need a lot more work. func ReadFile(filename string, mode Mode) (*File, error) { - src, err := ioutil.ReadFile(filename) + src, err := os.Open(filename) if err != nil { return nil, err } - return ReadBytes(src, mode) + defer src.Close() + return Read(src, mode) +} + +type bytesReader struct { + data []byte +} + +func (r *bytesReader) Read(p []byte) (int, error) { + if len(r.data) > 0 { + n := copy(p, r.data) + r.data = r.data[n:] + return n, nil + } + return 0, io.EOF } func ReadBytes(src []byte, mode Mode) (*File, error) { + return Read(&bytesReader{src}, mode) +} + +func Read(src io.Reader, mode Mode) (*File, error) { var p parser p.init(src) @@ -41,14 +59,6 @@ func ReadBytes(src []byte, mode Mode) (*File, error) { return ast, nil } -func Read(r io.Reader, mode Mode) (*File, error) { - src, err := ioutil.ReadAll(r) - if err != nil { - return nil, err - } - return ReadBytes(src, mode) -} - func Write(w io.Writer, n *File) error { panic("unimplemented") } -- cgit v1.2.1