diff options
Diffstat (limited to 'libgo/go/regexp')
-rw-r--r-- | libgo/go/regexp/all_test.go | 15 | ||||
-rw-r--r-- | libgo/go/regexp/onepass.go | 7 | ||||
-rw-r--r-- | libgo/go/regexp/regexp.go | 2 | ||||
-rw-r--r-- | libgo/go/regexp/syntax/doc.go | 48 | ||||
-rw-r--r-- | libgo/go/regexp/syntax/parse.go | 41 | ||||
-rw-r--r-- | libgo/go/regexp/syntax/parse_test.go | 13 | ||||
-rw-r--r-- | libgo/go/regexp/syntax/regexp.go | 2 |
7 files changed, 95 insertions, 33 deletions
diff --git a/libgo/go/regexp/all_test.go b/libgo/go/regexp/all_test.go index 301a1dfcd83..01ea3742a8b 100644 --- a/libgo/go/regexp/all_test.go +++ b/libgo/go/regexp/all_test.go @@ -6,6 +6,7 @@ package regexp import ( "reflect" + "regexp/syntax" "strings" "testing" ) @@ -473,9 +474,19 @@ func TestSplit(t *testing.T) { } } -// This ran out of stack before issue 7608 was fixed. +// Check that one-pass cutoff does trigger. func TestOnePassCutoff(t *testing.T) { - MustCompile(`^(?:x{1,1000}){1,1000}$`) + re, err := syntax.Parse(`^x{1,1000}y{1,1000}$`, syntax.Perl) + if err != nil { + t.Fatalf("parse: %v", err) + } + p, err := syntax.Compile(re.Simplify()) + if err != nil { + t.Fatalf("compile: %v", err) + } + if compileOnePass(p) != notOnePass { + t.Fatalf("makeOnePass succeeded; wanted notOnePass") + } } func BenchmarkLiteral(b *testing.B) { diff --git a/libgo/go/regexp/onepass.go b/libgo/go/regexp/onepass.go index 501fb28af66..e6f42856387 100644 --- a/libgo/go/regexp/onepass.go +++ b/libgo/go/regexp/onepass.go @@ -1,4 +1,6 @@ // Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. package regexp @@ -9,9 +11,6 @@ import ( "unicode" ) -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - // "One-pass" regexp execution. // Some regexps can be analyzed to determine that they never need // backtracking: they are guaranteed to run in one pass over the string @@ -484,7 +483,7 @@ func makeOnePass(p *onePassProg) *onePassProg { } } if p != notOnePass { - for i, _ := range p.Inst { + for i := range p.Inst { p.Inst[i].Rune = onePassRunes[i] } } diff --git a/libgo/go/regexp/regexp.go b/libgo/go/regexp/regexp.go index 0b8336a04fb..b615acdf0e5 100644 --- a/libgo/go/regexp/regexp.go +++ b/libgo/go/regexp/regexp.go @@ -452,7 +452,7 @@ func (re *Regexp) ReplaceAllString(src, repl string) string { return string(b) } -// ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp +// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp // with the replacement string repl. The replacement repl is substituted directly, // without using Expand. func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { diff --git a/libgo/go/regexp/syntax/doc.go b/libgo/go/regexp/syntax/doc.go index 8e72c90d3eb..e5e71f14f59 100644 --- a/libgo/go/regexp/syntax/doc.go +++ b/libgo/go/regexp/syntax/doc.go @@ -21,8 +21,8 @@ Single characters: [^xyz] negated character class \d Perl character class \D negated Perl character class - [:alpha:] ASCII character class - [:^alpha:] negated ASCII character class + [[:alpha:]] ASCII character class + [[:^alpha:]] negated ASCII character class \pN Unicode character class (one-letter name) \p{Greek} Unicode character class \PN negated Unicode character class (one-letter name) @@ -46,14 +46,14 @@ Repetitions: x{n,}? n or more x, prefer fewer x{n}? exactly n x -Implementation restriction: The counting forms x{n} etc. (but not the other -forms x* etc.) have an upper limit of n=1000. Negative or higher explicit -counts yield the parse error ErrInvalidRepeatSize. +Implementation restriction: The counting forms x{n,m}, x{n,}, and x{n} +reject forms that create a minimum or maximum repetition count above 1000. +Unlimited repetitions are not subject to this restriction. Grouping: (re) numbered capturing group (submatch) (?P<name>re) named & numbered capturing group (submatch) - (?:re) non-capturing group (submatch) + (?:re) non-capturing group (?flags) set flags within current group; non-capturing (?flags:re) set flags during re; non-capturing @@ -69,7 +69,7 @@ Empty strings: $ at end of text (like \z not \Z) or line (flag m=true) \A at beginning of text \b at ASCII word boundary (\w on one side and \W, \A, or \z on the other) - \B not an ASCII word boundary + \B not at ASCII word boundary \z at end of text Escape sequences: @@ -103,29 +103,29 @@ Named character classes as character class elements: [\p{Name}] named Unicode property inside character class (== \p{Name}) [^\p{Name}] named Unicode property inside negated character class (== \P{Name}) -Perl character classes: +Perl character classes (all ASCII-only): \d digits (== [0-9]) \D not digits (== [^0-9]) \s whitespace (== [\t\n\f\r ]) \S not whitespace (== [^\t\n\f\r ]) - \w ASCII word characters (== [0-9A-Za-z_]) - \W not ASCII word characters (== [^0-9A-Za-z_]) + \w word characters (== [0-9A-Za-z_]) + \W not word characters (== [^0-9A-Za-z_]) ASCII character classes: - [:alnum:] alphanumeric (== [0-9A-Za-z]) - [:alpha:] alphabetic (== [A-Za-z]) - [:ascii:] ASCII (== [\x00-\x7F]) - [:blank:] blank (== [\t ]) - [:cntrl:] control (== [\x00-\x1F\x7F]) - [:digit:] digits (== [0-9]) - [:graph:] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) - [:lower:] lower case (== [a-z]) - [:print:] printable (== [ -~] == [ [:graph:]]) - [:punct:] punctuation (== [!-/:-@[-`{-~]) - [:space:] whitespace (== [\t\n\v\f\r ]) - [:upper:] upper case (== [A-Z]) - [:word:] word characters (== [0-9A-Za-z_]) - [:xdigit:] hex digit (== [0-9A-Fa-f]) + [[:alnum:]] alphanumeric (== [0-9A-Za-z]) + [[:alpha:]] alphabetic (== [A-Za-z]) + [[:ascii:]] ASCII (== [\x00-\x7F]) + [[:blank:]] blank (== [\t ]) + [[:cntrl:]] control (== [\x00-\x1F\x7F]) + [[:digit:]] digits (== [0-9]) + [[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]) + [[:lower:]] lower case (== [a-z]) + [[:print:]] printable (== [ -~] == [ [:graph:]]) + [[:punct:]] punctuation (== [!-/:-@[-`{-~]) + [[:space:]] whitespace (== [\t\n\v\f\r ]) + [[:upper:]] upper case (== [A-Z]) + [[:word:]] word characters (== [0-9A-Za-z_]) + [[:xdigit:]] hex digit (== [0-9A-Fa-f]) */ package syntax diff --git a/libgo/go/regexp/syntax/parse.go b/libgo/go/regexp/syntax/parse.go index cb25dca3956..d579a4069b1 100644 --- a/libgo/go/regexp/syntax/parse.go +++ b/libgo/go/regexp/syntax/parse.go @@ -244,6 +244,7 @@ func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) ( if sub.Op >= opPseudo { return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} } + re := p.newRegexp(op) re.Min = min re.Max = max @@ -251,9 +252,47 @@ func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) ( re.Sub = re.Sub0[:1] re.Sub[0] = sub p.stack[n-1] = re + + if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { + return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} + } + return after, nil } +// repeatIsValid reports whether the repetition re is valid. +// Valid means that the combination of the top-level repetition +// and any inner repetitions does not exceed n copies of the +// innermost thing. +// This function rewalks the regexp tree and is called for every repetition, +// so we have to worry about inducing quadratic behavior in the parser. +// We avoid this by only calling repeatIsValid when min or max >= 2. +// In that case the depth of any >= 2 nesting can only get to 9 without +// triggering a parse error, so each subtree can only be rewalked 9 times. +func repeatIsValid(re *Regexp, n int) bool { + if re.Op == OpRepeat { + m := re.Max + if m == 0 { + return true + } + if m < 0 { + m = re.Min + } + if m > n { + return false + } + if m > 0 { + n /= m + } + } + for _, sub := range re.Sub { + if !repeatIsValid(sub, n) { + return false + } + } + return true +} + // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. func (p *parser) concat() *Regexp { p.maybeConcat(-1, 0) @@ -1639,7 +1678,7 @@ const ( // minimum and maximum runes involved in folding. // checked during test. minFold = 0x0041 - maxFold = 0x1044f + maxFold = 0x118df ) // appendFoldedRange returns the result of appending the range lo-hi diff --git a/libgo/go/regexp/syntax/parse_test.go b/libgo/go/regexp/syntax/parse_test.go index f3089294c6a..c4a1117ff86 100644 --- a/libgo/go/regexp/syntax/parse_test.go +++ b/libgo/go/regexp/syntax/parse_test.go @@ -200,6 +200,10 @@ var parseTests = []parseTest{ `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, {`x{2}y|x{2}[0-9]y`, `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, + + // Valid repetitions. + {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, + {`((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})`, ``}, } const testFlags = MatchNL | PerlX | UnicodeGroups @@ -262,6 +266,10 @@ func testParseDump(t *testing.T, tests []parseTest, flags Flags) { t.Errorf("Parse(%#q): %v", tt.Regexp, err) continue } + if tt.Dump == "" { + // It parsed. That's all we care about. + continue + } d := dump(re) if d != tt.Dump { t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) @@ -470,6 +478,7 @@ var invalidRegexps = []string{ `(?i)[a-Z]`, `a{100000}`, `a{100000,}`, + "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", } var onlyPerl = []string{ @@ -527,6 +536,10 @@ func TestToStringEquivalentParse(t *testing.T) { t.Errorf("Parse(%#q): %v", tt.Regexp, err) continue } + if tt.Dump == "" { + // It parsed. That's all we care about. + continue + } d := dump(re) if d != tt.Dump { t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) diff --git a/libgo/go/regexp/syntax/regexp.go b/libgo/go/regexp/syntax/regexp.go index 329a90e0129..cea7d9e04fe 100644 --- a/libgo/go/regexp/syntax/regexp.go +++ b/libgo/go/regexp/syntax/regexp.go @@ -39,7 +39,7 @@ const ( OpEmptyMatch // matches empty string OpLiteral // matches Runes sequence OpCharClass // matches Runes interpreted as range pair list - OpAnyCharNotNL // matches any character + OpAnyCharNotNL // matches any character except newline OpAnyChar // matches any character OpBeginLine // matches empty string at beginning of line OpEndLine // matches empty string at end of line |