diff options
author | ian <ian@138bc75d-0d04-0410-961f-82ee72b054a4> | 2010-12-03 04:34:57 +0000 |
---|---|---|
committer | ian <ian@138bc75d-0d04-0410-961f-82ee72b054a4> | 2010-12-03 04:34:57 +0000 |
commit | e440a3286bc89368b8d3a8fd6accd47191790bf2 (patch) | |
tree | 38fe54a4f38ede5d949c915d66191f24a6fe5153 /libgo/go/regexp | |
parent | a641ee368e2614349084a9a7bda2ec2b0b2bc1cf (diff) | |
download | gcc-e440a3286bc89368b8d3a8fd6accd47191790bf2.tar.gz |
Add Go frontend, libgo library, and Go testsuite.
gcc/:
* gcc.c (default_compilers): Add entry for ".go".
* common.opt: Add -static-libgo as a driver option.
* doc/install.texi (Configuration): Mention libgo as an option for
--enable-shared. Mention go as an option for --enable-languages.
* doc/invoke.texi (Overall Options): Mention .go as a file name
suffix. Mention go as a -x option.
* doc/frontends.texi (G++ and GCC): Mention Go as a supported
language.
* doc/sourcebuild.texi (Top Level): Mention libgo.
* doc/standards.texi (Standards): Add section on Go language.
Move references for other languages into their own section.
* doc/contrib.texi (Contributors): Mention that I contributed the
Go frontend.
gcc/testsuite/:
* lib/go.exp: New file.
* lib/go-dg.exp: New file.
* lib/go-torture.exp: New file.
* lib/target-supports.exp (check_compile): Match // Go.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@167407 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgo/go/regexp')
-rw-r--r-- | libgo/go/regexp/all_test.go | 362 | ||||
-rw-r--r-- | libgo/go/regexp/find_test.go | 454 | ||||
-rw-r--r-- | libgo/go/regexp/regexp.go | 1400 |
3 files changed, 2216 insertions, 0 deletions
diff --git a/libgo/go/regexp/all_test.go b/libgo/go/regexp/all_test.go new file mode 100644 index 00000000000..d5a0e7da6ad --- /dev/null +++ b/libgo/go/regexp/all_test.go @@ -0,0 +1,362 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "os" + "strings" + "testing" +) + +var good_re = []string{ + ``, + `.`, + `^.$`, + `a`, + `a*`, + `a+`, + `a?`, + `a|b`, + `a*|b*`, + `(a*|b)(c*|d)`, + `[a-z]`, + `[a-abc-c\-\]\[]`, + `[a-z]+`, + `[]`, + `[abc]`, + `[^1234]`, + `[^\n]`, + `\!\\`, +} + +type stringError struct { + re string + err os.Error +} + +var bad_re = []stringError{ + {`*`, ErrBareClosure}, + {`(abc`, ErrUnmatchedLpar}, + {`abc)`, ErrUnmatchedRpar}, + {`x[a-z`, ErrUnmatchedLbkt}, + {`abc]`, ErrUnmatchedRbkt}, + {`[z-a]`, ErrBadRange}, + {`abc\`, ErrExtraneousBackslash}, + {`a**`, ErrBadClosure}, + {`a*+`, ErrBadClosure}, + {`a??`, ErrBadClosure}, + {`*`, ErrBareClosure}, + {`\x`, ErrBadBackslash}, +} + +func compileTest(t *testing.T, expr string, error os.Error) *Regexp { + re, err := Compile(expr) + if err != error { + t.Error("compiling `", expr, "`; unexpected error: ", err.String()) + } + return re +} + +func TestGoodCompile(t *testing.T) { + for i := 0; i < len(good_re); i++ { + compileTest(t, good_re[i], nil) + } +} + +func TestBadCompile(t *testing.T) { + for i := 0; i < len(bad_re); i++ { + compileTest(t, bad_re[i].re, bad_re[i].err) + } +} + +func matchTest(t *testing.T, test *FindTest) { + re := compileTest(t, test.pat, nil) + if re == nil { + return + } + m := re.MatchString(test.text) + if m != (len(test.matches) > 0) { + t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } + // now try bytes + m = re.Match([]byte(test.text)) + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } +} + +func TestMatch(t *testing.T) { + for _, test := range findTests { + matchTest(t, &test) + } +} + +func matchFunctionTest(t *testing.T, test *FindTest) { + m, err := MatchString(test.pat, test.text) + if err == nil { + return + } + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } +} + +func TestMatchFunction(t *testing.T) { + for _, test := range findTests { + matchFunctionTest(t, &test) + } +} + +type ReplaceTest struct { + pattern, replacement, input, output string +} + +var replaceTests = []ReplaceTest{ + // Test empty input and/or replacement, with pattern that matches the empty string. + {"", "", "", ""}, + {"", "x", "", "x"}, + {"", "", "abc", "abc"}, + {"", "x", "abc", "xaxbxcx"}, + + // Test empty input and/or replacement, with pattern that does not match the empty string. + {"b", "", "", ""}, + {"b", "x", "", ""}, + {"b", "", "abc", "ac"}, + {"b", "x", "abc", "axc"}, + {"y", "", "", ""}, + {"y", "x", "", ""}, + {"y", "", "abc", "abc"}, + {"y", "x", "abc", "abc"}, + + // Multibyte characters -- verify that we don't try to match in the middle + // of a character. + {"[a-c]*", "x", "\u65e5", "x\u65e5x"}, + {"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, + + // Start and end of a string. + {"^[a-c]*", "x", "abcdabc", "xdabc"}, + {"[a-c]*$", "x", "abcdabc", "abcdx"}, + {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]*", "x", "abc", "x"}, + {"[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*", "x", "dabce", "xdabce"}, + {"[a-c]*$", "x", "dabce", "dabcex"}, + {"^[a-c]*$", "x", "dabce", "dabce"}, + {"^[a-c]*", "x", "", "x"}, + {"[a-c]*$", "x", "", "x"}, + {"^[a-c]*$", "x", "", "x"}, + + {"^[a-c]+", "x", "abcdabc", "xdabc"}, + {"[a-c]+$", "x", "abcdabc", "abcdx"}, + {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]+", "x", "abc", "x"}, + {"[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+", "x", "dabce", "dabce"}, + {"[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+", "x", "", ""}, + {"[a-c]+$", "x", "", ""}, + {"^[a-c]+$", "x", "", ""}, + + // Other cases. + {"abc", "def", "abcdefg", "defdefg"}, + {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, + {"abc", "", "abcdabc", "d"}, + {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, + {"abc", "d", "", ""}, + {"abc", "d", "abc", "d"}, + {".+", "x", "abc", "x"}, + {"[a-c]*", "x", "def", "xdxexfx"}, + {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, + {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, +} + +type ReplaceFuncTest struct { + pattern string + replacement func(string) string + input, output string +} + +var replaceFuncTests = []ReplaceFuncTest{ + {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, + {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, + {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, +} + +func TestReplaceAll(t *testing.T) { + for _, tc := range replaceTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllString(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.Replace(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) + if actual != tc.output { + t.Errorf("%q.Replace(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +func TestReplaceAllFunc(t *testing.T) { + for _, tc := range replaceFuncTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllStringFunc(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) })) + if actual != tc.output { + t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +type QuoteMetaTest struct { + pattern, output string +} + +var quoteMetaTests = []QuoteMetaTest{ + {``, ``}, + {`foo`, `foo`}, + {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[{\]}\\\|,<\.>/\?~`}, +} + +func TestQuoteMeta(t *testing.T) { + for _, tc := range quoteMetaTests { + // Verify that QuoteMeta returns the expected string. + quoted := QuoteMeta(tc.pattern) + if quoted != tc.output { + t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`", + tc.pattern, quoted, tc.output) + continue + } + + // Verify that the quoted string is in fact treated as expected + // by Compile -- i.e. that it matches the original, unquoted string. + if tc.pattern != "" { + re, err := Compile(quoted) + if err != nil { + t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err) + continue + } + src := "abc" + tc.pattern + "def" + repl := "xyz" + replaced := re.ReplaceAllString(src, repl) + expected := "abcxyzdef" + if replaced != expected { + t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`", + tc.pattern, src, repl, replaced, expected) + } + } + } +} + +type numSubexpCase struct { + input string + expected int +} + +var numSubexpCases = []numSubexpCase{ + {``, 0}, + {`.*`, 0}, + {`abba`, 0}, + {`ab(b)a`, 1}, + {`ab(.*)a`, 1}, + {`(.*)ab(.*)a`, 2}, + {`(.*)(ab)(.*)a`, 3}, + {`(.*)((a)b)(.*)a`, 4}, + {`(.*)(\(ab)(.*)a`, 3}, + {`(.*)(\(a\)b)(.*)a`, 3}, +} + +func TestNumSubexp(t *testing.T) { + for _, c := range numSubexpCases { + re := MustCompile(c.input) + n := re.NumSubexp() + if n != c.expected { + t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected) + } + } +} + +func BenchmarkLiteral(b *testing.B) { + x := strings.Repeat("x", 50) + b.StopTimer() + re := MustCompile(x) + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkNotLiteral(b *testing.B) { + x := strings.Repeat("x", 49) + b.StopTimer() + re := MustCompile("^" + x) + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkMatchClass(b *testing.B) { + b.StopTimer() + x := strings.Repeat("xxxx", 20) + "w" + re := MustCompile("[abcdw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkMatchClass_InRange(b *testing.B) { + b.StopTimer() + // 'b' is betwen 'a' and 'c', so the charclass + // range checking is no help here. + x := strings.Repeat("bbbb", 20) + "c" + re := MustCompile("[ac]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkReplaceAll(b *testing.B) { + x := "abcdefghijklmnopqrstuvwxyz" + b.StopTimer() + re := MustCompile("[cjrw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.ReplaceAllString(x, "") + } +} diff --git a/libgo/go/regexp/find_test.go b/libgo/go/regexp/find_test.go new file mode 100644 index 00000000000..07f5586f2b3 --- /dev/null +++ b/libgo/go/regexp/find_test.go @@ -0,0 +1,454 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "fmt" + "testing" +) + +// For each pattern/text pair, what is the expected output of each function? +// We can derive the textual results from the indexed results, the non-submatch +// results from the submatched results, the single results from the 'all' results, +// and the byte results from the string results. Therefore the table includes +// only the FindAllStringSubmatchIndex result. +type FindTest struct { + pat string + text string + matches [][]int +} + +func (t FindTest) String() string { + return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text) +} + +var findTests = []FindTest{ + {``, ``, build(1, 0, 0)}, + {`^abcdefg`, "abcdefg", build(1, 0, 7)}, + {`a+`, "baaab", build(1, 1, 4)}, + {"abcd..", "abcdef", build(1, 0, 6)}, + {`a`, "a", build(1, 0, 1)}, + {`x`, "y", nil}, + {`b`, "abc", build(1, 1, 2)}, + {`.`, "a", build(1, 0, 1)}, + {`.*`, "abcdef", build(1, 0, 6)}, + {`^`, "abcde", build(1, 0, 0)}, + {`$`, "abcde", build(1, 5, 5)}, + {`^abcd$`, "abcd", build(1, 0, 4)}, + {`^bcd'`, "abcdef", nil}, + {`^abcd$`, "abcde", nil}, + {`a+`, "baaab", build(1, 1, 4)}, + {`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)}, + {`[a-z]+`, "abcd", build(1, 0, 4)}, + {`[^a-z]+`, "ab1234cd", build(1, 2, 6)}, + {`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)}, + {`[^\n]+`, "abcd\n", build(1, 0, 4)}, + {`[日本語]+`, "日本語日本語", build(1, 0, 18)}, + {`日本語+`, "日本語", build(1, 0, 9)}, + {`日本語+`, "日本語語語語", build(1, 0, 18)}, + {`()`, "", build(1, 0, 0, 0, 0)}, + {`(a)`, "a", build(1, 0, 1, 0, 1)}, + {`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)}, + {`(.*)`, "", build(1, 0, 0, 0, 0)}, + {`(.*)`, "abcd", build(1, 0, 4, 0, 4)}, + {`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)}, + {`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)}, + {`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)}, + {`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)}, + {`\a\b\f\n\r\t\v`, "\a\b\f\n\r\t\v", build(1, 0, 7)}, + {`[\a\b\f\n\r\t\v]+`, "\a\b\f\n\r\t\v", build(1, 0, 7)}, + + {`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)}, + {`(.*).*`, "ab", build(1, 0, 2, 0, 2)}, + {`[.]`, ".", build(1, 0, 1)}, + {`/$`, "/abc/", build(1, 4, 5)}, + {`/$`, "/abc", nil}, + + // multiple matches + {`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)}, + {`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)}, + {`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)}, + {`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)}, + {`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)}, + + // fixed bugs + {`ab$`, "cab", build(1, 1, 3)}, + {`axxb$`, "axxcb", nil}, + {`data`, "daXY data", build(1, 5, 9)}, + {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, + + // can backslash-escape any punctuation + {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {"\\`", "`", build(1, 0, 1)}, + {"[\\`]+", "`", build(1, 0, 1)}, + + // long set of matches (longer than startSize) + { + ".", + "qwertyuiopasdfghjklzxcvbnm1234567890", + build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, + 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, + 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), + }, +} + +// build is a helper to construct a [][]int by extracting n sequences from x. +// This represents n matches with len(x)/n submatches each. +func build(n int, x ...int) [][]int { + ret := make([][]int, n) + runLength := len(x) / n + j := 0 + for i := range ret { + ret[i] = make([]int, runLength) + copy(ret[i], x[j:]) + j += runLength + if j > len(x) { + panic("invalid build entry") + } + } + return ret +} + +// First the simple cases. + +func TestFind(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).Find([]byte(test.text)) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != string(result) { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func TestFindString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindString(test.text) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != "": + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == "": + // Tricky because an empty result has two meanings: no match or empty match. + if test.matches[0][0] != test.matches[0][1] { + t.Errorf("expected match; got none: %s", test) + } + case test.matches != nil && result != "": + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != result { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func testFindIndex(test *FindTest, result []int, t *testing.T) { + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.matches[0] + if expect[0] != result[0] || expect[1] != result[1] { + t.Errorf("expected %v got %v: %s", expect, result, test) + } + } +} + +func TestFindIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) + } +} + +func TestFindStringIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) + } +} + +// Now come the simple All cases. + +func TestFindAll(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAll([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != string(result[k]) { + t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test) + } + } + } + } +} + +func TestFindAllString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllString(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != result[k] { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } + } +} + +func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + return + } + for k, e := range test.matches { + if e[0] != result[k][0] || e[1] != result[k][1] { + t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) + } + } + } +} + +func TestFindAllIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) + } +} + +// Now come the Submatch cases. + +func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != nil { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != string(result[k/2]) { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindSubmatch([]byte(test.text)) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchBytes(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != "" { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != result[k/2] { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindStringSubmatch(test.text) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchString(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { + if len(expect) != len(result) { + t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) + return + } + for k, e := range expect { + if e != result[k] { + t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) + } + } +} + +func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchIndices(test, 0, test.matches[0], result, t) + } +} + +func TestFindSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t) + } +} + +func TestFindStringSubmatchndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t) + } +} + +// Now come the monster AllSubmatch cases. + +func TestFindAllSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchBytes(&test, k, match, result[k], t) + } + } + } +} + +func TestFindAllStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchString(&test, k, match, result[k], t) + } + } + } +} + +func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchIndices(test, k, match, result[k], t) + } + } +} + +func TestFindAllSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringSubmatchndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t) + } +} diff --git a/libgo/go/regexp/regexp.go b/libgo/go/regexp/regexp.go new file mode 100644 index 00000000000..d3f03ad790b --- /dev/null +++ b/libgo/go/regexp/regexp.go @@ -0,0 +1,1400 @@ +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package regexp implements a simple regular expression library. +// +// The syntax of the regular expressions accepted is: +// +// regexp: +// concatenation { '|' concatenation } +// concatenation: +// { closure } +// closure: +// term [ '*' | '+' | '?' ] +// term: +// '^' +// '$' +// '.' +// character +// '[' [ '^' ] { character-range } ']' +// '(' regexp ')' +// character-range: +// character [ '-' character ] +// +// All characters are UTF-8-encoded code points. Backslashes escape special +// characters, including inside character classes. The standard Go character +// escapes are also recognized: \a \b \f \n \r \t \v. +// +// There are 16 methods of Regexp that match a regular expression and identify +// the matched text. Their names are matched by this regular expression: +// +// Find(All)?(String)?(Submatch)?(Index)? +// +// If 'All' is present, the routine matches successive non-overlapping +// matches of the entire expression. Empty matches abutting a preceding +// match are ignored. The return value is a slice containing the successive +// return values of the corresponding non-'All' routine. These routines take +// an extra integer argument, n; if n >= 0, the function returns at most n +// matches/submatches. +// +// If 'String' is present, the argument is a string; otherwise it is a slice +// of bytes; return values are adjusted as appropriate. +// +// If 'Submatch' is present, the return value is a slice identifying the +// successive submatches of the expression. Submatches are matches of +// parenthesized subexpressions within the regular expression, numbered from +// left to right in order of opening parenthesis. Submatch 0 is the match of +// the entire expression, submatch 1 the match of the first parenthesized +// subexpression, and so on. +// +// If 'Index' is present, matches and submatches are identified by byte index +// pairs within the input string: result[2*n:2*n+1] identifies the indexes of +// the nth submatch. The pair for n==0 identifies the match of the entire +// expression. If 'Index' is not present, the match is identified by the +// text of the match/submatch. If an index is negative, it means that +// subexpression did not match any string in the input. +// +// (There are a few other methods that do not match this pattern.) +// +package regexp + +import ( + "bytes" + "io" + "os" + "strings" + "utf8" +) + +var debug = false + +// Error is the local type for a parsing error. +type Error string + +func (e Error) String() string { + return string(e) +} + +// Error codes returned by failures to parse an expression. +var ( + ErrInternal = Error("internal error") + ErrUnmatchedLpar = Error("unmatched '('") + ErrUnmatchedRpar = Error("unmatched ')'") + ErrUnmatchedLbkt = Error("unmatched '['") + ErrUnmatchedRbkt = Error("unmatched ']'") + ErrBadRange = Error("bad range in character class") + ErrExtraneousBackslash = Error("extraneous backslash") + ErrBadClosure = Error("repeated closure (**, ++, etc.)") + ErrBareClosure = Error("closure applies to nothing") + ErrBadBackslash = Error("illegal backslash escape") +) + +// An instruction executed by the NFA +type instr interface { + kind() int // the type of this instruction: _CHAR, _ANY, etc. + next() instr // the instruction to execute after this one + setNext(i instr) + index() int + setIndex(i int) + print() +} + +// Fields and methods common to all instructions +type common struct { + _next instr + _index int +} + +func (c *common) next() instr { return c._next } +func (c *common) setNext(i instr) { c._next = i } +func (c *common) index() int { return c._index } +func (c *common) setIndex(i int) { c._index = i } + +// Regexp is the representation of a compiled regular expression. +// The public interface is entirely through methods. +type Regexp struct { + expr string // the original expression + prefix string // initial plain text string + prefixBytes []byte // initial plain text bytes + inst []instr + start instr // first instruction of machine + prefixStart instr // where to start if there is a prefix + nbra int // number of brackets in expression, for subexpressions +} + +const ( + _START = iota // beginning of program + _END // end of program: success + _BOT // '^' beginning of text + _EOT // '$' end of text + _CHAR // 'a' regular character + _CHARCLASS // [a-z] character class + _ANY // '.' any character including newline + _NOTNL // [^\n] special case: any character but newline + _BRA // '(' parenthesized expression + _EBRA // ')'; end of '(' parenthesized expression + _ALT // '|' alternation + _NOP // do nothing; makes it easy to link without patching +) + +// --- START start of program +type _Start struct { + common +} + +func (start *_Start) kind() int { return _START } +func (start *_Start) print() { print("start") } + +// --- END end of program +type _End struct { + common +} + +func (end *_End) kind() int { return _END } +func (end *_End) print() { print("end") } + +// --- BOT beginning of text +type _Bot struct { + common +} + +func (bot *_Bot) kind() int { return _BOT } +func (bot *_Bot) print() { print("bot") } + +// --- EOT end of text +type _Eot struct { + common +} + +func (eot *_Eot) kind() int { return _EOT } +func (eot *_Eot) print() { print("eot") } + +// --- CHAR a regular character +type _Char struct { + common + char int +} + +func (char *_Char) kind() int { return _CHAR } +func (char *_Char) print() { print("char ", string(char.char)) } + +func newChar(char int) *_Char { + c := new(_Char) + c.char = char + return c +} + +// --- CHARCLASS [a-z] + +type _CharClass struct { + common + negate bool // is character class negated? ([^a-z]) + // slice of int, stored pairwise: [a-z] is (a,z); x is (x,x): + ranges []int + cmin, cmax int +} + +func (cclass *_CharClass) kind() int { return _CHARCLASS } + +func (cclass *_CharClass) print() { + print("charclass") + if cclass.negate { + print(" (negated)") + } + for i := 0; i < len(cclass.ranges); i += 2 { + l := cclass.ranges[i] + r := cclass.ranges[i+1] + if l == r { + print(" [", string(l), "]") + } else { + print(" [", string(l), "-", string(r), "]") + } + } +} + +func (cclass *_CharClass) addRange(a, b int) { + // range is a through b inclusive + cclass.ranges = append(cclass.ranges, a, b) + if a < cclass.cmin { + cclass.cmin = a + } + if b > cclass.cmax { + cclass.cmax = b + } +} + +func (cclass *_CharClass) matches(c int) bool { + if c < cclass.cmin || c > cclass.cmax { + return cclass.negate + } + ranges := cclass.ranges + for i := 0; i < len(ranges); i = i + 2 { + if ranges[i] <= c && c <= ranges[i+1] { + return !cclass.negate + } + } + return cclass.negate +} + +func newCharClass() *_CharClass { + c := new(_CharClass) + c.ranges = make([]int, 0, 4) + c.cmin = 0x10FFFF + 1 // MaxRune + 1 + c.cmax = -1 + return c +} + +// --- ANY any character +type _Any struct { + common +} + +func (any *_Any) kind() int { return _ANY } +func (any *_Any) print() { print("any") } + +// --- NOTNL any character but newline +type _NotNl struct { + common +} + +func (notnl *_NotNl) kind() int { return _NOTNL } +func (notnl *_NotNl) print() { print("notnl") } + +// --- BRA parenthesized expression +type _Bra struct { + common + n int // subexpression number +} + +func (bra *_Bra) kind() int { return _BRA } +func (bra *_Bra) print() { print("bra", bra.n) } + +// --- EBRA end of parenthesized expression +type _Ebra struct { + common + n int // subexpression number +} + +func (ebra *_Ebra) kind() int { return _EBRA } +func (ebra *_Ebra) print() { print("ebra ", ebra.n) } + +// --- ALT alternation +type _Alt struct { + common + left instr // other branch +} + +func (alt *_Alt) kind() int { return _ALT } +func (alt *_Alt) print() { print("alt(", alt.left.index(), ")") } + +// --- NOP no operation +type _Nop struct { + common +} + +func (nop *_Nop) kind() int { return _NOP } +func (nop *_Nop) print() { print("nop") } + +func (re *Regexp) add(i instr) instr { + i.setIndex(len(re.inst)) + re.inst = append(re.inst, i) + return i +} + +type parser struct { + re *Regexp + nlpar int // number of unclosed lpars + pos int + ch int +} + +func (p *parser) error(err Error) { + panic(err) +} + +const endOfFile = -1 + +func (p *parser) c() int { return p.ch } + +func (p *parser) nextc() int { + if p.pos >= len(p.re.expr) { + p.ch = endOfFile + } else { + c, w := utf8.DecodeRuneInString(p.re.expr[p.pos:]) + p.ch = c + p.pos += w + } + return p.ch +} + +func newParser(re *Regexp) *parser { + p := new(parser) + p.re = re + p.nextc() // load p.ch + return p +} + +func special(c int) bool { + for _, r := range `\.+*?()|[]^$` { + if c == r { + return true + } + } + return false +} + +func ispunct(c int) bool { + for _, r := range "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" { + if c == r { + return true + } + } + return false +} + +var escapes = []byte("abfnrtv") +var escaped = []byte("\a\b\f\n\r\t\v") + +func escape(c int) int { + for i, b := range escapes { + if int(b) == c { + return i + } + } + return -1 +} + +func (p *parser) charClass() instr { + cc := newCharClass() + if p.c() == '^' { + cc.negate = true + p.nextc() + } + left := -1 + for { + switch c := p.c(); c { + case ']', endOfFile: + if left >= 0 { + p.error(ErrBadRange) + } + // Is it [^\n]? + if cc.negate && len(cc.ranges) == 2 && + cc.ranges[0] == '\n' && cc.ranges[1] == '\n' { + nl := new(_NotNl) + p.re.add(nl) + return nl + } + // Special common case: "[a]" -> "a" + if !cc.negate && len(cc.ranges) == 2 && cc.ranges[0] == cc.ranges[1] { + c := newChar(cc.ranges[0]) + p.re.add(c) + return c + } + p.re.add(cc) + return cc + case '-': // do this before backslash processing + p.error(ErrBadRange) + case '\\': + c = p.nextc() + switch { + case c == endOfFile: + p.error(ErrExtraneousBackslash) + case ispunct(c): + // c is as delivered + case escape(c) >= 0: + c = int(escaped[escape(c)]) + default: + p.error(ErrBadBackslash) + } + fallthrough + default: + p.nextc() + switch { + case left < 0: // first of pair + if p.c() == '-' { // range + p.nextc() + left = c + } else { // single char + cc.addRange(c, c) + } + case left <= c: // second of pair + cc.addRange(left, c) + left = -1 + default: + p.error(ErrBadRange) + } + } + } + return nil +} + +func (p *parser) term() (start, end instr) { + switch c := p.c(); c { + case '|', endOfFile: + return nil, nil + case '*', '+': + p.error(ErrBareClosure) + case ')': + if p.nlpar == 0 { + p.error(ErrUnmatchedRpar) + } + return nil, nil + case ']': + p.error(ErrUnmatchedRbkt) + case '^': + p.nextc() + start = p.re.add(new(_Bot)) + return start, start + case '$': + p.nextc() + start = p.re.add(new(_Eot)) + return start, start + case '.': + p.nextc() + start = p.re.add(new(_Any)) + return start, start + case '[': + p.nextc() + start = p.charClass() + if p.c() != ']' { + p.error(ErrUnmatchedLbkt) + } + p.nextc() + return start, start + case '(': + p.nextc() + p.nlpar++ + p.re.nbra++ // increment first so first subexpr is \1 + nbra := p.re.nbra + start, end = p.regexp() + if p.c() != ')' { + p.error(ErrUnmatchedLpar) + } + p.nlpar-- + p.nextc() + bra := new(_Bra) + p.re.add(bra) + ebra := new(_Ebra) + p.re.add(ebra) + bra.n = nbra + ebra.n = nbra + if start == nil { + if end == nil { + p.error(ErrInternal) + return + } + start = ebra + } else { + end.setNext(ebra) + } + bra.setNext(start) + return bra, ebra + case '\\': + c = p.nextc() + switch { + case c == endOfFile: + p.error(ErrExtraneousBackslash) + case ispunct(c): + // c is as delivered + case escape(c) >= 0: + c = int(escaped[escape(c)]) + default: + p.error(ErrBadBackslash) + } + fallthrough + default: + p.nextc() + start = newChar(c) + p.re.add(start) + return start, start + } + panic("unreachable") +} + +func (p *parser) closure() (start, end instr) { + start, end = p.term() + if start == nil { + return + } + switch p.c() { + case '*': + // (start,end)*: + alt := new(_Alt) + p.re.add(alt) + end.setNext(alt) // after end, do alt + alt.left = start // alternate brach: return to start + start = alt // alt becomes new (start, end) + end = alt + case '+': + // (start,end)+: + alt := new(_Alt) + p.re.add(alt) + end.setNext(alt) // after end, do alt + alt.left = start // alternate brach: return to start + end = alt // start is unchanged; end is alt + case '?': + // (start,end)?: + alt := new(_Alt) + p.re.add(alt) + nop := new(_Nop) + p.re.add(nop) + alt.left = start // alternate branch is start + alt.setNext(nop) // follow on to nop + end.setNext(nop) // after end, go to nop + start = alt // start is now alt + end = nop // end is nop pointed to by both branches + default: + return + } + switch p.nextc() { + case '*', '+', '?': + p.error(ErrBadClosure) + } + return +} + +func (p *parser) concatenation() (start, end instr) { + for { + nstart, nend := p.closure() + switch { + case nstart == nil: // end of this concatenation + if start == nil { // this is the empty string + nop := p.re.add(new(_Nop)) + return nop, nop + } + return + case start == nil: // this is first element of concatenation + start, end = nstart, nend + default: + end.setNext(nstart) + end = nend + } + } + panic("unreachable") +} + +func (p *parser) regexp() (start, end instr) { + start, end = p.concatenation() + for { + switch p.c() { + default: + return + case '|': + p.nextc() + nstart, nend := p.concatenation() + alt := new(_Alt) + p.re.add(alt) + alt.left = start + alt.setNext(nstart) + nop := new(_Nop) + p.re.add(nop) + end.setNext(nop) + nend.setNext(nop) + start, end = alt, nop + } + } + panic("unreachable") +} + +func unNop(i instr) instr { + for i.kind() == _NOP { + i = i.next() + } + return i +} + +func (re *Regexp) eliminateNops() { + for _, inst := range re.inst { + if inst.kind() == _END { + continue + } + inst.setNext(unNop(inst.next())) + if inst.kind() == _ALT { + alt := inst.(*_Alt) + alt.left = unNop(alt.left) + } + } +} + +func (re *Regexp) dump() { + print("prefix <", re.prefix, ">\n") + for _, inst := range re.inst { + print(inst.index(), ": ") + inst.print() + if inst.kind() != _END { + print(" -> ", inst.next().index()) + } + print("\n") + } +} + +func (re *Regexp) doParse() { + p := newParser(re) + start := new(_Start) + re.add(start) + s, e := p.regexp() + start.setNext(s) + re.start = start + e.setNext(re.add(new(_End))) + + if debug { + re.dump() + println() + } + + re.eliminateNops() + if debug { + re.dump() + println() + } + re.setPrefix() + if debug { + re.dump() + println() + } +} + +// Extract regular text from the beginning of the pattern. +// That text can be used by doExecute to speed up matching. +func (re *Regexp) setPrefix() { + var b []byte + var utf = make([]byte, utf8.UTFMax) + // First instruction is start; skip that. + i := re.inst[0].next().index() +Loop: + for i < len(re.inst) { + inst := re.inst[i] + // stop if this is not a char + if inst.kind() != _CHAR { + break + } + // stop if this char can be followed by a match for an empty string, + // which includes closures, ^, and $. + switch re.inst[inst.next().index()].kind() { + case _BOT, _EOT, _ALT: + break Loop + } + n := utf8.EncodeRune(inst.(*_Char).char, utf) + b = bytes.Add(b, utf[0:n]) + i = inst.next().index() + } + // point prefixStart instruction to first non-CHAR after prefix + re.prefixStart = re.inst[i] + re.prefixBytes = b + re.prefix = string(b) +} + +// Compile parses a regular expression and returns, if successful, a Regexp +// object that can be used to match against text. +func Compile(str string) (regexp *Regexp, error os.Error) { + regexp = new(Regexp) + // doParse will panic if there is a parse error. + defer func() { + if e := recover(); e != nil { + regexp = nil + error = e.(Error) // Will re-panic if error was not an Error, e.g. nil-pointer exception + } + }() + regexp.expr = str + regexp.inst = make([]instr, 0, 10) + regexp.doParse() + return +} + +// MustCompile is like Compile but panics if the expression cannot be parsed. +// It simplifies safe initialization of global variables holding compiled regular +// expressions. +func MustCompile(str string) *Regexp { + regexp, error := Compile(str) + if error != nil { + panic(`regexp: compiling "` + str + `": ` + error.String()) + } + return regexp +} + +// NumSubexp returns the number of parenthesized subexpressions in this Regexp. +func (re *Regexp) NumSubexp() int { return re.nbra } + +// The match arena allows us to reduce the garbage generated by tossing +// match vectors away as we execute. Matches are ref counted and returned +// to a free list when no longer active. Increases a simple benchmark by 22X. +type matchArena struct { + head *matchVec + len int // length of match vector +} + +type matchVec struct { + m []int // pairs of bracketing submatches. 0th is start,end + ref int + next *matchVec +} + +func (a *matchArena) new() *matchVec { + if a.head == nil { + const N = 10 + block := make([]matchVec, N) + for i := 0; i < N; i++ { + b := &block[i] + b.next = a.head + a.head = b + } + } + m := a.head + a.head = m.next + m.ref = 0 + if m.m == nil { + m.m = make([]int, a.len) + } + return m +} + +func (a *matchArena) free(m *matchVec) { + m.ref-- + if m.ref == 0 { + m.next = a.head + a.head = m + } +} + +func (a *matchArena) copy(m *matchVec) *matchVec { + m1 := a.new() + copy(m1.m, m.m) + return m1 +} + +func (a *matchArena) noMatch() *matchVec { + m := a.new() + for i := range m.m { + m.m[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac" + } + m.ref = 1 + return m +} + +type state struct { + inst instr // next instruction to execute + prefixed bool // this match began with a fixed prefix + match *matchVec +} + +// Append new state to to-do list. Leftmost-longest wins so avoid +// adding a state that's already active. The matchVec will be inc-ref'ed +// if it is assigned to a state. +func (a *matchArena) addState(s []state, inst instr, prefixed bool, match *matchVec, pos, end int) []state { + switch inst.kind() { + case _BOT: + if pos == 0 { + s = a.addState(s, inst.next(), prefixed, match, pos, end) + } + return s + case _EOT: + if pos == end { + s = a.addState(s, inst.next(), prefixed, match, pos, end) + } + return s + case _BRA: + n := inst.(*_Bra).n + match.m[2*n] = pos + s = a.addState(s, inst.next(), prefixed, match, pos, end) + return s + case _EBRA: + n := inst.(*_Ebra).n + match.m[2*n+1] = pos + s = a.addState(s, inst.next(), prefixed, match, pos, end) + return s + } + index := inst.index() + l := len(s) + // States are inserted in order so it's sufficient to see if we have the same + // instruction; no need to see if existing match is earlier (it is). + for i := 0; i < l; i++ { + if s[i].inst.index() == index { + return s + } + } + s = append(s, state{inst, prefixed, match}) + match.ref++ + if inst.kind() == _ALT { + s = a.addState(s, inst.(*_Alt).left, prefixed, a.copy(match), pos, end) + // give other branch a copy of this match vector + s = a.addState(s, inst.next(), prefixed, a.copy(match), pos, end) + } + return s +} + +// Accepts either string or bytes - the logic is identical either way. +// If bytes == nil, scan str. +func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int { + var s [2][]state + s[0] = make([]state, 0, 10) + s[1] = make([]state, 0, 10) + in, out := 0, 1 + var final state + found := false + end := len(str) + if bytestr != nil { + end = len(bytestr) + } + // fast check for initial plain substring + prefixed := false // has this iteration begun by skipping a prefix? + if re.prefix != "" { + var advance int + if bytestr == nil { + advance = strings.Index(str[pos:], re.prefix) + } else { + advance = bytes.Index(bytestr[pos:], re.prefixBytes) + } + if advance == -1 { + return nil + } + pos += advance + len(re.prefix) + prefixed = true + } + arena := &matchArena{nil, 2 * (re.nbra + 1)} + for pos <= end { + if !found { + // prime the pump if we haven't seen a match yet + match := arena.noMatch() + match.m[0] = pos + if prefixed { + s[out] = arena.addState(s[out], re.prefixStart, true, match, pos, end) + prefixed = false // next iteration should start at beginning of machine. + } else { + s[out] = arena.addState(s[out], re.start.next(), false, match, pos, end) + } + arena.free(match) // if addState saved it, ref was incremented + } + in, out = out, in // old out state is new in state + // clear out old state + old := s[out] + for _, state := range old { + arena.free(state.match) + } + s[out] = old[0:0] // truncate state vector + if found && len(s[in]) == 0 { + // machine has completed + break + } + charwidth := 1 + c := endOfFile + if pos < end { + if bytestr == nil { + c, charwidth = utf8.DecodeRuneInString(str[pos:end]) + } else { + c, charwidth = utf8.DecodeRune(bytestr[pos:end]) + } + } + pos += charwidth + for _, st := range s[in] { + switch st.inst.kind() { + case _BOT: + case _EOT: + case _CHAR: + if c == st.inst.(*_Char).char { + s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) + } + case _CHARCLASS: + if st.inst.(*_CharClass).matches(c) { + s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) + } + case _ANY: + if c != endOfFile { + s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) + } + case _NOTNL: + if c != endOfFile && c != '\n' { + s[out] = arena.addState(s[out], st.inst.next(), st.prefixed, st.match, pos, end) + } + case _BRA: + case _EBRA: + case _ALT: + case _END: + // choose leftmost longest + if !found || // first + st.match.m[0] < final.match.m[0] || // leftmost + (st.match.m[0] == final.match.m[0] && pos-charwidth > final.match.m[1]) { // longest + if final.match != nil { + arena.free(final.match) + } + final = st + final.match.ref++ + final.match.m[1] = pos - charwidth + } + found = true + default: + st.inst.print() + panic("unknown instruction in execute") + } + } + } + if final.match == nil { + return nil + } + // if match found, back up start of match by width of prefix. + if final.prefixed && len(final.match.m) > 0 { + final.match.m[0] -= len(re.prefix) + } + return final.match.m +} + +// MatchString returns whether the Regexp matches the string s. +// The return value is a boolean: true for match, false for no match. +func (re *Regexp) MatchString(s string) bool { return len(re.doExecute(s, nil, 0)) > 0 } + +// Match returns whether the Regexp matches the byte slice b. +// The return value is a boolean: true for match, false for no match. +func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0 } + + +// MatchString checks whether a textual regular expression +// matches a string. More complicated queries need +// to use Compile and the full Regexp interface. +func MatchString(pattern string, s string) (matched bool, error os.Error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.MatchString(s), nil +} + +// Match checks whether a textual regular expression +// matches a byte slice. More complicated queries need +// to use Compile and the full Regexp interface. +func Match(pattern string, b []byte) (matched bool, error os.Error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.Match(b), nil +} + +// ReplaceAllString returns a copy of src in which all matches for the Regexp +// have been replaced by repl. No support is provided for expressions +// (e.g. \1 or $1) in the replacement string. +func (re *Regexp) ReplaceAllString(src, repl string) string { + return re.ReplaceAllStringFunc(src, func(string) string { return repl }) +} + +// ReplaceAllStringFunc returns a copy of src in which all matches for the +// Regexp have been replaced by the return value of of function repl (whose +// first argument is the matched string). No support is provided for +// expressions (e.g. \1 or $1) in the replacement string. +func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { + lastMatchEnd := 0 // end position of the most recent match + searchPos := 0 // position where we next look for a match + buf := new(bytes.Buffer) + for searchPos <= len(src) { + a := re.doExecute(src, nil, searchPos) + if len(a) == 0 { + break // no more matches + } + + // Copy the unmatched characters before this match. + io.WriteString(buf, src[lastMatchEnd:a[0]]) + + // Now insert a copy of the replacement string, but not for a + // match of the empty string immediately after another match. + // (Otherwise, we get double replacement for patterns that + // match both empty and nonempty strings.) + if a[1] > lastMatchEnd || a[0] == 0 { + io.WriteString(buf, repl(src[a[0]:a[1]])) + } + lastMatchEnd = a[1] + + // Advance past this match; always advance at least one character. + _, width := utf8.DecodeRuneInString(src[searchPos:]) + if searchPos+width > a[1] { + searchPos += width + } else if searchPos+1 > a[1] { + // This clause is only needed at the end of the input + // string. In that case, DecodeRuneInString returns width=0. + searchPos++ + } else { + searchPos = a[1] + } + } + + // Copy the unmatched characters after the last match. + io.WriteString(buf, src[lastMatchEnd:]) + + return buf.String() +} + +// ReplaceAll returns a copy of src in which all matches for the Regexp +// have been replaced by repl. No support is provided for expressions +// (e.g. \1 or $1) in the replacement text. +func (re *Regexp) ReplaceAll(src, repl []byte) []byte { + return re.ReplaceAllFunc(src, func([]byte) []byte { return repl }) +} + +// ReplaceAllFunc returns a copy of src in which all matches for the +// Regexp have been replaced by the return value of of function repl (whose +// first argument is the matched []byte). No support is provided for +// expressions (e.g. \1 or $1) in the replacement string. +func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { + lastMatchEnd := 0 // end position of the most recent match + searchPos := 0 // position where we next look for a match + buf := new(bytes.Buffer) + for searchPos <= len(src) { + a := re.doExecute("", src, searchPos) + if len(a) == 0 { + break // no more matches + } + + // Copy the unmatched characters before this match. + buf.Write(src[lastMatchEnd:a[0]]) + + // Now insert a copy of the replacement string, but not for a + // match of the empty string immediately after another match. + // (Otherwise, we get double replacement for patterns that + // match both empty and nonempty strings.) + if a[1] > lastMatchEnd || a[0] == 0 { + buf.Write(repl(src[a[0]:a[1]])) + } + lastMatchEnd = a[1] + + // Advance past this match; always advance at least one character. + _, width := utf8.DecodeRune(src[searchPos:]) + if searchPos+width > a[1] { + searchPos += width + } else if searchPos+1 > a[1] { + // This clause is only needed at the end of the input + // string. In that case, DecodeRuneInString returns width=0. + searchPos++ + } else { + searchPos = a[1] + } + } + + // Copy the unmatched characters after the last match. + buf.Write(src[lastMatchEnd:]) + + return buf.Bytes() +} + +// QuoteMeta returns a string that quotes all regular expression metacharacters +// inside the argument text; the returned string is a regular expression matching +// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. +func QuoteMeta(s string) string { + b := make([]byte, 2*len(s)) + + // A byte loop is correct because all metacharacters are ASCII. + j := 0 + for i := 0; i < len(s); i++ { + if special(int(s[i])) { + b[j] = '\\' + j++ + } + b[j] = s[i] + j++ + } + return string(b[0:j]) +} + +// Find matches in slice b if b is non-nil, otherwise find matches in string s. +func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { + var end int + if b == nil { + end = len(s) + } else { + end = len(b) + } + + for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { + matches := re.doExecute(s, b, pos) + if len(matches) == 0 { + break + } + + accept := true + if matches[1] == pos { + // We've found an empty match. + if matches[0] == prevMatchEnd { + // We don't allow an empty match right + // after a previous match, so ignore it. + accept = false + } + var width int + if b == nil { + _, width = utf8.DecodeRuneInString(s[pos:end]) + } else { + _, width = utf8.DecodeRune(b[pos:end]) + } + if width > 0 { + pos += width + } else { + pos = end + 1 + } + } else { + pos = matches[1] + } + prevMatchEnd = matches[1] + + if accept { + deliver(matches) + i++ + } + } +} + +// Find returns a slice holding the text of the leftmost match in b of the regular expression. +// A return value of nil indicates no match. +func (re *Regexp) Find(b []byte) []byte { + a := re.doExecute("", b, 0) + if a == nil { + return nil + } + return b[a[0]:a[1]] +} + +// FindIndex returns a two-element slice of integers defining the location of +// the leftmost match in b of the regular expression. The match itself is at +// b[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindIndex(b []byte) (loc []int) { + a := re.doExecute("", b, 0) + if a == nil { + return nil + } + return a[0:2] +} + +// FindString returns a string holding the text of the leftmost match in s of the regular +// expression. If there is no match, the return value is an empty string, +// but it will also be empty if the regular expression successfully matches +// an empty string. Use FindStringIndex or FindStringSubmatch if it is +// necessary to distinguish these cases. +func (re *Regexp) FindString(s string) string { + a := re.doExecute(s, nil, 0) + if a == nil { + return "" + } + return s[a[0]:a[1]] +} + +// FindStringIndex returns a two-element slice of integers defining the +// location of the leftmost match in s of the regular expression. The match +// itself is at s[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindStringIndex(s string) []int { + a := re.doExecute(s, nil, 0) + if a == nil { + return nil + } + return a[0:2] +} + +// FindSubmatch returns a slice of slices holding the text of the leftmost +// match of the regular expression in b and the matches, if any, of its +// subexpressions, as defined by the 'Submatch' descriptions in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatch(b []byte) [][]byte { + a := re.doExecute("", b, 0) + if a == nil { + return nil + } + ret := make([][]byte, len(a)/2) + for i := range ret { + if a[2*i] >= 0 { + ret[i] = b[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindSubmatchIndex returns a slice holding the index pairs identifying the +// leftmost match of the regular expression in b and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatchIndex(b []byte) []int { + return re.doExecute("", b, 0) +} + +// FindStringSubmatch returns a slice of strings holding the text of the +// leftmost match of the regular expression in s and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatch(s string) []string { + a := re.doExecute(s, nil, 0) + if a == nil { + return nil + } + ret := make([]string, len(a)/2) + for i := range ret { + if a[2*i] >= 0 { + ret[i] = s[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindStringSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression in s and the +// matches, if any, of its subexpressions, as defined by the 'Submatch' and +// 'Index' descriptions in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatchIndex(s string) []int { + return re.doExecute(s, nil, 0) +} + +const startSize = 10 // The size at which to start a slice in the 'All' routines. + +// FindAll is the 'All' version of Find; it returns a slice of all successive +// matches of the expression, as defined by the 'All' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAll(b []byte, n int) [][]byte { + if n < 0 { + n = len(b) + 1 + } + result := make([][]byte, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, b[match[0]:match[1]]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, match[0:2]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllString is the 'All' version of FindString; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllString(s string, n int) []string { + if n < 0 { + n = len(s) + 1 + } + result := make([]string, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, s[match[0]:match[1]]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a +// slice of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, match[0:2]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice +// of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { + if n < 0 { + n = len(b) + 1 + } + result := make([][][]byte, 0, startSize) + re.allMatches("", b, n, func(match []int) { + slice := make([][]byte, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = b[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns +// a slice of all successive matches of the expression, as defined by the +// 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, match) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it +// returns a slice of all successive matches of the expression, as defined by +// the 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { + if n < 0 { + n = len(s) + 1 + } + result := make([][]string, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + slice := make([]string, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = s[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringSubmatchIndex is the 'All' version of +// FindStringSubmatchIndex; it returns a slice of all successive matches of +// the expression, as defined by the 'All' description in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, match) + }) + if len(result) == 0 { + return nil + } + return result +} |