diff options
author | Tim Shen <timshen91@gmail.com> | 2013-08-22 00:58:28 +0000 |
---|---|---|
committer | Tim Shen <timshen@gcc.gnu.org> | 2013-08-22 00:58:28 +0000 |
commit | 1b488e33b66845b5eb1ac0dbff934fc2d8f75890 (patch) | |
tree | cd120fcb09910f198fb7bde5934a7cd9b3e79d4d /libstdc++-v3/include | |
parent | 9ad30113d636b59a11688d541149942b16b6ee5b (diff) | |
download | gcc-1b488e33b66845b5eb1ac0dbff934fc2d8f75890.tar.gz |
regex.h: Executor caller.
2013-08-22 Tim Shen <timshen91@gmail.com>
* include/bits/regex.h: Executor caller.
* include/bits/regex_executor.h: Fix empty grouping problem.
* include/bits/regex_executor.tcc: Same.
* testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc:
New.
From-SVN: r201914
Diffstat (limited to 'libstdc++-v3/include')
-rw-r--r-- | libstdc++-v3/include/bits/regex.h | 38 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_executor.h | 94 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_executor.tcc | 126 |
3 files changed, 169 insertions, 89 deletions
diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 8fa979fa375..f5932edaadb 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -2211,7 +2211,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (__re._M_automaton == nullptr) return false; - if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match()) + __detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match(); + if (__m.size() > 0 && __m[0].matched) { for (auto __it : __m) if (!__it.matched) @@ -2371,22 +2372,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (__re._M_automaton == nullptr) return false; for (auto __cur = __first; __cur != __last; ++__cur) // Any KMP-like algo? - if (__detail::__get_executor(__cur, __last, __m, __re, __flags) - ->_M_search_from_first()) - { - for (auto __it : __m) - if (!__it.matched) - __it.first = __it.second = __last; - __m.at(__m.size()).first = __first; - __m.at(__m.size()).second = __m[0].first; - __m.at(__m.size()+1).first = __m[0].second; - __m.at(__m.size()+1).second = __last; - __m.at(__m.size()).matched = - (__m.prefix().first != __m.prefix().second); - __m.at(__m.size()+1).matched = - (__m.suffix().first != __m.suffix().second); - return true; - } + { + __detail::__get_executor(__cur, __last, __m, __re, __flags) + ->_M_search_from_first(); + if (__m.size() > 0 && __m[0].matched) + { + for (auto __it : __m) + if (!__it.matched) + __it.first = __it.second = __last; + __m.at(__m.size()).first = __first; + __m.at(__m.size()).second = __m[0].first; + __m.at(__m.size()+1).first = __m[0].second; + __m.at(__m.size()+1).second = __last; + __m.at(__m.size()).matched = + (__m.prefix().first != __m.prefix().second); + __m.at(__m.size()+1).matched = + (__m.suffix().first != __m.suffix().second); + return true; + } + } return false; } diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 0006a29b614..e2d6e6e412d 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -28,12 +28,17 @@ * Do not attempt to use it directly. @headername{regex} */ +// TODO: convert comments to doxygen format. + namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename, typename> class basic_regex; + template<typename> + class sub_match; + template<typename, typename> class match_results; _GLIBCXX_END_NAMESPACE_VERSION @@ -52,19 +57,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION class _Executor { public: - typedef match_results<_BiIter, _Alloc> _ResultsT; - typedef regex_constants::match_flag_type _FlagT; + typedef match_results<_BiIter, _Alloc> _ResultsT; + typedef std::vector<sub_match<_BiIter>, _Alloc> _ResultsVec; + typedef regex_constants::match_flag_type _FlagT; virtual ~_Executor() { } // Set matched when string exactly match the pattern. - virtual bool + virtual void _M_match() = 0; // Set matched when some prefix of the string matches the pattern. - virtual bool + virtual void _M_search_from_first() = 0; protected: @@ -74,20 +80,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _ResultsT& __results, _FlagT __flags, _SizeT __size) - : _M_current(__begin), _M_end(__end), - _M_results(__results), _M_flags(__flags) + : _M_current(__begin), _M_end(__end), _M_results(__results), + _M_flags(__flags) { - __results.resize(__size + 2); - for (auto __it : __results) - __it.matched = false; + __size += 2; + _M_results.resize(__size); + for (auto __i = 0; __i < __size; __i++) + _M_results[__i].matched = false; } - _BiIter _M_current; - _BiIter _M_end; - _ResultsT& _M_results; - _FlagT _M_flags; + _BiIter _M_current; + _BiIter _M_end; + _ResultsVec& _M_results; + _FlagT _M_flags; }; + // A _DFSExecutor perform a DFS on given NFA and input string. At the very + // beginning the executor stands in the start state, then it try every + // possible state transition in current state recursively. Some state + // transitions consume input string, say, a single-char-matcher or a + // back-reference matcher; some not, like assertion or other anchor nodes. + // When the input is exhausted and the current state is an accepting state, + // the whole executor return true. + // + // TODO: This approach is exponentially slow for certain input. + // Try to compile the NFA to a DFA. + // + // Time complexity: exponential + // Space complexity: O(__end - __begin) template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> class _DFSExecutor @@ -97,6 +117,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT; typedef _NFA<_CharT, _TraitsT> _RegexT; typedef typename _BaseT::_ResultsT _ResultsT; + typedef typename _BaseT::_ResultsVec _ResultsVec; typedef regex_constants::match_flag_type _FlagT; _DFSExecutor(_BiIter __begin, @@ -105,37 +126,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const _RegexT& __nfa, _FlagT __flags) : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_traits(_TraitsT()), _M_nfa(__nfa) + _M_traits(_TraitsT()), _M_nfa(__nfa), _M_results_ret(this->_M_results) { } - bool + void _M_match() - { return _M_dfs<true>(_M_nfa._M_start()); } + { _M_dfs<true>(_M_nfa._M_start()); } - bool + void _M_search_from_first() - { return _M_dfs<false>(_M_nfa._M_start()); } + { _M_dfs<false>(_M_nfa._M_start()); } private: template<bool __match_mode> bool _M_dfs(_StateIdT __i); + _ResultsVec _M_results_ret; _TraitsT _M_traits; const _RegexT& _M_nfa; }; - // It's essentially a variant of Single-Source-Shortest-Path problem, where, - // the matching results is the final distance and should be minimized. - // Instead of using Dijkstra Algorithm, I pick up the queue-optimizaed - // (BFS-like) Bellman-Ford algorithm, - // SPFA(http://en.wikipedia.org/wiki/Shortest_Path_Faster_Algorithm). + // Like the DFS approach, it try every possible state transition; Unlike DFS, + // it uses a queue instead of a stack to store matching states. It's a BFS + // approach. + // + // Russ Cox's article(http://swtch.com/~rsc/regexp/regexp1.html) explained + // this algorithm clearly. // // Every entry of _M_covered saves the solution(grouping status) for every - // matching head. When states transfer, solutions will be compared and + // matching head. When states transit, solutions will be compared and // deduplicated(based on which greedy mode we have). // - // Time complexity: O(_M_str_cur.size() * _M_nfa.size()) + // Time complexity: O((__end - __begin) * _M_nfa.size()) // Space complexity: O(_M_nfa.size() * _M_nfa.mark_count()) template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> @@ -146,12 +169,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT; typedef _NFA<_CharT, _TraitsT> _RegexT; typedef typename _BaseT::_ResultsT _ResultsT; - typedef std::unique_ptr<_ResultsT> _ResultsPtr; + typedef typename _BaseT::_ResultsVec _ResultsVec; + typedef std::unique_ptr<_ResultsVec> _ResultsPtr; typedef regex_constants::match_flag_type _FlagT; _BFSExecutor(_BiIter __begin, _BiIter __end, - _ResultsT& __results, + _ResultsT& __results, const _RegexT& __nfa, _FlagT __flags) : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), @@ -159,21 +183,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (_M_nfa._M_start() != _S_invalid_state_id) _M_covered[_M_nfa._M_start()] = - _ResultsPtr(new _ResultsT(this->_M_results)); + _ResultsPtr(new _ResultsVec(this->_M_results)); _M_e_closure(); } - bool + void _M_match() - { return _M_main_loop<true>(); } + { _M_main_loop<true>(); } - bool + void _M_search_from_first() - { return _M_main_loop<false>(); } + { _M_main_loop<false>(); } private: template<bool __match_mode> - bool + void _M_main_loop(); void @@ -183,13 +207,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_move(); bool - _M_match_less_than(_StateIdT __u, _StateIdT __v) const; + _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const; bool _M_includes_some() const; std::map<_StateIdT, _ResultsPtr> _M_covered; - const _RegexT& _M_nfa; + const _RegexT& _M_nfa; }; //@} regex-detail diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index 08b4915a3e3..9115ea0ebf2 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -34,19 +34,18 @@ namespace __detail { _GLIBCXX_BEGIN_NAMESPACE_VERSION - // TODO: This is too slow. Try to compile the NFA to a DFA. template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> template<bool __match_mode> bool _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: _M_dfs(_StateIdT __i) { - auto& __current = this->_M_current; - auto& __end = this->_M_end; - auto& __results = this->_M_results; if (__i == _S_invalid_state_id) // This is not that certain. Need deeper investigate. return false; + auto& __current = this->_M_current; + auto& __end = this->_M_end; + auto& __results = _M_results_ret; const auto& __state = _M_nfa[__i]; bool __ret = false; switch (__state._M_opcode) @@ -59,14 +58,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION || _M_dfs<__match_mode>(__state._M_next); break; case _S_opcode_subexpr_begin: - __results.at(__state._M_subexpr).first = __current; - __ret = _M_dfs<__match_mode>(__state._M_next); + // Here's the critical part: if there's nothing changed since last + // visit, do NOT continue. This prevents the executor from get into + // infinite loop when use "()*" to match "". + // + // Every change on __results will be roll back after the recursion + // step finished. + if (!__results[__state._M_subexpr].matched + || __results[__state._M_subexpr].first != __current) + { + auto __back = __current; + __results[__state._M_subexpr].first = __current; + __ret = _M_dfs<__match_mode>(__state._M_next); + __results[__state._M_subexpr].first = __back; + } break; case _S_opcode_subexpr_end: - __results.at(__state._M_subexpr).second = __current; - __results.at(__state._M_subexpr).matched = true; - __ret = _M_dfs<__match_mode>(__state._M_next); - __results.at(__state._M_subexpr).matched = __ret; + if (__results[__state._M_subexpr].second != __current + || __results[__state._M_subexpr].matched != true) + { + auto __back = __results[__state._M_subexpr]; + __results[__state._M_subexpr].second = __current; + __results[__state._M_subexpr].matched = true; + __ret = _M_dfs<__match_mode>(__state._M_next); + __results[__state._M_subexpr] = __back; + } + else + __ret = _M_dfs<__match_mode>(__state._M_next); break; case _S_opcode_match: if (__current != __end && __state._M_matches(*__current)) @@ -82,7 +100,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // If matched, keep going; else just return to try another state. case _S_opcode_backref: { - auto& __submatch = __results.at(__state._M_backref_index); + auto& __submatch = __results[__state._M_backref_index]; if (!__submatch.matched) break; auto __last = __current; @@ -92,12 +110,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION ++__last; if (_M_traits.transform(__submatch.first, __submatch.second) == _M_traits.transform(__current, __last)) - { - auto __backup = __current; - __current = __last; + if (__last != __current) + { + auto __backup = __current; + __current = __last; + __ret = _M_dfs<__match_mode>(__state._M_next); + __current = __backup; + } + else __ret = _M_dfs<__match_mode>(__state._M_next); - __current = __backup; - } } break; case _S_opcode_accept: @@ -105,6 +126,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __ret = __current == __end; else __ret = true; + if (__ret) + this->_M_results = __results; break; default: _GLIBCXX_DEBUG_ASSERT(false); @@ -115,22 +138,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> template<bool __match_mode> - bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: + void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: _M_main_loop() { while (this->_M_current != this->_M_end) { if (!__match_mode) if (_M_includes_some()) - return true; + return; _M_move(); ++this->_M_current; _M_e_closure(); } - return _M_includes_some(); + _M_includes_some(); } - // The SPFA approach. template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: @@ -152,13 +174,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const auto& __state = _M_nfa[__u]; // Can be implemented using method, but there're too much arguments. + // I would use macro function before C++11, but lambda is a better + // choice, since hopefully compiler can inline it. auto __add_visited_state = [&](_StateIdT __v) { if (__v == _S_invalid_state_id) return; - if (_M_match_less_than(__u, __v)) + if (_M_covered.count(__u) != 0 + && (_M_covered.count(__v) == 0 + || _M_match_less_than(*_M_covered[__u], *_M_covered[__v]))) { - _M_covered[__v] = _ResultsPtr(new _ResultsT(*_M_covered[__u])); + _M_covered[__v] = _ResultsPtr(new _ResultsVec(*_M_covered[__u])); // if a state is updated, it's outgoing neighbors should be // reconsidered too. Push them to the queue. if (!__in_q[__v]) @@ -176,13 +202,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __add_visited_state(__state._M_alt); break; case _S_opcode_subexpr_begin: - _M_covered[__u]->at(__state._M_subexpr).first = __current; - __add_visited_state(__state._M_next); + { + auto& __cu = *_M_covered[__u]; + auto __back = __cu[__state._M_subexpr].first; + __cu[__state._M_subexpr].first = __current; + __add_visited_state(__state._M_next); + __cu[__state._M_subexpr].first = __back; + } break; case _S_opcode_subexpr_end: - _M_covered[__u]->at(__state._M_subexpr).second = __current; - _M_covered[__u]->at(__state._M_subexpr).matched = true; - __add_visited_state(__state._M_next); + { + auto& __cu = *_M_covered[__u]; + auto __back = __cu[__state._M_subexpr]; + __cu[__state._M_subexpr].second = __current; + __cu[__state._M_subexpr].matched = true; + __add_visited_state(__state._M_next); + __cu[__state._M_subexpr] = __back; + } break; case _S_opcode_match: break; @@ -206,9 +242,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const auto& __state = _M_nfa[__it.first]; if (__state._M_opcode == _S_opcode_match && __state._M_matches(*this->_M_current)) - if (_M_match_less_than(__it.first, __state._M_next) - && __state._M_next != _S_invalid_state_id) - __next[__state._M_next] = move(__it.second); + if (__state._M_next != _S_invalid_state_id) + if (__next.count(__state._M_next) == 0 + || _M_match_less_than(*__it.second, *__next[__state._M_next])) + __next[__state._M_next] = move(__it.second); } _M_covered = move(__next); } @@ -216,14 +253,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: - _M_match_less_than(_StateIdT __u, _StateIdT __v) const + _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const { - if (_M_covered.count(__u) == 0) - return false; - if (_M_covered.count(__v) > 0) - return true; // TODO: Greedy and Non-greedy support - return true; + _GLIBCXX_DEBUG_ASSERT(__u.size() == __v.size()); + auto __size = __u.size(); + for (auto __i = 0; __i < __size; __i++) + { + auto& __uit = __u[__i], __vit = __v[__i]; + if (__uit.matched && !__vit.matched) + return true; + if (!__uit.matched && __vit.matched) + return false; + if (__uit.matched && __vit.matched) + { + // GREEDY + if (__uit.first != __vit.first) + return __uit.first < __vit.first; + if (__uit.second != __vit.second) + return __uit.second > __vit.second; + } + } + return false; } template<typename _BiIter, typename _Alloc, @@ -265,11 +316,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef std::unique_ptr<_Executor<_BiIter, _Alloc, _CharT, _TraitsT>> _ExecutorPtr; typedef _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _DFSExecutorT; + typedef _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _BFSExecutorT; auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>> (__re._M_automaton); if (__p->_M_has_backref) return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags)); - return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags)); + return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags)); } _GLIBCXX_END_NAMESPACE_VERSION |