summaryrefslogtreecommitdiff
path: root/libstdc++-v3/include
diff options
context:
space:
mode:
authorTim Shen <timshen91@gmail.com>2013-08-22 00:58:28 +0000
committerTim Shen <timshen@gcc.gnu.org>2013-08-22 00:58:28 +0000
commit1b488e33b66845b5eb1ac0dbff934fc2d8f75890 (patch)
treecd120fcb09910f198fb7bde5934a7cd9b3e79d4d /libstdc++-v3/include
parent9ad30113d636b59a11688d541149942b16b6ee5b (diff)
downloadgcc-1b488e33b66845b5eb1ac0dbff934fc2d8f75890.tar.gz
regex.h: Executor caller.
2013-08-22 Tim Shen <timshen91@gmail.com> * include/bits/regex.h: Executor caller. * include/bits/regex_executor.h: Fix empty grouping problem. * include/bits/regex_executor.tcc: Same. * testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc: New. From-SVN: r201914
Diffstat (limited to 'libstdc++-v3/include')
-rw-r--r--libstdc++-v3/include/bits/regex.h38
-rw-r--r--libstdc++-v3/include/bits/regex_executor.h94
-rw-r--r--libstdc++-v3/include/bits/regex_executor.tcc126
3 files changed, 169 insertions, 89 deletions
diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h
index 8fa979fa375..f5932edaadb 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -2211,7 +2211,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
if (__re._M_automaton == nullptr)
return false;
- if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match())
+ __detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match();
+ if (__m.size() > 0 && __m[0].matched)
{
for (auto __it : __m)
if (!__it.matched)
@@ -2371,22 +2372,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
if (__re._M_automaton == nullptr)
return false;
for (auto __cur = __first; __cur != __last; ++__cur) // Any KMP-like algo?
- if (__detail::__get_executor(__cur, __last, __m, __re, __flags)
- ->_M_search_from_first())
- {
- for (auto __it : __m)
- if (!__it.matched)
- __it.first = __it.second = __last;
- __m.at(__m.size()).first = __first;
- __m.at(__m.size()).second = __m[0].first;
- __m.at(__m.size()+1).first = __m[0].second;
- __m.at(__m.size()+1).second = __last;
- __m.at(__m.size()).matched =
- (__m.prefix().first != __m.prefix().second);
- __m.at(__m.size()+1).matched =
- (__m.suffix().first != __m.suffix().second);
- return true;
- }
+ {
+ __detail::__get_executor(__cur, __last, __m, __re, __flags)
+ ->_M_search_from_first();
+ if (__m.size() > 0 && __m[0].matched)
+ {
+ for (auto __it : __m)
+ if (!__it.matched)
+ __it.first = __it.second = __last;
+ __m.at(__m.size()).first = __first;
+ __m.at(__m.size()).second = __m[0].first;
+ __m.at(__m.size()+1).first = __m[0].second;
+ __m.at(__m.size()+1).second = __last;
+ __m.at(__m.size()).matched =
+ (__m.prefix().first != __m.prefix().second);
+ __m.at(__m.size()+1).matched =
+ (__m.suffix().first != __m.suffix().second);
+ return true;
+ }
+ }
return false;
}
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 0006a29b614..e2d6e6e412d 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -28,12 +28,17 @@
* Do not attempt to use it directly. @headername{regex}
*/
+// TODO: convert comments to doxygen format.
+
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename, typename>
class basic_regex;
+ template<typename>
+ class sub_match;
+
template<typename, typename>
class match_results;
_GLIBCXX_END_NAMESPACE_VERSION
@@ -52,19 +57,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
class _Executor
{
public:
- typedef match_results<_BiIter, _Alloc> _ResultsT;
- typedef regex_constants::match_flag_type _FlagT;
+ typedef match_results<_BiIter, _Alloc> _ResultsT;
+ typedef std::vector<sub_match<_BiIter>, _Alloc> _ResultsVec;
+ typedef regex_constants::match_flag_type _FlagT;
virtual
~_Executor()
{ }
// Set matched when string exactly match the pattern.
- virtual bool
+ virtual void
_M_match() = 0;
// Set matched when some prefix of the string matches the pattern.
- virtual bool
+ virtual void
_M_search_from_first() = 0;
protected:
@@ -74,20 +80,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_ResultsT& __results,
_FlagT __flags,
_SizeT __size)
- : _M_current(__begin), _M_end(__end),
- _M_results(__results), _M_flags(__flags)
+ : _M_current(__begin), _M_end(__end), _M_results(__results),
+ _M_flags(__flags)
{
- __results.resize(__size + 2);
- for (auto __it : __results)
- __it.matched = false;
+ __size += 2;
+ _M_results.resize(__size);
+ for (auto __i = 0; __i < __size; __i++)
+ _M_results[__i].matched = false;
}
- _BiIter _M_current;
- _BiIter _M_end;
- _ResultsT& _M_results;
- _FlagT _M_flags;
+ _BiIter _M_current;
+ _BiIter _M_end;
+ _ResultsVec& _M_results;
+ _FlagT _M_flags;
};
+ // A _DFSExecutor perform a DFS on given NFA and input string. At the very
+ // beginning the executor stands in the start state, then it try every
+ // possible state transition in current state recursively. Some state
+ // transitions consume input string, say, a single-char-matcher or a
+ // back-reference matcher; some not, like assertion or other anchor nodes.
+ // When the input is exhausted and the current state is an accepting state,
+ // the whole executor return true.
+ //
+ // TODO: This approach is exponentially slow for certain input.
+ // Try to compile the NFA to a DFA.
+ //
+ // Time complexity: exponential
+ // Space complexity: O(__end - __begin)
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
class _DFSExecutor
@@ -97,6 +117,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
typedef _NFA<_CharT, _TraitsT> _RegexT;
typedef typename _BaseT::_ResultsT _ResultsT;
+ typedef typename _BaseT::_ResultsVec _ResultsVec;
typedef regex_constants::match_flag_type _FlagT;
_DFSExecutor(_BiIter __begin,
@@ -105,37 +126,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
const _RegexT& __nfa,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
- _M_traits(_TraitsT()), _M_nfa(__nfa)
+ _M_traits(_TraitsT()), _M_nfa(__nfa), _M_results_ret(this->_M_results)
{ }
- bool
+ void
_M_match()
- { return _M_dfs<true>(_M_nfa._M_start()); }
+ { _M_dfs<true>(_M_nfa._M_start()); }
- bool
+ void
_M_search_from_first()
- { return _M_dfs<false>(_M_nfa._M_start()); }
+ { _M_dfs<false>(_M_nfa._M_start()); }
private:
template<bool __match_mode>
bool
_M_dfs(_StateIdT __i);
+ _ResultsVec _M_results_ret;
_TraitsT _M_traits;
const _RegexT& _M_nfa;
};
- // It's essentially a variant of Single-Source-Shortest-Path problem, where,
- // the matching results is the final distance and should be minimized.
- // Instead of using Dijkstra Algorithm, I pick up the queue-optimizaed
- // (BFS-like) Bellman-Ford algorithm,
- // SPFA(http://en.wikipedia.org/wiki/Shortest_Path_Faster_Algorithm).
+ // Like the DFS approach, it try every possible state transition; Unlike DFS,
+ // it uses a queue instead of a stack to store matching states. It's a BFS
+ // approach.
+ //
+ // Russ Cox's article(http://swtch.com/~rsc/regexp/regexp1.html) explained
+ // this algorithm clearly.
//
// Every entry of _M_covered saves the solution(grouping status) for every
- // matching head. When states transfer, solutions will be compared and
+ // matching head. When states transit, solutions will be compared and
// deduplicated(based on which greedy mode we have).
//
- // Time complexity: O(_M_str_cur.size() * _M_nfa.size())
+ // Time complexity: O((__end - __begin) * _M_nfa.size())
// Space complexity: O(_M_nfa.size() * _M_nfa.mark_count())
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
@@ -146,12 +169,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
typedef _NFA<_CharT, _TraitsT> _RegexT;
typedef typename _BaseT::_ResultsT _ResultsT;
- typedef std::unique_ptr<_ResultsT> _ResultsPtr;
+ typedef typename _BaseT::_ResultsVec _ResultsVec;
+ typedef std::unique_ptr<_ResultsVec> _ResultsPtr;
typedef regex_constants::match_flag_type _FlagT;
_BFSExecutor(_BiIter __begin,
_BiIter __end,
- _ResultsT& __results,
+ _ResultsT& __results,
const _RegexT& __nfa,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
@@ -159,21 +183,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
if (_M_nfa._M_start() != _S_invalid_state_id)
_M_covered[_M_nfa._M_start()] =
- _ResultsPtr(new _ResultsT(this->_M_results));
+ _ResultsPtr(new _ResultsVec(this->_M_results));
_M_e_closure();
}
- bool
+ void
_M_match()
- { return _M_main_loop<true>(); }
+ { _M_main_loop<true>(); }
- bool
+ void
_M_search_from_first()
- { return _M_main_loop<false>(); }
+ { _M_main_loop<false>(); }
private:
template<bool __match_mode>
- bool
+ void
_M_main_loop();
void
@@ -183,13 +207,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_move();
bool
- _M_match_less_than(_StateIdT __u, _StateIdT __v) const;
+ _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const;
bool
_M_includes_some() const;
std::map<_StateIdT, _ResultsPtr> _M_covered;
- const _RegexT& _M_nfa;
+ const _RegexT& _M_nfa;
};
//@} regex-detail
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index 08b4915a3e3..9115ea0ebf2 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -34,19 +34,18 @@ namespace __detail
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
- // TODO: This is too slow. Try to compile the NFA to a DFA.
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
template<bool __match_mode>
bool _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
_M_dfs(_StateIdT __i)
{
- auto& __current = this->_M_current;
- auto& __end = this->_M_end;
- auto& __results = this->_M_results;
if (__i == _S_invalid_state_id)
// This is not that certain. Need deeper investigate.
return false;
+ auto& __current = this->_M_current;
+ auto& __end = this->_M_end;
+ auto& __results = _M_results_ret;
const auto& __state = _M_nfa[__i];
bool __ret = false;
switch (__state._M_opcode)
@@ -59,14 +58,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|| _M_dfs<__match_mode>(__state._M_next);
break;
case _S_opcode_subexpr_begin:
- __results.at(__state._M_subexpr).first = __current;
- __ret = _M_dfs<__match_mode>(__state._M_next);
+ // Here's the critical part: if there's nothing changed since last
+ // visit, do NOT continue. This prevents the executor from get into
+ // infinite loop when use "()*" to match "".
+ //
+ // Every change on __results will be roll back after the recursion
+ // step finished.
+ if (!__results[__state._M_subexpr].matched
+ || __results[__state._M_subexpr].first != __current)
+ {
+ auto __back = __current;
+ __results[__state._M_subexpr].first = __current;
+ __ret = _M_dfs<__match_mode>(__state._M_next);
+ __results[__state._M_subexpr].first = __back;
+ }
break;
case _S_opcode_subexpr_end:
- __results.at(__state._M_subexpr).second = __current;
- __results.at(__state._M_subexpr).matched = true;
- __ret = _M_dfs<__match_mode>(__state._M_next);
- __results.at(__state._M_subexpr).matched = __ret;
+ if (__results[__state._M_subexpr].second != __current
+ || __results[__state._M_subexpr].matched != true)
+ {
+ auto __back = __results[__state._M_subexpr];
+ __results[__state._M_subexpr].second = __current;
+ __results[__state._M_subexpr].matched = true;
+ __ret = _M_dfs<__match_mode>(__state._M_next);
+ __results[__state._M_subexpr] = __back;
+ }
+ else
+ __ret = _M_dfs<__match_mode>(__state._M_next);
break;
case _S_opcode_match:
if (__current != __end && __state._M_matches(*__current))
@@ -82,7 +100,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// If matched, keep going; else just return to try another state.
case _S_opcode_backref:
{
- auto& __submatch = __results.at(__state._M_backref_index);
+ auto& __submatch = __results[__state._M_backref_index];
if (!__submatch.matched)
break;
auto __last = __current;
@@ -92,12 +110,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
++__last;
if (_M_traits.transform(__submatch.first, __submatch.second)
== _M_traits.transform(__current, __last))
- {
- auto __backup = __current;
- __current = __last;
+ if (__last != __current)
+ {
+ auto __backup = __current;
+ __current = __last;
+ __ret = _M_dfs<__match_mode>(__state._M_next);
+ __current = __backup;
+ }
+ else
__ret = _M_dfs<__match_mode>(__state._M_next);
- __current = __backup;
- }
}
break;
case _S_opcode_accept:
@@ -105,6 +126,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__ret = __current == __end;
else
__ret = true;
+ if (__ret)
+ this->_M_results = __results;
break;
default:
_GLIBCXX_DEBUG_ASSERT(false);
@@ -115,22 +138,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
template<bool __match_mode>
- bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
+ void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
_M_main_loop()
{
while (this->_M_current != this->_M_end)
{
if (!__match_mode)
if (_M_includes_some())
- return true;
+ return;
_M_move();
++this->_M_current;
_M_e_closure();
}
- return _M_includes_some();
+ _M_includes_some();
}
- // The SPFA approach.
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
@@ -152,13 +174,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
const auto& __state = _M_nfa[__u];
// Can be implemented using method, but there're too much arguments.
+ // I would use macro function before C++11, but lambda is a better
+ // choice, since hopefully compiler can inline it.
auto __add_visited_state = [&](_StateIdT __v)
{
if (__v == _S_invalid_state_id)
return;
- if (_M_match_less_than(__u, __v))
+ if (_M_covered.count(__u) != 0
+ && (_M_covered.count(__v) == 0
+ || _M_match_less_than(*_M_covered[__u], *_M_covered[__v])))
{
- _M_covered[__v] = _ResultsPtr(new _ResultsT(*_M_covered[__u]));
+ _M_covered[__v] = _ResultsPtr(new _ResultsVec(*_M_covered[__u]));
// if a state is updated, it's outgoing neighbors should be
// reconsidered too. Push them to the queue.
if (!__in_q[__v])
@@ -176,13 +202,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__add_visited_state(__state._M_alt);
break;
case _S_opcode_subexpr_begin:
- _M_covered[__u]->at(__state._M_subexpr).first = __current;
- __add_visited_state(__state._M_next);
+ {
+ auto& __cu = *_M_covered[__u];
+ auto __back = __cu[__state._M_subexpr].first;
+ __cu[__state._M_subexpr].first = __current;
+ __add_visited_state(__state._M_next);
+ __cu[__state._M_subexpr].first = __back;
+ }
break;
case _S_opcode_subexpr_end:
- _M_covered[__u]->at(__state._M_subexpr).second = __current;
- _M_covered[__u]->at(__state._M_subexpr).matched = true;
- __add_visited_state(__state._M_next);
+ {
+ auto& __cu = *_M_covered[__u];
+ auto __back = __cu[__state._M_subexpr];
+ __cu[__state._M_subexpr].second = __current;
+ __cu[__state._M_subexpr].matched = true;
+ __add_visited_state(__state._M_next);
+ __cu[__state._M_subexpr] = __back;
+ }
break;
case _S_opcode_match:
break;
@@ -206,9 +242,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
const auto& __state = _M_nfa[__it.first];
if (__state._M_opcode == _S_opcode_match
&& __state._M_matches(*this->_M_current))
- if (_M_match_less_than(__it.first, __state._M_next)
- && __state._M_next != _S_invalid_state_id)
- __next[__state._M_next] = move(__it.second);
+ if (__state._M_next != _S_invalid_state_id)
+ if (__next.count(__state._M_next) == 0
+ || _M_match_less_than(*__it.second, *__next[__state._M_next]))
+ __next[__state._M_next] = move(__it.second);
}
_M_covered = move(__next);
}
@@ -216,14 +253,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
- _M_match_less_than(_StateIdT __u, _StateIdT __v) const
+ _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const
{
- if (_M_covered.count(__u) == 0)
- return false;
- if (_M_covered.count(__v) > 0)
- return true;
// TODO: Greedy and Non-greedy support
- return true;
+ _GLIBCXX_DEBUG_ASSERT(__u.size() == __v.size());
+ auto __size = __u.size();
+ for (auto __i = 0; __i < __size; __i++)
+ {
+ auto& __uit = __u[__i], __vit = __v[__i];
+ if (__uit.matched && !__vit.matched)
+ return true;
+ if (!__uit.matched && __vit.matched)
+ return false;
+ if (__uit.matched && __vit.matched)
+ {
+ // GREEDY
+ if (__uit.first != __vit.first)
+ return __uit.first < __vit.first;
+ if (__uit.second != __vit.second)
+ return __uit.second > __vit.second;
+ }
+ }
+ return false;
}
template<typename _BiIter, typename _Alloc,
@@ -265,11 +316,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
typedef std::unique_ptr<_Executor<_BiIter, _Alloc, _CharT, _TraitsT>>
_ExecutorPtr;
typedef _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _DFSExecutorT;
+ typedef _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _BFSExecutorT;
auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>>
(__re._M_automaton);
if (__p->_M_has_backref)
return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags));
- return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags));
+ return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags));
}
_GLIBCXX_END_NAMESPACE_VERSION