From 5aa809137f9e65c56e0b34c15c0fe42be36894d7 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 13:13:40 -0700 Subject: Refactor var Lexer.tokens & func is_keyword --- sqlparse/keywords.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ sqlparse/lexer.py | 71 +++------------------------------------------------- 2 files changed, 70 insertions(+), 68 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py index bfea5d1..9cda48a 100644 --- a/sqlparse/keywords.py +++ b/sqlparse/keywords.py @@ -1,5 +1,72 @@ +# -*- coding: utf-8 -*- + from sqlparse import tokens + +def is_keyword(value): + val = value.upper() + return (KEYWORDS_COMMON.get(val) or KEYWORDS.get(val, tokens.Name)), value + + +SQL_REGEX = { + 'root': [ + (r'(--|# ).*?(\r\n|\r|\n)', tokens.Comment.Single), + # $ matches *before* newline, therefore we have two patterns + # to match Comment.Single + (r'(--|# ).*?$', tokens.Comment.Single), + (r'(\r\n|\r|\n)', tokens.Newline), + (r'\s+', tokens.Whitespace), + (r'/\*', tokens.Comment.Multiline, 'multiline-comments'), + (r':=', tokens.Assignment), + (r'::', tokens.Punctuation), + (r'[*]', tokens.Wildcard), + (r'CASE\b', tokens.Keyword), # extended CASE(foo) + (r"`(``|[^`])*`", tokens.Name), + (r"´(´´|[^´])*´", tokens.Name), + (r'\$([^\W\d]\w*)?\$', tokens.Name.Builtin), + (r'\?{1}', tokens.Name.Placeholder), + (r'%\(\w+\)s', tokens.Name.Placeholder), + (r'%s', tokens.Name.Placeholder), + (r'[$:?]\w+', tokens.Name.Placeholder), + # FIXME(andi): VALUES shouldn't be listed here + # see https://github.com/andialbrecht/sqlparse/pull/64 + (r'VALUES', tokens.Keyword), + (r'(@|##|#)[^\W\d_]\w+', tokens.Name), + # IN is special, it may be followed by a parenthesis, but + # is never a functino, see issue183 + (r'in\b(?=[ (])?', tokens.Keyword), + (r'USING(?=\()', tokens.Keyword), + (r'[^\W\d_]\w*(?=[.(])', tokens.Name), # see issue39 + (r'[-]?0x[0-9a-fA-F]+', tokens.Number.Hexadecimal), + (r'[-]?[0-9]*(\.[0-9]+)?[eE][-]?[0-9]+', tokens.Number.Float), + (r'[-]?[0-9]*\.[0-9]+', tokens.Number.Float), + (r'[-]?[0-9]+', tokens.Number.Integer), + (r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single), + # not a real string literal in ANSI SQL: + (r'(""|".*?[^\\]")', tokens.String.Symbol), + # sqlite names can be escaped with [square brackets]. left bracket + # cannot be preceded by word character or a right bracket -- + # otherwise it's probably an array index + (r'(?=~!]+', tokens.Operator.Comparison), + (r'[+/@#%^&|`?^-]+', tokens.Operator), + ], + 'multiline-comments': [ + (r'/\*', tokens.Comment.Multiline, 'multiline-comments'), + (r'\*/', tokens.Comment.Multiline, '#pop'), + (r'[^/\*]+', tokens.Comment.Multiline), + (r'[/*]', tokens.Comment.Multiline), + ]} + KEYWORDS = { 'ABORT': tokens.Keyword, 'ABS': tokens.Keyword, diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 6bc49ee..a908989 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -16,10 +16,8 @@ import re import sys from sqlparse import tokens -from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON +from sqlparse.keywords import SQL_REGEX from sqlparse.compat import StringIO, string_types, with_metaclass, text_type - - class include(str): pass @@ -35,9 +33,6 @@ class combined(tuple): pass -def is_keyword(value): - test = value.upper() - return KEYWORDS_COMMON.get(test, KEYWORDS.get(test, tokens.Name)), value def apply_filters(stream, filters, lexer=None): @@ -134,9 +129,8 @@ class LexerMeta(type): cls._all_tokens = {} cls._tmpname = 0 processed = cls._all_tokens[cls.__name__] = {} - # tokendefs = tokendefs or cls.tokens[name] - for state in cls.tokens.keys(): - cls._process_state(cls.tokens, processed, state) + for state in SQL_REGEX: + cls._process_state(SQL_REGEX, processed, state) return processed def __call__(cls, *args, **kwds): @@ -160,65 +154,6 @@ class _Lexer(object): tabsize = 0 flags = re.IGNORECASE | re.UNICODE - tokens = { - 'root': [ - (r'(--|# ).*?(\r\n|\r|\n)', tokens.Comment.Single), - # $ matches *before* newline, therefore we have two patterns - # to match Comment.Single - (r'(--|# ).*?$', tokens.Comment.Single), - (r'(\r\n|\r|\n)', tokens.Newline), - (r'\s+', tokens.Whitespace), - (r'/\*', tokens.Comment.Multiline, 'multiline-comments'), - (r':=', tokens.Assignment), - (r'::', tokens.Punctuation), - (r'[*]', tokens.Wildcard), - (r'CASE\b', tokens.Keyword), # extended CASE(foo) - (r"`(``|[^`])*`", tokens.Name), - (r"´(´´|[^´])*´", tokens.Name), - (r'\$([^\W\d]\w*)?\$', tokens.Name.Builtin), - (r'\?{1}', tokens.Name.Placeholder), - (r'%\(\w+\)s', tokens.Name.Placeholder), - (r'%s', tokens.Name.Placeholder), - (r'[$:?]\w+', tokens.Name.Placeholder), - # FIXME(andi): VALUES shouldn't be listed here - # see https://github.com/andialbrecht/sqlparse/pull/64 - (r'VALUES', tokens.Keyword), - (r'(@|##|#)[^\W\d_]\w+', tokens.Name), - # IN is special, it may be followed by a parenthesis, but - # is never a functino, see issue183 - (r'in\b(?=[ (])?', tokens.Keyword), - (r'USING(?=\()', tokens.Keyword), - (r'[^\W\d_]\w*(?=[.(])', tokens.Name), # see issue39 - (r'[-]?0x[0-9a-fA-F]+', tokens.Number.Hexadecimal), - (r'[-]?[0-9]*(\.[0-9]+)?[eE][-]?[0-9]+', tokens.Number.Float), - (r'[-]?[0-9]*\.[0-9]+', tokens.Number.Float), - (r'[-]?[0-9]+', tokens.Number.Integer), - (r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single), - # not a real string literal in ANSI SQL: - (r'(""|".*?[^\\]")', tokens.String.Symbol), - # sqlite names can be escaped with [square brackets]. left bracket - # cannot be preceded by word character or a right bracket -- - # otherwise it's probably an array index - (r'(?=~!]+', tokens.Operator.Comparison), - (r'[+/@#%^&|`?^-]+', tokens.Operator), - ], - 'multiline-comments': [ - (r'/\*', tokens.Comment.Multiline, 'multiline-comments'), - (r'\*/', tokens.Comment.Multiline, '#pop'), - (r'[^/\*]+', tokens.Comment.Multiline), - (r'[/*]', tokens.Comment.Multiline), - ]} - def __init__(self): self.filters = [] -- cgit v1.2.1 From 4c1200b95fd345d502e3da5173cdbcdbd67bfb11 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 12:58:09 -0700 Subject: Remove unused func/class/args in lexer.py Remove unused apply_filters from lexer.py Remove unused filters function from lexer.py Remove unused arguments get_tokens func --- sqlparse/lexer.py | 63 +++---------------------------------------------------- 1 file changed, 3 insertions(+), 60 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index a908989..74a3431 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -18,37 +18,6 @@ import sys from sqlparse import tokens from sqlparse.keywords import SQL_REGEX from sqlparse.compat import StringIO, string_types, with_metaclass, text_type -class include(str): - pass - - -class combined(tuple): - """Indicates a state combined from multiple states.""" - - def __new__(cls, *args): - return tuple.__new__(cls, args) - - def __init__(self, *args): - # tuple.__init__ doesn't do anything - pass - - - - -def apply_filters(stream, filters, lexer=None): - """ - Use this method to apply an iterable of filters to - a stream. If lexer is given it's forwarded to the - filter, otherwise the filter receives `None`. - """ - - def _apply(filter_, stream): - for token in filter_.filter(lexer, stream): - yield token - - for filter_ in filters: - stream = _apply(filter_, stream) - return stream class LexerMeta(type): @@ -65,12 +34,6 @@ class LexerMeta(type): tokenlist = processed[state] = [] rflags = cls.flags for tdef in unprocessed[state]: - if isinstance(tdef, include): - # it's a state reference - assert tdef != state, "circular state reference %r" % state - tokenlist.extend(cls._process_state( - unprocessed, processed, str(tdef))) - continue assert type(tdef) is tuple, "wrong rule def %r" % tdef @@ -101,18 +64,6 @@ class LexerMeta(type): new_state = -int(tdef2[5:]) else: assert False, 'unknown new state %r' % tdef2 - elif isinstance(tdef2, combined): - # combine a new state from existing ones - new_state = '_tmp_%d' % cls._tmpname - cls._tmpname += 1 - itokens = [] - for istate in tdef2: - assert istate != state, \ - 'circular state ref %r' % istate - itokens.extend(cls._process_state(unprocessed, - processed, istate)) - processed[new_state] = itokens - new_state = (new_state,) elif isinstance(tdef2, tuple): # push more than one state for state in tdef2: @@ -157,12 +108,6 @@ class _Lexer(object): def __init__(self): self.filters = [] - def add_filter(self, filter_, **options): - from sqlparse.filters import Filter - if not isinstance(filter_, Filter): - filter_ = filter_(**options) - self.filters.append(filter_) - def _expandtabs(self, text): if self.tabsize > 0: text = text.expandtabs(self.tabsize) @@ -186,7 +131,7 @@ class _Lexer(object): text = text.decode('unicode-escape') return self._expandtabs(text) - def get_tokens(self, text, unfiltered=False): + def get_tokens(self, text): """ Return an iterable of (tokentype, value) pairs generated from `text`. If `unfiltered` is set to `True`, the filtering mechanism @@ -211,11 +156,9 @@ class _Lexer(object): for i, t, v in self.get_tokens_unprocessed(text): yield t, v stream = streamer() - if not unfiltered: - stream = apply_filters(stream, self.filters, self) return stream - def get_tokens_unprocessed(self, stream, stack=('root',)): + def get_tokens_unprocessed(self, stream): """ Split ``text`` into (tokentype, text) pairs. @@ -223,7 +166,7 @@ class _Lexer(object): """ pos = 0 tokendefs = self._tokens # see __call__, pylint:disable=E1101 - statestack = list(stack) + statestack = ['root', ] statetokens = tokendefs[statestack[-1]] known_names = {} -- cgit v1.2.1 From 73f5f61c73da92cb5358c6b50b25e8d2eb20e3be Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 13:34:21 -0700 Subject: Remove undocumented ws handlers from lexer.py Removing nl before tokenizing breaks comments (stripnl/stripall) Remove tab-expansion. --- sqlparse/lexer.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 74a3431..6915a6a 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -100,23 +100,15 @@ class LexerMeta(type): class _Lexer(object): encoding = 'utf-8' - stripall = False - stripnl = False - tabsize = 0 flags = re.IGNORECASE | re.UNICODE def __init__(self): self.filters = [] - def _expandtabs(self, text): - if self.tabsize > 0: - text = text.expandtabs(self.tabsize) - return text - def _decode(self, text): if sys.version_info[0] == 3: if isinstance(text, str): - return self._expandtabs(text) + return text if self.encoding == 'guess': try: text = text.decode('utf-8') @@ -129,7 +121,7 @@ class _Lexer(object): text = text.decode(self.encoding) except UnicodeDecodeError: text = text.decode('unicode-escape') - return self._expandtabs(text) + return text def get_tokens(self, text): """ @@ -141,11 +133,6 @@ class _Lexer(object): wanted and applies registered filters. """ if isinstance(text, string_types): - if self.stripall: - text = text.strip() - elif self.stripnl: - text = text.strip('\n') - if sys.version_info[0] < 3 and isinstance(text, text_type): text = StringIO(text.encode('utf-8')) self.encoding = 'utf-8' -- cgit v1.2.1 From acae8ffbba7ddc310e5bc8aea391c099a57be903 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 14:40:21 -0700 Subject: Remove assert statements from lexer.py --- sqlparse/lexer.py | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 6915a6a..6a7d950 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -27,16 +27,11 @@ class LexerMeta(type): """ def _process_state(cls, unprocessed, processed, state): - assert type(state) is str, "wrong state name %r" % state - assert state[0] != '#', "invalid state name %r" % state if state in processed: return processed[state] tokenlist = processed[state] = [] rflags = cls.flags for tdef in unprocessed[state]: - - assert type(tdef) is tuple, "wrong rule def %r" % tdef - try: rex = re.compile(tdef[0], rflags).match except Exception as err: @@ -44,10 +39,6 @@ class LexerMeta(type): " %r of %r: %s" % (tdef[0], state, cls, err))) - assert type(tdef[1]) is tokens._TokenType or callable(tdef[1]), \ - ('token type must be simple type or callable, not %r' - % (tdef[1],)) - if len(tdef) == 2: new_state = None else: @@ -62,17 +53,9 @@ class LexerMeta(type): new_state = tdef2 elif tdef2[:5] == '#pop:': new_state = -int(tdef2[5:]) - else: - assert False, 'unknown new state %r' % tdef2 elif isinstance(tdef2, tuple): # push more than one state - for state in tdef2: - assert (state in unprocessed or - state in ('#pop', '#push')), \ - 'unknown new state ' + state new_state = tdef2 - else: - assert False, 'unknown new state def %r' % tdef2 tokenlist.append((rex, tdef[1], new_state)) return tokenlist @@ -198,8 +181,6 @@ class _Lexer(object): del statestack[new_state:] elif new_state == '#push': statestack.append(statestack[-1]) - else: - assert False, "wrong state def: %r" % new_state statetokens = tokendefs[statestack[-1]] break else: -- cgit v1.2.1 From 3ab860635cc0de381bdda01240646ab9ae39f162 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 15:29:26 -0700 Subject: Merge process_tokendef and __call__ Remove unused token_variants --- sqlparse/lexer.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 6a7d950..26e29c2 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -59,25 +59,15 @@ class LexerMeta(type): tokenlist.append((rex, tdef[1], new_state)) return tokenlist - def process_tokendef(cls): - cls._all_tokens = {} - cls._tmpname = 0 - processed = cls._all_tokens[cls.__name__] = {} - for state in SQL_REGEX: - cls._process_state(SQL_REGEX, processed, state) - return processed - - def __call__(cls, *args, **kwds): + def __call__(cls, *args): if not hasattr(cls, '_tokens'): cls._all_tokens = {} - cls._tmpname = 0 - if hasattr(cls, 'token_variants') and cls.token_variants: - # don't process yet - pass - else: - cls._tokens = cls.process_tokendef() + processed = cls._all_tokens[cls.__name__] = {} - return type.__call__(cls, *args, **kwds) + for state in SQL_REGEX: + cls._process_state(SQL_REGEX, processed, state) + cls._tokens = processed + return type.__call__(cls, *args) class _Lexer(object): -- cgit v1.2.1 From 6b95b43014db358527f53e8b191999d2f753f0a9 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 15:27:46 -0700 Subject: Merge process_state and __call__ Other Changes: Only #pop state is used by multiline comments. Remove unused states Simplify RegEx compilation RegEx Exception too generic --- sqlparse/lexer.py | 50 ++++++++++++++++---------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 26e29c2..1bdc4c3 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -26,46 +26,28 @@ class LexerMeta(type): self.tokens on the first instantiation. """ - def _process_state(cls, unprocessed, processed, state): - if state in processed: - return processed[state] - tokenlist = processed[state] = [] - rflags = cls.flags - for tdef in unprocessed[state]: - try: - rex = re.compile(tdef[0], rflags).match - except Exception as err: - raise ValueError(("uncompilable regex %r in state" - " %r of %r: %s" - % (tdef[0], state, cls, err))) - - if len(tdef) == 2: - new_state = None - else: - tdef2 = tdef[2] - if isinstance(tdef2, str): - # an existing state - if tdef2 == '#pop': - new_state = -1 - elif tdef2 in unprocessed: - new_state = (tdef2,) - elif tdef2 == '#push': - new_state = tdef2 - elif tdef2[:5] == '#pop:': - new_state = -int(tdef2[5:]) - elif isinstance(tdef2, tuple): - # push more than one state - new_state = tdef2 - tokenlist.append((rex, tdef[1], new_state)) - return tokenlist - def __call__(cls, *args): if not hasattr(cls, '_tokens'): cls._all_tokens = {} processed = cls._all_tokens[cls.__name__] = {} for state in SQL_REGEX: - cls._process_state(SQL_REGEX, processed, state) + processed[state] = [] + + for tdef in SQL_REGEX[state]: + rex = re.compile(tdef[0], cls.flags).match + + if len(tdef) == 2: + new_state = None + else: + # Only Multiline comments + tdef2 = tdef[2] + # an existing state + if tdef2 == '#pop': + new_state = -1 + elif tdef2 in SQL_REGEX: + new_state = (tdef2,) + processed[state].append((rex, tdef[1], new_state)) cls._tokens = processed return type.__call__(cls, *args) -- cgit v1.2.1 From 7e88aa243ed3356655b3a86da42e4d5dffb2bf10 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 17:28:54 -0700 Subject: Bid Adieu to metaclass The singleton pattern isn't applicable since only one language is being implemented. Simplify Lexer initialization. Fix compat func `u` on Py3. Signature didn't match Py2. Feature isn't used yet. --- sqlparse/compat.py | 16 +-------------- sqlparse/lexer.py | 57 ++++++++++++++++-------------------------------------- 2 files changed, 18 insertions(+), 55 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/compat.py b/sqlparse/compat.py index 334883b..84d0c96 100644 --- a/sqlparse/compat.py +++ b/sqlparse/compat.py @@ -14,7 +14,7 @@ PY2 = sys.version_info[0] == 2 PY3 = sys.version_info[0] == 3 if PY3: - def u(s): + def u(s, encoding=None): return str(s) @@ -37,17 +37,3 @@ elif PY2: text_type = unicode string_types = (basestring,) from StringIO import StringIO - - -# Directly copied from six: -def with_metaclass(meta, *bases): - """Create a base class with a metaclass.""" - - # This requires a bit of explanation: the basic idea is to make a dummy - # metaclass for one level of class instantiation that replaces itself with - # the actual metaclass. - class metaclass(meta): - def __new__(cls, name, this_bases, d): - return meta(name, bases, d) - - return type.__new__(metaclass, 'temporary_class', (), {}) diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 1bdc4c3..d7a8d28 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -17,48 +17,29 @@ import sys from sqlparse import tokens from sqlparse.keywords import SQL_REGEX -from sqlparse.compat import StringIO, string_types, with_metaclass, text_type +from sqlparse.compat import StringIO, string_types, text_type -class LexerMeta(type): - """ - Metaclass for Lexer, creates the self._tokens attribute from - self.tokens on the first instantiation. - """ - - def __call__(cls, *args): - if not hasattr(cls, '_tokens'): - cls._all_tokens = {} - processed = cls._all_tokens[cls.__name__] = {} - - for state in SQL_REGEX: - processed[state] = [] - - for tdef in SQL_REGEX[state]: - rex = re.compile(tdef[0], cls.flags).match - - if len(tdef) == 2: - new_state = None - else: - # Only Multiline comments - tdef2 = tdef[2] - # an existing state - if tdef2 == '#pop': - new_state = -1 - elif tdef2 in SQL_REGEX: - new_state = (tdef2,) - processed[state].append((rex, tdef[1], new_state)) - cls._tokens = processed - return type.__call__(cls, *args) - - -class _Lexer(object): - +class Lexer(object): encoding = 'utf-8' flags = re.IGNORECASE | re.UNICODE def __init__(self): - self.filters = [] + self._tokens = {} + + for state in SQL_REGEX: + self._tokens[state] = [] + + for tdef in SQL_REGEX[state]: + rex = re.compile(tdef[0], self.flags).match + new_state = None + if len(tdef) > 2: + # Only Multiline comments + if tdef[2] == '#pop': + new_state = -1 + elif tdef[2] in SQL_REGEX: + new_state = (tdef[2],) + self._tokens[state].append((rex, tdef[1], new_state)) def _decode(self, text): if sys.version_info[0] == 3: @@ -170,10 +151,6 @@ class _Lexer(object): break -class Lexer(with_metaclass(LexerMeta, _Lexer)): - pass - - def tokenize(sql, encoding=None): """Tokenize sql. -- cgit v1.2.1 From 099435cff2c79f1455e532e1e5423332d01f3519 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 22:43:06 -0700 Subject: Refactor lexer.tokens_unprocessed Remove unused return pos Remove redundant streamer func Remove unreached else statement Clean-up Lexer tokenization --- sqlparse/lexer.py | 87 +++++++++++++++++-------------------------------------- 1 file changed, 27 insertions(+), 60 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index d7a8d28..ca76eb1 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -17,7 +17,8 @@ import sys from sqlparse import tokens from sqlparse.keywords import SQL_REGEX -from sqlparse.compat import StringIO, string_types, text_type +from sqlparse.compat import StringIO, string_types, text_type, range +from sqlparse.utils import consume class Lexer(object): @@ -75,11 +76,7 @@ class Lexer(object): else: text = StringIO(text) - def streamer(): - for i, t, v in self.get_tokens_unprocessed(text): - yield t, v - stream = streamer() - return stream + return self.get_tokens_unprocessed(text) def get_tokens_unprocessed(self, stream): """ @@ -87,68 +84,38 @@ class Lexer(object): ``stack`` is the inital stack (default: ``['root']``) """ - pos = 0 - tokendefs = self._tokens # see __call__, pylint:disable=E1101 statestack = ['root', ] - statetokens = tokendefs[statestack[-1]] - known_names = {} + statetokens = self._tokens['root'] text = stream.read() text = self._decode(text) + iterable = iter(range(len(text))) - while 1: + for pos in iterable: for rexmatch, action, new_state in statetokens: m = rexmatch(text, pos) - if m: - value = m.group() - if value in known_names: - yield pos, known_names[value], value - elif type(action) is tokens._TokenType: - yield pos, action, value - elif hasattr(action, '__call__'): - ttype, value = action(value) - known_names[value] = ttype - yield pos, ttype, value - else: - for item in action(self, m): - yield item - pos = m.end() - if new_state is not None: - # state transition - if isinstance(new_state, tuple): - for state in new_state: - if state == '#pop': - statestack.pop() - elif state == '#push': - statestack.append(statestack[-1]) - elif ( - # Ugly hack - multiline-comments - # are not stackable - state != 'multiline-comments' - or not statestack - or statestack[-1] != 'multiline-comments' - ): - statestack.append(state) - elif isinstance(new_state, int): - # pop - del statestack[new_state:] - elif new_state == '#push': - statestack.append(statestack[-1]) - statetokens = tokendefs[statestack[-1]] - break + + if not m: + continue + elif isinstance(action, tokens._TokenType): + yield action, m.group() + elif callable(action): + yield action(m.group()) + + if isinstance(new_state, tuple): + for state in new_state: + # fixme: multiline-comments not stackable + if not (state == 'multiline-comments' + and statestack[-1] == 'multiline-comments'): + statestack.append(state) + elif isinstance(new_state, int): + del statestack[new_state:] + statetokens = self._tokens[statestack[-1]] + + consume(iterable, m.end() - pos - 1) + break else: - try: - if text[pos] == '\n': - # at EOL, reset state to "root" - pos += 1 - statestack = ['root'] - statetokens = tokendefs['root'] - yield pos, tokens.Text, u'\n' - continue - yield pos, tokens.Error, text[pos] - pos += 1 - except IndexError: - break + yield tokens.Error, text[pos] def tokenize(sql, encoding=None): -- cgit v1.2.1 From ce1374796a6dca53f44f1bd3fe09c6aa17574652 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 23:31:48 -0700 Subject: Remove encoding guessing on lexer.py --- sqlparse/lexer.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index ca76eb1..a371af7 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -43,17 +43,7 @@ class Lexer(object): self._tokens[state].append((rex, tdef[1], new_state)) def _decode(self, text): - if sys.version_info[0] == 3: - if isinstance(text, str): - return text - if self.encoding == 'guess': - try: - text = text.decode('utf-8') - if text.startswith(u'\ufeff'): - text = text[len(u'\ufeff'):] - except UnicodeDecodeError: - text = text.decode('latin1') - else: + if not isinstance(text, text_type): try: text = text.decode(self.encoding) except UnicodeDecodeError: -- cgit v1.2.1 From faacd60c2769008cf1cf439c969e6183cdb754fc Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Sat, 28 May 2016 23:32:41 -0700 Subject: Simplify handling of encoding in lexer.py --- sqlparse/lexer.py | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index a371af7..a93f7a7 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -13,7 +13,6 @@ # and to allow some customizations. import re -import sys from sqlparse import tokens from sqlparse.keywords import SQL_REGEX @@ -42,14 +41,6 @@ class Lexer(object): new_state = (tdef[2],) self._tokens[state].append((rex, tdef[1], new_state)) - def _decode(self, text): - if not isinstance(text, text_type): - try: - text = text.decode(self.encoding) - except UnicodeDecodeError: - text = text.decode('unicode-escape') - return text - def get_tokens(self, text): """ Return an iterable of (tokentype, value) pairs generated from @@ -58,18 +49,7 @@ class Lexer(object): Also preprocess the text, i.e. expand tabs and strip it if wanted and applies registered filters. - """ - if isinstance(text, string_types): - if sys.version_info[0] < 3 and isinstance(text, text_type): - text = StringIO(text.encode('utf-8')) - self.encoding = 'utf-8' - else: - text = StringIO(text) - - return self.get_tokens_unprocessed(text) - def get_tokens_unprocessed(self, stream): - """ Split ``text`` into (tokentype, text) pairs. ``stack`` is the inital stack (default: ``['root']``) @@ -77,8 +57,16 @@ class Lexer(object): statestack = ['root', ] statetokens = self._tokens['root'] - text = stream.read() - text = self._decode(text) + if isinstance(text, string_types): + text = StringIO(text) + + text = text.read() + if not isinstance(text, text_type): + try: + text = text.decode(self.encoding) + except UnicodeDecodeError: + text = text.decode('unicode-escape') + iterable = iter(range(len(text))) for pos in iterable: -- cgit v1.2.1 From e98b1922a10e8a5c2608d4cbe8cb9fa76c50baa5 Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Thu, 2 Jun 2016 12:21:05 -0700 Subject: Pass encoding into lexer as a parameter --- sqlparse/lexer.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index a93f7a7..d2ae8f6 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -21,7 +21,6 @@ from sqlparse.utils import consume class Lexer(object): - encoding = 'utf-8' flags = re.IGNORECASE | re.UNICODE def __init__(self): @@ -41,7 +40,7 @@ class Lexer(object): new_state = (tdef[2],) self._tokens[state].append((rex, tdef[1], new_state)) - def get_tokens(self, text): + def get_tokens(self, text, encoding=None): """ Return an iterable of (tokentype, value) pairs generated from `text`. If `unfiltered` is set to `True`, the filtering mechanism @@ -54,6 +53,7 @@ class Lexer(object): ``stack`` is the inital stack (default: ``['root']``) """ + encoding = encoding or 'utf-8' statestack = ['root', ] statetokens = self._tokens['root'] @@ -63,7 +63,7 @@ class Lexer(object): text = text.read() if not isinstance(text, text_type): try: - text = text.decode(self.encoding) + text = text.decode(encoding) except UnicodeDecodeError: text = text.decode('unicode-escape') @@ -102,7 +102,4 @@ def tokenize(sql, encoding=None): Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream of ``(token type, value)`` items. """ - lexer = Lexer() - if encoding is not None: - lexer.encoding = encoding - return lexer.get_tokens(sql) + return Lexer().get_tokens(sql, encoding) -- cgit v1.2.1 From 8240d962ae0f09119fde7b1575924068f02c6d8c Mon Sep 17 00:00:00 2001 From: Victor Uriarte Date: Thu, 2 Jun 2016 12:23:21 -0700 Subject: Replace iter(range(len(...))) with enumerate --- sqlparse/compat.py | 2 -- sqlparse/lexer.py | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'sqlparse') diff --git a/sqlparse/compat.py b/sqlparse/compat.py index 84d0c96..c1aacf6 100644 --- a/sqlparse/compat.py +++ b/sqlparse/compat.py @@ -18,7 +18,6 @@ if PY3: return str(s) - range = range text_type = str string_types = (str,) from io import StringIO @@ -33,7 +32,6 @@ elif PY2: return unicode(s, encoding) - range = xrange text_type = unicode string_types = (basestring,) from StringIO import StringIO diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index d2ae8f6..bb7fb48 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -16,7 +16,7 @@ import re from sqlparse import tokens from sqlparse.keywords import SQL_REGEX -from sqlparse.compat import StringIO, string_types, text_type, range +from sqlparse.compat import StringIO, string_types, text_type from sqlparse.utils import consume @@ -67,9 +67,8 @@ class Lexer(object): except UnicodeDecodeError: text = text.decode('unicode-escape') - iterable = iter(range(len(text))) - - for pos in iterable: + iterable = enumerate(text) + for pos, char in iterable: for rexmatch, action, new_state in statetokens: m = rexmatch(text, pos) @@ -93,7 +92,7 @@ class Lexer(object): consume(iterable, m.end() - pos - 1) break else: - yield tokens.Error, text[pos] + yield tokens.Error, char def tokenize(sql, encoding=None): -- cgit v1.2.1