diff options
author | Simon Heisterkamp <simon@heisterkamp.dk> | 2022-11-30 14:51:58 +0000 |
---|---|---|
committer | Andi Albrecht <albrecht.andi@gmail.com> | 2023-01-02 08:54:47 +0100 |
commit | 9a1cb5dddd1545c30b1e3a2c6f5d3514d079d93e (patch) | |
tree | 07bf50ff251f9fe0f092a166373db47e1a4d62e7 | |
parent | 8b789f286e1b6cbf05c15020ea7544cb7f02f8f7 (diff) | |
download | sqlparse-9a1cb5dddd1545c30b1e3a2c6f5d3514d079d93e.tar.gz |
configurable syntax
-rw-r--r-- | sqlparse/keywords.py | 22 | ||||
-rw-r--r-- | sqlparse/lexer.py | 87 | ||||
-rw-r--r-- | tests/test_keywords.py | 3 |
3 files changed, 82 insertions, 30 deletions
diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py index dff5e1c..ce53781 100644 --- a/sqlparse/keywords.py +++ b/sqlparse/keywords.py @@ -6,23 +6,17 @@ # the BSD License: https://opensource.org/licenses/BSD-3-Clause import re +from typing import Dict, List, Tuple, Callable, Union from sqlparse import tokens +# object() only supports "is" and is useful as a marker +PROCESS_AS_KEYWORD = object() -def is_keyword(value): - """Checks for a keyword. - - If the given value is in one of the KEYWORDS_* dictionary - it's considered a keyword. Otherwise tokens.Name is returned. - """ - val = value.upper() - return (KEYWORDS_COMMON.get(val) - or KEYWORDS_ORACLE.get(val) - or KEYWORDS_PLPGSQL.get(val) - or KEYWORDS_HQL.get(val) - or KEYWORDS_MSACCESS.get(val) - or KEYWORDS.get(val, tokens.Name)), value +SQL_REGEX_TYPE = List[ + Tuple[Callable, Union[type(PROCESS_AS_KEYWORD), tokens._TokenType]] +] +KEYWORDS_TYPE = Dict[str, tokens._TokenType] SQL_REGEX = { @@ -99,7 +93,7 @@ SQL_REGEX = { (r'(NOT\s+)?(REGEXP)\b', tokens.Operator.Comparison), # Check for keywords, also returns tokens.Name if regex matches # but the match isn't a keyword. - (r'[0-9_\w][_$#\w]*', is_keyword), + (r'[0-9_\w][_$#\w]*', PROCESS_AS_KEYWORD), (r'[;:()\[\],\.]', tokens.Punctuation), (r'[<>=~!]+', tokens.Operator.Comparison), (r'[+/@#%^&|^-]+', tokens.Operator), diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index 4397f18..61c52a9 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -13,19 +13,74 @@ # and to allow some customizations. from io import TextIOBase +from typing import List -from sqlparse import tokens -from sqlparse.keywords import SQL_REGEX +from sqlparse import tokens, keywords from sqlparse.utils import consume -class Lexer: - """Lexer - Empty class. Leaving for backwards-compatibility - """ +class _LexerSingletonMetaclass(type): + _lexer_instance = None + + def __call__(cls, *args, **kwargs): + if _LexerSingletonMetaclass._lexer_instance is None: + _LexerSingletonMetaclass._lexer_instance = super( + _LexerSingletonMetaclass, cls + ).__call__(*args, **kwargs) + return _LexerSingletonMetaclass._lexer_instance + + +class Lexer(metaclass=_LexerSingletonMetaclass): + """The Lexer supports configurable syntax. + To add support for additional keywords, use the `add_keywords` method.""" + + _SQL_REGEX: keywords.SQL_REGEX_TYPE + _keywords: List[keywords.KEYWORDS_TYPE] + + def default_initialization(self): + """Initialize the lexer with default dictionaries. + Useful if you need to revert custom syntax settings.""" + self.clear() + self.set_SQL_REGEX(keywords.SQL_REGEX) + self.add_keywords(keywords.KEYWORDS_COMMON) + self.add_keywords(keywords.KEYWORDS_ORACLE) + self.add_keywords(keywords.KEYWORDS_PLPGSQL) + self.add_keywords(keywords.KEYWORDS_HQL) + self.add_keywords(keywords.KEYWORDS_MSACCESS) + self.add_keywords(keywords.KEYWORDS) + + def __init__(self): + self.default_initialization() + + def clear(self): + """Clear all syntax configurations. + Useful if you want to load a reduced set of syntax configurations.""" + self._SQL_REGEX = [] + self._keywords = [] + + def set_SQL_REGEX(self, SQL_REGEX: keywords.SQL_REGEX_TYPE): + """Set the list of regex that will parse the SQL.""" + self._SQL_REGEX = SQL_REGEX + + def add_keywords(self, keywords: keywords.KEYWORDS_TYPE): + """Add keyword dictionaries. Keywords are looked up in the same order + that dictionaries were added.""" + self._keywords.append(keywords) + + def is_keyword(self, value): + """Checks for a keyword. + + If the given value is in one of the KEYWORDS_* dictionary + it's considered a keyword. Otherwise tokens.Name is returned. + """ + val = value.upper() + for kwdict in self._keywords: + if val in kwdict: + return kwdict[val], value + else: + return tokens.Name, value - @staticmethod - def get_tokens(text, encoding=None): + def get_tokens(self, text, encoding=None): """ Return an iterable of (tokentype, value) pairs generated from `text`. If `unfiltered` is set to `True`, the filtering mechanism @@ -48,24 +103,26 @@ class Lexer: text = text.decode(encoding) else: try: - text = text.decode('utf-8') + text = text.decode("utf-8") except UnicodeDecodeError: - text = text.decode('unicode-escape') + text = text.decode("unicode-escape") else: - raise TypeError("Expected text or file-like object, got {!r}". - format(type(text))) + raise TypeError( + "Expected text or file-like object, got {!r}" + .format(type(text)) + ) iterable = enumerate(text) for pos, char in iterable: - for rexmatch, action in SQL_REGEX: + for rexmatch, action in self._SQL_REGEX: m = rexmatch(text, pos) if not m: continue elif isinstance(action, tokens._TokenType): yield action, m.group() - elif callable(action): - yield action(m.group()) + elif action is keywords.PROCESS_AS_KEYWORD: + yield self.is_keyword(m.group()) consume(iterable, m.end() - pos - 1) break diff --git a/tests/test_keywords.py b/tests/test_keywords.py index d4ded4b..a3b1b38 100644 --- a/tests/test_keywords.py +++ b/tests/test_keywords.py @@ -2,6 +2,7 @@ import pytest from sqlparse import tokens from sqlparse.keywords import SQL_REGEX +from sqlparse.lexer import Lexer class TestSQLREGEX: @@ -9,5 +10,5 @@ class TestSQLREGEX: '1.', '-1.', '.1', '-.1']) def test_float_numbers(self, number): - ttype = next(tt for action, tt in SQL_REGEX if action(number)) + ttype = next(tt for action, tt in Lexer()._SQL_REGEX if action(number)) assert tokens.Number.Float == ttype |