summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon Heisterkamp <simon@heisterkamp.dk>2022-11-30 14:51:58 +0000
committerAndi Albrecht <albrecht.andi@gmail.com>2023-01-02 08:54:47 +0100
commit9a1cb5dddd1545c30b1e3a2c6f5d3514d079d93e (patch)
tree07bf50ff251f9fe0f092a166373db47e1a4d62e7
parent8b789f286e1b6cbf05c15020ea7544cb7f02f8f7 (diff)
downloadsqlparse-9a1cb5dddd1545c30b1e3a2c6f5d3514d079d93e.tar.gz
configurable syntax
-rw-r--r--sqlparse/keywords.py22
-rw-r--r--sqlparse/lexer.py87
-rw-r--r--tests/test_keywords.py3
3 files changed, 82 insertions, 30 deletions
diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py
index dff5e1c..ce53781 100644
--- a/sqlparse/keywords.py
+++ b/sqlparse/keywords.py
@@ -6,23 +6,17 @@
# the BSD License: https://opensource.org/licenses/BSD-3-Clause
import re
+from typing import Dict, List, Tuple, Callable, Union
from sqlparse import tokens
+# object() only supports "is" and is useful as a marker
+PROCESS_AS_KEYWORD = object()
-def is_keyword(value):
- """Checks for a keyword.
-
- If the given value is in one of the KEYWORDS_* dictionary
- it's considered a keyword. Otherwise tokens.Name is returned.
- """
- val = value.upper()
- return (KEYWORDS_COMMON.get(val)
- or KEYWORDS_ORACLE.get(val)
- or KEYWORDS_PLPGSQL.get(val)
- or KEYWORDS_HQL.get(val)
- or KEYWORDS_MSACCESS.get(val)
- or KEYWORDS.get(val, tokens.Name)), value
+SQL_REGEX_TYPE = List[
+ Tuple[Callable, Union[type(PROCESS_AS_KEYWORD), tokens._TokenType]]
+]
+KEYWORDS_TYPE = Dict[str, tokens._TokenType]
SQL_REGEX = {
@@ -99,7 +93,7 @@ SQL_REGEX = {
(r'(NOT\s+)?(REGEXP)\b', tokens.Operator.Comparison),
# Check for keywords, also returns tokens.Name if regex matches
# but the match isn't a keyword.
- (r'[0-9_\w][_$#\w]*', is_keyword),
+ (r'[0-9_\w][_$#\w]*', PROCESS_AS_KEYWORD),
(r'[;:()\[\],\.]', tokens.Punctuation),
(r'[<>=~!]+', tokens.Operator.Comparison),
(r'[+/@#%^&|^-]+', tokens.Operator),
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 4397f18..61c52a9 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -13,19 +13,74 @@
# and to allow some customizations.
from io import TextIOBase
+from typing import List
-from sqlparse import tokens
-from sqlparse.keywords import SQL_REGEX
+from sqlparse import tokens, keywords
from sqlparse.utils import consume
-class Lexer:
- """Lexer
- Empty class. Leaving for backwards-compatibility
- """
+class _LexerSingletonMetaclass(type):
+ _lexer_instance = None
+
+ def __call__(cls, *args, **kwargs):
+ if _LexerSingletonMetaclass._lexer_instance is None:
+ _LexerSingletonMetaclass._lexer_instance = super(
+ _LexerSingletonMetaclass, cls
+ ).__call__(*args, **kwargs)
+ return _LexerSingletonMetaclass._lexer_instance
+
+
+class Lexer(metaclass=_LexerSingletonMetaclass):
+ """The Lexer supports configurable syntax.
+ To add support for additional keywords, use the `add_keywords` method."""
+
+ _SQL_REGEX: keywords.SQL_REGEX_TYPE
+ _keywords: List[keywords.KEYWORDS_TYPE]
+
+ def default_initialization(self):
+ """Initialize the lexer with default dictionaries.
+ Useful if you need to revert custom syntax settings."""
+ self.clear()
+ self.set_SQL_REGEX(keywords.SQL_REGEX)
+ self.add_keywords(keywords.KEYWORDS_COMMON)
+ self.add_keywords(keywords.KEYWORDS_ORACLE)
+ self.add_keywords(keywords.KEYWORDS_PLPGSQL)
+ self.add_keywords(keywords.KEYWORDS_HQL)
+ self.add_keywords(keywords.KEYWORDS_MSACCESS)
+ self.add_keywords(keywords.KEYWORDS)
+
+ def __init__(self):
+ self.default_initialization()
+
+ def clear(self):
+ """Clear all syntax configurations.
+ Useful if you want to load a reduced set of syntax configurations."""
+ self._SQL_REGEX = []
+ self._keywords = []
+
+ def set_SQL_REGEX(self, SQL_REGEX: keywords.SQL_REGEX_TYPE):
+ """Set the list of regex that will parse the SQL."""
+ self._SQL_REGEX = SQL_REGEX
+
+ def add_keywords(self, keywords: keywords.KEYWORDS_TYPE):
+ """Add keyword dictionaries. Keywords are looked up in the same order
+ that dictionaries were added."""
+ self._keywords.append(keywords)
+
+ def is_keyword(self, value):
+ """Checks for a keyword.
+
+    If the given value is in one of the KEYWORDS_* dictionaries
+    it's considered a keyword. Otherwise tokens.Name is returned.
+ """
+ val = value.upper()
+ for kwdict in self._keywords:
+ if val in kwdict:
+ return kwdict[val], value
+ else:
+ return tokens.Name, value
- @staticmethod
- def get_tokens(text, encoding=None):
+ def get_tokens(self, text, encoding=None):
"""
Return an iterable of (tokentype, value) pairs generated from
`text`. If `unfiltered` is set to `True`, the filtering mechanism
@@ -48,24 +103,26 @@ class Lexer:
text = text.decode(encoding)
else:
try:
- text = text.decode('utf-8')
+ text = text.decode("utf-8")
except UnicodeDecodeError:
- text = text.decode('unicode-escape')
+ text = text.decode("unicode-escape")
else:
- raise TypeError("Expected text or file-like object, got {!r}".
- format(type(text)))
+ raise TypeError(
+ "Expected text or file-like object, got {!r}"
+ .format(type(text))
+ )
iterable = enumerate(text)
for pos, char in iterable:
- for rexmatch, action in SQL_REGEX:
+ for rexmatch, action in self._SQL_REGEX:
m = rexmatch(text, pos)
if not m:
continue
elif isinstance(action, tokens._TokenType):
yield action, m.group()
- elif callable(action):
- yield action(m.group())
+ elif action is keywords.PROCESS_AS_KEYWORD:
+ yield self.is_keyword(m.group())
consume(iterable, m.end() - pos - 1)
break
diff --git a/tests/test_keywords.py b/tests/test_keywords.py
index d4ded4b..a3b1b38 100644
--- a/tests/test_keywords.py
+++ b/tests/test_keywords.py
@@ -2,6 +2,7 @@ import pytest
from sqlparse import tokens
from sqlparse.keywords import SQL_REGEX
+from sqlparse.lexer import Lexer
class TestSQLREGEX:
@@ -9,5 +10,5 @@ class TestSQLREGEX:
'1.', '-1.',
'.1', '-.1'])
def test_float_numbers(self, number):
- ttype = next(tt for action, tt in SQL_REGEX if action(number))
+ ttype = next(tt for action, tt in Lexer()._SQL_REGEX if action(number))
assert tokens.Number.Float == ttype