From e0d3928ba69d73ba874ca03ec4395e94cf1ab293 Mon Sep 17 00:00:00 2001
From: Simon Heisterkamp <simon@heisterkamp.dk>
Date: Thu, 1 Dec 2022 10:35:42 +0000
Subject: lexer documentation

---
 docs/source/extending.rst |  66 +++++++++++++++++++
 docs/source/index.rst     |   1 +
 sqlparse/keywords.py      | 164 ++++++++++++++++++++++------------------------
 sqlparse/lexer.py         |   5 +-
 tests/test_parse.py       |   6 +-
 5 files changed, 151 insertions(+), 91 deletions(-)
 create mode 100644 docs/source/extending.rst

diff --git a/docs/source/extending.rst b/docs/source/extending.rst
new file mode 100644
index 0000000..f1bd551
--- /dev/null
+++ b/docs/source/extending.rst
@@ -0,0 +1,66 @@
+Extending :mod:`sqlparse`
+=========================
+
+.. module:: sqlparse
+   :synopsis: Extending parsing capability of sqlparse.
+
+The :mod:`sqlparse` module uses a sql grammar that was tuned through usage and numerous
+PR to fit a broad range of SQL syntaxes, but it cannot cater to every given case since
+some SQL dialects have adopted conflicting meanings of certain keywords. Sqlparse
+therefore exposes a mechanism to configure the fundamental keywords and regular
+expressions that parse the language as described below.
+
+If you find an adaptation that works for your specific use-case. Please consider
+contributing it back to the community by opening a PR on
+`GitHub <https://github.com/andialbrecht/sqlparse>`_.
+
+Configuring the Lexer
+---------------------
+
+The lexer is a singleton class that breaks down the stream of characters into language
+tokens. It does this by using a sequence of regular expressions and keywords that are
+listed in the file ``sqlparse.keywords``. Instead of applying these fixed grammar
+definitions directly, the lexer is default initialized in its method called
+``default_initialization()``. As an api user, you can adapt the Lexer configuration by
+applying your own configuration logic. To do so, start out by clearing previous
+configurations with ``.clear()``, then apply the SQL list with
+``.set_SQL_REGEX(SQL_REGEX)``, and apply keyword lists with ``.add_keywords(KEYWORDS)``.
+
+You can do so by re-using the expressions in ``sqlparse.keywords`` (see example below),
+leaving parts out, or by making up your own master list.
+
+See the expected types of the arguments by inspecting their structure in
+``sqlparse.keywords``.
+(For compatibility with python 3.4, this library does not use type-hints.)
+
+The following example adds support for the expression ``ZORDER BY``, and adds ``BAR`` as
+a keyword to the lexer:
+
+..  code-block:: python
+
+    import re
+
+    import sqlparse
+    from sqlparse import keywords
+    from sqlparse.lexer import Lexer
+
+    lex = Lexer()
+    lex.clear()
+
+    my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword)
+
+    # slice the default SQL_REGEX to inject the custom object
+    lex.set_SQL_REGEX(
+        keywords.SQL_REGEX[:38]
+        + [my_regex]
+        + keywords.SQL_REGEX[38:]
+    )
+    lex.add_keywords(keywords.KEYWORDS_COMMON)
+    lex.add_keywords(keywords.KEYWORDS_ORACLE)
+    lex.add_keywords(keywords.KEYWORDS_PLPGSQL)
+    lex.add_keywords(keywords.KEYWORDS_HQL)
+    lex.add_keywords(keywords.KEYWORDS_MSACCESS)
+    lex.add_keywords(keywords.KEYWORDS)
+    lex.add_keywords({'BAR', sqlparse.tokens.Keyword})
+
+    sqlparse.parse("select * from foo zorder by bar;")
diff --git a/docs/source/index.rst b/docs/source/index.rst
index cba3314..e18d2b3 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -20,6 +20,7 @@ Contents
    api
    analyzing
    ui
+   extending
    changes
    license
    indices
diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py
index 6bc7937..f04f928 100644
--- a/sqlparse/keywords.py
+++ b/sqlparse/keywords.py
@@ -5,96 +5,92 @@
 # This module is part of python-sqlparse and is released under
 # the BSD License: https://opensource.org/licenses/BSD-3-Clause
 
-import re
-
 from sqlparse import tokens
 
 # object() only supports "is" and is useful as a marker
+# use this marker to specify that the given regex in SQL_REGEX
+# shall be processed further through a lookup in the KEYWORDS dictionaries
 PROCESS_AS_KEYWORD = object()
 
 
-SQL_REGEX = {
-    'root': [
-        (r'(--|# )\+.*?(\r\n|\r|\n|$)', tokens.Comment.Single.Hint),
-        (r'/\*\+[\s\S]*?\*/', tokens.Comment.Multiline.Hint),
-
-        (r'(--|# ).*?(\r\n|\r|\n|$)', tokens.Comment.Single),
-        (r'/\*[\s\S]*?\*/', tokens.Comment.Multiline),
-
-        (r'(\r\n|\r|\n)', tokens.Newline),
-        (r'\s+?', tokens.Whitespace),
-
-        (r':=', tokens.Assignment),
-        (r'::', tokens.Punctuation),
-
-        (r'\*', tokens.Wildcard),
-
-        (r"`(``|[^`])*`", tokens.Name),
-        (r"´(´´|[^´])*´", tokens.Name),
-        (r'((?<!\S)\$(?:[_A-ZÀ-Ü]\w*)?\$)[\s\S]*?\1', tokens.Literal),
-
-        (r'\?', tokens.Name.Placeholder),
-        (r'%(\(\w+\))?s', tokens.Name.Placeholder),
-        (r'(?<!\w)[$:?]\w+', tokens.Name.Placeholder),
-
-        (r'\\\w+', tokens.Command),
-
-        # FIXME(andi): VALUES shouldn't be listed here
-        # see https://github.com/andialbrecht/sqlparse/pull/64
-        # AS and IN are special, it may be followed by a parenthesis, but
-        # are never functions, see issue183 and issue507
-        (r'(CASE|IN|VALUES|USING|FROM|AS)\b', tokens.Keyword),
-
-        (r'(@|##|#)[A-ZÀ-Ü]\w+', tokens.Name),
-
-        # see issue #39
-        # Spaces around period `schema . name` are valid identifier
-        # TODO: Spaces before period not implemented
-        (r'[A-ZÀ-Ü]\w*(?=\s*\.)', tokens.Name),  # 'Name'.
-        # FIXME(atronah): never match,
-        # because `re.match` doesn't work with look-behind regexp feature
-        (r'(?<=\.)[A-ZÀ-Ü]\w*', tokens.Name),  # .'Name'
-        (r'[A-ZÀ-Ü]\w*(?=\()', tokens.Name),  # side effect: change kw to func
-        (r'-?0x[\dA-F]+', tokens.Number.Hexadecimal),
-        (r'-?\d+(\.\d+)?E-?\d+', tokens.Number.Float),
-        (r'(?![_A-ZÀ-Ü])-?(\d+(\.\d*)|\.\d+)(?![_A-ZÀ-Ü])',
-         tokens.Number.Float),
-        (r'(?![_A-ZÀ-Ü])-?\d+(?![_A-ZÀ-Ü])', tokens.Number.Integer),
-        (r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
-        # not a real string literal in ANSI SQL:
-        (r'"(""|\\\\|\\"|[^"])*"', tokens.String.Symbol),
-        (r'(""|".*?[^\\]")', tokens.String.Symbol),
-        # sqlite names can be escaped with [square brackets]. left bracket
-        # cannot be preceded by word character or a right bracket --
-        # otherwise it's probably an array index
-        (r'(?<![\w\])])(\[[^\]\[]+\])', tokens.Name),
-        (r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?'
-         r'|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
-        (r'END(\s+IF|\s+LOOP|\s+WHILE)?\b', tokens.Keyword),
-        (r'NOT\s+NULL\b', tokens.Keyword),
-        (r'NULLS\s+(FIRST|LAST)\b', tokens.Keyword),
-        (r'UNION\s+ALL\b', tokens.Keyword),
-        (r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
-        (r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
-        (r'GROUP\s+BY\b', tokens.Keyword),
-        (r'ORDER\s+BY\b', tokens.Keyword),
-        (r'HANDLER\s+FOR\b', tokens.Keyword),
-        (r'(LATERAL\s+VIEW\s+)'
-         r'(EXPLODE|INLINE|PARSE_URL_TUPLE|POSEXPLODE|STACK)\b',
-         tokens.Keyword),
-        (r"(AT|WITH')\s+TIME\s+ZONE\s+'[^']+'", tokens.Keyword.TZCast),
-        (r'(NOT\s+)?(LIKE|ILIKE|RLIKE)\b', tokens.Operator.Comparison),
-        (r'(NOT\s+)?(REGEXP)\b', tokens.Operator.Comparison),
-        # Check for keywords, also returns tokens.Name if regex matches
-        # but the match isn't a keyword.
-        (r'[0-9_\w][_$#\w]*', PROCESS_AS_KEYWORD),
-        (r'[;:()\[\],\.]', tokens.Punctuation),
-        (r'[<>=~!]+', tokens.Operator.Comparison),
-        (r'[+/@#%^&|^-]+', tokens.Operator),
-    ]}
-
-FLAGS = re.IGNORECASE | re.UNICODE
-SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX['root']]
+SQL_REGEX = [
+    (r'(--|# )\+.*?(\r\n|\r|\n|$)', tokens.Comment.Single.Hint),
+    (r'/\*\+[\s\S]*?\*/', tokens.Comment.Multiline.Hint),
+
+    (r'(--|# ).*?(\r\n|\r|\n|$)', tokens.Comment.Single),
+    (r'/\*[\s\S]*?\*/', tokens.Comment.Multiline),
+
+    (r'(\r\n|\r|\n)', tokens.Newline),
+    (r'\s+?', tokens.Whitespace),
+
+    (r':=', tokens.Assignment),
+    (r'::', tokens.Punctuation),
+
+    (r'\*', tokens.Wildcard),
+
+    (r"`(``|[^`])*`", tokens.Name),
+    (r"´(´´|[^´])*´", tokens.Name),
+    (r'((?<!\S)\$(?:[_A-ZÀ-Ü]\w*)?\$)[\s\S]*?\1', tokens.Literal),
+
+    (r'\?', tokens.Name.Placeholder),
+    (r'%(\(\w+\))?s', tokens.Name.Placeholder),
+    (r'(?<!\w)[$:?]\w+', tokens.Name.Placeholder),
+
+    (r'\\\w+', tokens.Command),
+
+    # FIXME(andi): VALUES shouldn't be listed here
+    # see https://github.com/andialbrecht/sqlparse/pull/64
+    # AS and IN are special, it may be followed by a parenthesis, but
+    # are never functions, see issue183 and issue507
+    (r'(CASE|IN|VALUES|USING|FROM|AS)\b', tokens.Keyword),
+
+    (r'(@|##|#)[A-ZÀ-Ü]\w+', tokens.Name),
+
+    # see issue #39
+    # Spaces around period `schema . name` are valid identifier
+    # TODO: Spaces before period not implemented
+    (r'[A-ZÀ-Ü]\w*(?=\s*\.)', tokens.Name),  # 'Name'.
+    # FIXME(atronah): never match,
+    # because `re.match` doesn't work with look-behind regexp feature
+    (r'(?<=\.)[A-ZÀ-Ü]\w*', tokens.Name),  # .'Name'
+    (r'[A-ZÀ-Ü]\w*(?=\()', tokens.Name),  # side effect: change kw to func
+    (r'-?0x[\dA-F]+', tokens.Number.Hexadecimal),
+    (r'-?\d+(\.\d+)?E-?\d+', tokens.Number.Float),
+    (r'(?![_A-ZÀ-Ü])-?(\d+(\.\d*)|\.\d+)(?![_A-ZÀ-Ü])',
+     tokens.Number.Float),
+    (r'(?![_A-ZÀ-Ü])-?\d+(?![_A-ZÀ-Ü])', tokens.Number.Integer),
+    (r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
+    # not a real string literal in ANSI SQL:
+    (r'"(""|\\\\|\\"|[^"])*"', tokens.String.Symbol),
+    (r'(""|".*?[^\\]")', tokens.String.Symbol),
+    # sqlite names can be escaped with [square brackets]. left bracket
+    # cannot be preceded by word character or a right bracket --
+    # otherwise it's probably an array index
+    (r'(?<![\w\])])(\[[^\]\[]+\])', tokens.Name),
+    (r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?'
+     r'|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
+    (r'END(\s+IF|\s+LOOP|\s+WHILE)?\b', tokens.Keyword),
+    (r'NOT\s+NULL\b', tokens.Keyword),
+    (r'NULLS\s+(FIRST|LAST)\b', tokens.Keyword),
+    (r'UNION\s+ALL\b', tokens.Keyword),
+    (r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
+    (r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
+    (r'GROUP\s+BY\b', tokens.Keyword),
+    (r'ORDER\s+BY\b', tokens.Keyword),
+    (r'HANDLER\s+FOR\b', tokens.Keyword),
+    (r'(LATERAL\s+VIEW\s+)'
+     r'(EXPLODE|INLINE|PARSE_URL_TUPLE|POSEXPLODE|STACK)\b',
+     tokens.Keyword),
+    (r"(AT|WITH')\s+TIME\s+ZONE\s+'[^']+'", tokens.Keyword.TZCast),
+    (r'(NOT\s+)?(LIKE|ILIKE|RLIKE)\b', tokens.Operator.Comparison),
+    (r'(NOT\s+)?(REGEXP)\b', tokens.Operator.Comparison),
+    # Check for keywords, also returns tokens.Name if regex matches
+    # but the match isn't a keyword.
+    (r'[0-9_\w][_$#\w]*', PROCESS_AS_KEYWORD),
+    (r'[;:()\[\],\.]', tokens.Punctuation),
+    (r'[<>=~!]+', tokens.Operator.Comparison),
+    (r'[+/@#%^&|^-]+', tokens.Operator),
+]
 
 KEYWORDS = {
     'ABORT': tokens.Keyword,
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index aafb55f..50799df 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -6,7 +6,7 @@
 # the BSD License: https://opensource.org/licenses/BSD-3-Clause
 
 """SQL Lexer"""
-
+import re
 # This code is based on the SqlLexer in pygments.
 # http://pygments.org/
 # It's separated from the rest of pygments to increase performance
@@ -56,7 +56,8 @@ class Lexer(metaclass=_LexerSingletonMetaclass):
 
     def set_SQL_REGEX(self, SQL_REGEX):
         """Set the list of regex that will parse the SQL."""
-        self._SQL_REGEX = SQL_REGEX
+        FLAGS = re.IGNORECASE | re.UNICODE
+        self._SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX]
 
     def add_keywords(self, keywords):
         """Add keyword dictionaries. Keywords are looked up in the same order
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 3ac6500..017f93a 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -1,5 +1,4 @@
 """Tests sqlparse.parse()."""
-import re
 from io import StringIO
 
 import pytest
@@ -538,10 +537,7 @@ def test_configurable_regex():
     lex = Lexer()
     lex.clear()
 
-    my_regex = (
-        re.compile(r"ZORDER\s+BY\b", keywords.FLAGS).match,
-        sqlparse.tokens.Keyword,
-    )
+    my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword)
 
     lex.set_SQL_REGEX(keywords.SQL_REGEX[:38] + [my_regex] + keywords.SQL_REGEX[38:])
     lex.add_keywords(keywords.KEYWORDS_COMMON)
-- 
cgit v1.2.1