From e0d3928ba69d73ba874ca03ec4395e94cf1ab293 Mon Sep 17 00:00:00 2001 From: Simon Heisterkamp Date: Thu, 1 Dec 2022 10:35:42 +0000 Subject: lexer documentation --- docs/source/extending.rst | 66 +++++++++++++++++++ docs/source/index.rst | 1 + sqlparse/keywords.py | 164 ++++++++++++++++++++++------------------------ sqlparse/lexer.py | 5 +- tests/test_parse.py | 6 +- 5 files changed, 151 insertions(+), 91 deletions(-) create mode 100644 docs/source/extending.rst diff --git a/docs/source/extending.rst b/docs/source/extending.rst new file mode 100644 index 0000000..f1bd551 --- /dev/null +++ b/docs/source/extending.rst @@ -0,0 +1,66 @@ +Extending :mod:`sqlparse` +========================= + +.. module:: sqlparse + :synopsis: Extending parsing capability of sqlparse. + +The :mod:`sqlparse` module uses a SQL grammar that was tuned through usage and numerous +PRs to fit a broad range of SQL syntaxes, but it cannot cater to every given case since +some SQL dialects have adopted conflicting meanings of certain keywords. Sqlparse +therefore exposes a mechanism to configure the fundamental keywords and regular +expressions that parse the language as described below. + +If you find an adaptation that works for your specific use-case, please consider +contributing it back to the community by opening a PR on +`GitHub <https://github.com/andialbrecht/sqlparse>`_. + +Configuring the Lexer +--------------------- + +The lexer is a singleton class that breaks down the stream of characters into language +tokens. It does this by using a sequence of regular expressions and keywords that are +listed in the file ``sqlparse.keywords``. Instead of applying these fixed grammar +definitions directly, the lexer is default initialized in its method called +``default_initialization()``. As an API user, you can adapt the Lexer configuration by +applying your own configuration logic. 
To do so, start out by clearing previous +configurations with ``.clear()``, then apply the SQL list with +``.set_SQL_REGEX(SQL_REGEX)``, and apply keyword lists with ``.add_keywords(KEYWORDS)``. + +You can do so by re-using the expressions in ``sqlparse.keywords`` (see example below), +leaving parts out, or by making up your own master list. + +See the expected types of the arguments by inspecting their structure in +``sqlparse.keywords``. +(For compatibility with Python 3.4, this library does not use type-hints.) + +The following example adds support for the expression ``ZORDER BY``, and adds ``BAR`` as +a keyword to the lexer: + +.. code-block:: python + + import re + + import sqlparse + from sqlparse import keywords + from sqlparse.lexer import Lexer + + lex = Lexer() + lex.clear() + + my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword) + + # slice the default SQL_REGEX to inject the custom object + lex.set_SQL_REGEX( + keywords.SQL_REGEX[:38] + + [my_regex] + + keywords.SQL_REGEX[38:] + ) + lex.add_keywords(keywords.KEYWORDS_COMMON) + lex.add_keywords(keywords.KEYWORDS_ORACLE) + lex.add_keywords(keywords.KEYWORDS_PLPGSQL) + lex.add_keywords(keywords.KEYWORDS_HQL) + lex.add_keywords(keywords.KEYWORDS_MSACCESS) + lex.add_keywords(keywords.KEYWORDS) + lex.add_keywords({'BAR': sqlparse.tokens.Keyword}) + + sqlparse.parse("select * from foo zorder by bar;") diff --git a/docs/source/index.rst b/docs/source/index.rst index cba3314..e18d2b3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,6 +20,7 @@ Contents api analyzing ui + extending changes license indices diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py index 6bc7937..f04f928 100644 --- a/sqlparse/keywords.py +++ b/sqlparse/keywords.py @@ -5,96 +5,92 @@ # This module is part of python-sqlparse and is released under # the BSD License: https://opensource.org/licenses/BSD-3-Clause -import re - from sqlparse import tokens # object() only supports "is" and is useful as a marker +# use
this marker to specify that the given regex in SQL_REGEX +# shall be processed further through a lookup in the KEYWORDS dictionaries PROCESS_AS_KEYWORD = object() -SQL_REGEX = { - 'root': [ - (r'(--|# )\+.*?(\r\n|\r|\n|$)', tokens.Comment.Single.Hint), - (r'/\*\+[\s\S]*?\*/', tokens.Comment.Multiline.Hint), - - (r'(--|# ).*?(\r\n|\r|\n|$)', tokens.Comment.Single), - (r'/\*[\s\S]*?\*/', tokens.Comment.Multiline), - - (r'(\r\n|\r|\n)', tokens.Newline), - (r'\s+?', tokens.Whitespace), - - (r':=', tokens.Assignment), - (r'::', tokens.Punctuation), - - (r'\*', tokens.Wildcard), - - (r"`(``|[^`])*`", tokens.Name), - (r"´(´´|[^´])*´", tokens.Name), - (r'((?=~!]+', tokens.Operator.Comparison), - (r'[+/@#%^&|^-]+', tokens.Operator), - ]} - -FLAGS = re.IGNORECASE | re.UNICODE -SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX['root']] +SQL_REGEX = [ + (r'(--|# )\+.*?(\r\n|\r|\n|$)', tokens.Comment.Single.Hint), + (r'/\*\+[\s\S]*?\*/', tokens.Comment.Multiline.Hint), + + (r'(--|# ).*?(\r\n|\r|\n|$)', tokens.Comment.Single), + (r'/\*[\s\S]*?\*/', tokens.Comment.Multiline), + + (r'(\r\n|\r|\n)', tokens.Newline), + (r'\s+?', tokens.Whitespace), + + (r':=', tokens.Assignment), + (r'::', tokens.Punctuation), + + (r'\*', tokens.Wildcard), + + (r"`(``|[^`])*`", tokens.Name), + (r"´(´´|[^´])*´", tokens.Name), + (r'((?=~!]+', tokens.Operator.Comparison), + (r'[+/@#%^&|^-]+', tokens.Operator), +] KEYWORDS = { 'ABORT': tokens.Keyword, diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py index aafb55f..50799df 100644 --- a/sqlparse/lexer.py +++ b/sqlparse/lexer.py @@ -6,7 +6,7 @@ # the BSD License: https://opensource.org/licenses/BSD-3-Clause """SQL Lexer""" - +import re # This code is based on the SqlLexer in pygments. 
# http://pygments.org/ # It's separated from the rest of pygments to increase performance @@ -56,7 +56,8 @@ class Lexer(metaclass=_LexerSingletonMetaclass): def set_SQL_REGEX(self, SQL_REGEX): """Set the list of regex that will parse the SQL.""" - self._SQL_REGEX = SQL_REGEX + FLAGS = re.IGNORECASE | re.UNICODE + self._SQL_REGEX = [(re.compile(rx, FLAGS).match, tt) for rx, tt in SQL_REGEX] def add_keywords(self, keywords): """Add keyword dictionaries. Keywords are looked up in the same order diff --git a/tests/test_parse.py b/tests/test_parse.py index 3ac6500..017f93a 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -1,5 +1,4 @@ """Tests sqlparse.parse().""" -import re from io import StringIO import pytest @@ -538,10 +537,7 @@ def test_configurable_regex(): lex = Lexer() lex.clear() - my_regex = ( - re.compile(r"ZORDER\s+BY\b", keywords.FLAGS).match, - sqlparse.tokens.Keyword, - ) + my_regex = (r"ZORDER\s+BY\b", sqlparse.tokens.Keyword) lex.set_SQL_REGEX(keywords.SQL_REGEX[:38] + [my_regex] + keywords.SQL_REGEX[38:]) lex.add_keywords(keywords.KEYWORDS_COMMON) -- cgit v1.2.1