summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorptmcg <ptmcg@austin.rr.com>2023-04-18 15:13:09 -0500
committerptmcg <ptmcg@austin.rr.com>2023-04-18 15:13:09 -0500
commit0b73a048c8987c88360b25d5da5d177e2f758a49 (patch)
tree267c8ff9ac53e78d2fe8f4e63c29227417d27e23
parentd0d6a81ff27c3448a51fcffbd397d7135203a93b (diff)
downloadpyparsing-git-0b73a048c8987c88360b25d5da5d177e2f758a49.tar.gz
Add some helpful comments to the unicode_denormalizer.py example script
-rw-r--r--examples/unicode_denormalizer.py28
1 files changed, 21 insertions, 7 deletions
diff --git a/examples/unicode_denormalizer.py b/examples/unicode_denormalizer.py
index 4bc3efa..5955c13 100644
--- a/examples/unicode_denormalizer.py
+++ b/examples/unicode_denormalizer.py
@@ -24,14 +24,23 @@ import unicodedata
import pyparsing as pp
ppu = pp.pyparsing_unicode
-ident_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_0123456789·"
-
+_· = "_·"
+ident_chars = (
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789" + _·
+)
+
+# build map of each ASCII character to a list of
+# all the characters in the Basic Multilingual Plane
+# that NFKC normalize back to that ASCII character
ident_char_map = {}.fromkeys(ident_chars, "")
-for ch in ppu.identbodychars:
+for ch in ppu.BMP.identbodychars:
normal = unicodedata.normalize("NFKC", ch)
if normal in ident_char_map:
ident_char_map[normal] += ch
+# ligatures will also normalize back to ASCII
ligature_map = {
'ffl': 'ffl ffl ffl ffl ffl',
'ffi': 'ffi ffi ffi ffi ffi',
@@ -60,13 +69,13 @@ def make_mixed_font(t):
return ''.join(ret)
+# define a pyparsing expression to match any identifier
identifier = pp.pyparsing_common.identifier
identifier.add_parse_action(make_mixed_font)
+# match quoted strings (which may be f-strings)
python_quoted_string = pp.Opt(pp.Char("fF")("f_string_prefix")) + (
- pp.quotedString
- | pp.QuotedString('"""', multiline=True, unquoteResults=False)
- | pp.QuotedString("'''", multiline=True, unquoteResults=False)
+ pp.python_quoted_string
)("quoted_string_body")
@@ -81,7 +90,12 @@ def mix_fstring_expressions(t):
python_quoted_string.add_parse_action(mix_fstring_expressions)
-any_keyword = pp.MatchFirst(map(pp.Keyword, list(keyword.kwlist) + getattr(keyword, "softkwlist", [])))
+# match keywords separately from identifiers - keywords must be kept in their
+# original ASCII
+any_keyword = pp.one_of(
+ keyword.kwlist + getattr(keyword, "softkwlist", []),
+ as_keyword=True
+)
# quoted strings and keywords will be parsed, but left untransformed
transformer = python_quoted_string | any_keyword | identifier