summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorptmcg <ptmcg@austin.rr.com>2023-04-18 21:27:16 -0500
committerptmcg <ptmcg@austin.rr.com>2023-04-18 21:27:16 -0500
commit063c9404a850174c566a5bc63f32f8c2c9fbe8dd (patch)
tree2a033d342354822291fdbb93eaa29db99e692256
parent59623c2cd2de437f0587a003ccab66a1d2adf8be (diff)
downloadpyparsing-git-063c9404a850174c566a5bc63f32f8c2c9fbe8dd.tar.gz
Some format cleanup in unicode_denormalizer.py; handle uppercase ligatures; add a few more comments and helpful variable names
-rw-r--r--examples/unicode_denormalizer.py79
1 files changed, 49 insertions, 30 deletions
diff --git a/examples/unicode_denormalizer.py b/examples/unicode_denormalizer.py
index 6eee875..fe2f180 100644
--- a/examples/unicode_denormalizer.py
+++ b/examples/unicode_denormalizer.py
@@ -26,49 +26,60 @@ ppu = pp.pyparsing_unicode
_· = "_·"
ident_chars = (
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- "abcdefghijklmnopqrstuvwxyz"
- "0123456789" + _·
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+ + "0123456789" + _·
)
# build map of each ASCII character to a string of
# all the characters in the Basic Multilingual Plane
# that NFKC normalizes back to that ASCII character
-ident_char_map = {}.fromkeys(ident_chars, "")
+ident_char_map = {c: [] for c in ident_chars}
for ch in ppu.BMP.identbodychars:
normal = unicodedata.normalize("NFKC", ch)
if normal in ident_char_map:
- ident_char_map[normal] += ch
+ ident_char_map[normal].append(ch)
# ligatures will also normalize back to ASCII
+# (doubled elements have higher chance of being chosen by random.choice)
ligature_map = {
- 'ffl': 'ffl ffl ffl ffl ffl',
- 'ffi': 'ffi ffi ffi ffi ffi',
- 'ff': 'ff ff',
- 'fi': 'fi fi',
- 'fl': 'fl fl',
-
- 'ij': 'ij ij',
- 'lj': 'lj lj',
- 'nj': 'nj nj',
- 'dz': 'dz dz',
- 'ii': 'ii ⅱ',
- 'iv': 'iv ⅳ',
- 'vi': 'vi ⅵ',
- 'ix': 'ix ⅸ',
- 'xi': 'xi ⅺ',
+ 'IJ': ('IJ', 'IJ', 'IJ'),
+ 'LJ': ('LJ', 'LJ', 'LJ'),
+ 'NJ': ('NJ', 'NJ', 'NJ'),
+ 'DZ': ('DZ', 'DZ', 'DZ'),
+ 'II': ('Ⅱ', 'Ⅱ', 'II'),
+ 'IV': ('Ⅳ', 'Ⅳ', 'IV'),
+ 'VI': ('Ⅵ', 'Ⅵ', 'VI'),
+ 'IX': ('Ⅸ', 'Ⅸ', 'IX'),
+ 'XI': ('Ⅺ', 'Ⅺ', 'XI'),
+ 'ffl': ('ffl', 'ffl', 'ffl', 'ffl', 'ffl'),
+ 'ffi': ('ffi', 'ffi', 'ffi', 'ffi', 'ffi'),
+ 'ff': ('ff', 'ff', 'ff'),
+ 'fi': ('fi', 'fi', 'fi'),
+ 'fl': ('fl', 'fl', 'fl'),
+ 'ij': ('ij', 'ij', 'ij'),
+ 'lj': ('lj', 'lj', 'lj'),
+ 'nj': ('nj', 'nj', 'nj'),
+ 'dz': ('dz', 'dz', 'dz'),
+ 'ii': ('ⅱ', 'ⅱ', 'ii'),
+ 'iv': ('ⅳ', 'ⅳ', 'iv'),
+ 'vi': ('ⅵ', 'ⅵ', 'vi'),
+ 'ix': ('ⅸ', 'ⅸ', 'ix'),
+ 'xi': ('ⅺ', 'ⅺ', 'xi'),
}
-ligature_transformer = pp.oneOf(ligature_map).add_parse_action(
- lambda t: random.choice(ligature_map[t[0]].split())
+
+ligature_transformer = pp.one_of(ligature_map).add_parse_action(
+ lambda t: random.choice(ligature_map[t[0]])
)
def make_mixed_font(t):
- t_0 = t[0]
+ # extract leading character and remainder to process separately
+ t_first, t_rest = t[0][0], t[0][1:]
+
# a leading '_' must be written using the ASCII character '_'
- ret = ['_' if t_0[0] == '_'
- else random.choice(ident_char_map.get(t_0[0], t_0[0]))]
- t_rest = ligature_transformer.transform_string(t_0[1:])
+ ret = ['_' if t_first == '_'
+ else random.choice(ident_char_map.get(t_first, t_first))]
+ t_rest = ligature_transformer.transform_string(t_rest)
ret.extend(random.choice(ident_char_map.get(c, c)) for c in t_rest)
return ''.join(ret)
@@ -87,10 +98,18 @@ python_quoted_string = pp.Opt(pp.Char("fF")("f_string_prefix")) + (
def mix_fstring_expressions(t):
if not t.f_string_prefix:
return
+
+ # define an expression and transformer to handle embedded
+ # f-string field expressions
fstring_arg = pp.QuotedString("{", end_quote_char="}")
- fstring_arg.add_parse_action(lambda tt: "{" + transformer.transform_string(tt[0]) + "}")
- ret = t.f_string_prefix + fstring_arg.transform_string(t.quoted_string_body)
- return ret
+ fstring_arg.add_parse_action(
+ lambda tt: "{" + transformer.transform_string(tt[0]) + "}"
+ )
+
+ return (
+ t.f_string_prefix
+ + fstring_arg.transform_string(t.quoted_string_body)
+ )
# add parse action to transform identifiers in f-strings
python_quoted_string.add_parse_action(mix_fstring_expressions)
@@ -129,7 +148,7 @@ def demo():
code = compile(transformed, "inline source", mode="exec")
exec(code)
- if 0:
+ if 1:
# pick some code from the stdlib
import unittest.util as lib_module
import inspect