summary refs log tree commit diff
diff options
context:
space:
mode:
author ptmcg <ptmcg@austin.rr.com> 2023-01-13 05:24:37 -0600
committer ptmcg <ptmcg@austin.rr.com> 2023-01-13 05:24:37 -0600
commit cc94b5a6d608e7f25be15c4487cbab25f606e0d8 (patch)
tree 224ad44ae49d3527383287bb5b900ed896842ac5
parent 318ec7e3b945068d36f5c98b1de81003c773c6c4 (diff)
download pyparsing-git-cc94b5a6d608e7f25be15c4487cbab25f606e0d8.tar.gz
Add pyparsing.unicode.identifier class property
-rw-r--r-- CHANGES 13
-rw-r--r-- pyparsing/__init__.py 2
-rw-r--r-- pyparsing/unicode.py 52
-rw-r--r-- tests/test_unit.py 117
4 files changed, 110 insertions, 74 deletions
diff --git a/CHANGES b/CHANGES
index 36a09b8..18edee1 100644
--- a/CHANGES
+++ b/CHANGES
@@ -28,6 +28,19 @@ help from Devin J. Pohly in structuring the code to enable this peaceful transit
Suggested by Antony Lee (issue #412), PR (#413) by Devin J. Pohly.
+- Added new class property `identifier` to all Unicode set classes in `pyparsing.unicode`,
+ using the class's values for `cls.identchars` and `cls.identbodychars`. Unicode-aware
+ parsers that formerly wrote:
+
+ ppu = pyparsing.unicode
+ ident = Word(ppu.Greek.identchars, ppu.Greek.identbodychars)
+
+ can now write:
+
+ ident = ppu.Greek.identifier
+ # or
+ # ident = ppu.Ελληνικά.identifier
+
- Reworked `delimited_list` function into the new `DelimitedList` class.
`DelimitedList` has the same constructor interface as `delimited_list`, and
in this release, `delimited_list` changes from a function to a synonym for
diff --git a/pyparsing/__init__.py b/pyparsing/__init__.py
index 18be13c..1557b60 100644
--- a/pyparsing/__init__.py
+++ b/pyparsing/__init__.py
@@ -121,7 +121,7 @@ class version_info(NamedTuple):
__version_info__ = version_info(3, 0, 10, "final", 0)
-__version_time__ = "22 Dec 2022 08:16 UTC"
+__version_time__ = "13 Jan 2023 11:09 UTC"
__version__ = __version_info__.__version__
__versionTime__ = __version_time__
__author__ = "Paul McGuire <ptmcg.gm+pyparsing@gmail.com>"
diff --git a/pyparsing/unicode.py b/pyparsing/unicode.py
index 9bc5e1d..b0a87b2 100644
--- a/pyparsing/unicode.py
+++ b/pyparsing/unicode.py
@@ -64,27 +64,27 @@ class unicode_set:
@_lazyclassproperty
def printables(cls):
- "all non-whitespace characters in this range"
+ """all non-whitespace characters in this range"""
return "".join(filterfalse(str.isspace, cls._chars_for_ranges))
@_lazyclassproperty
def alphas(cls):
- "all alphabetic characters in this range"
+ """all alphabetic characters in this range"""
return "".join(filter(str.isalpha, cls._chars_for_ranges))
@_lazyclassproperty
def nums(cls):
- "all numeric digit characters in this range"
+ """all numeric digit characters in this range"""
return "".join(filter(str.isdigit, cls._chars_for_ranges))
@_lazyclassproperty
def alphanums(cls):
- "all alphanumeric characters in this range"
+ """all alphanumeric characters in this range"""
return cls.alphas + cls.nums
@_lazyclassproperty
def identchars(cls):
- "all characters in this range that are valid identifier characters, plus underscore '_'"
+ """all characters in this range that are valid identifier characters, plus underscore '_'"""
return "".join(
sorted(
set(
@@ -114,6 +114,16 @@ class unicode_set:
)
)
+ @_lazyclassproperty
+ def identifier(cls):
+ """
+ a pyparsing Word expression for an identifier using this range's definitions for
+ identchars and identbodychars
+ """
+ from pyparsing import Word
+
+ return Word(cls.identchars, cls.identbodychars)
+
class pyparsing_unicode(unicode_set):
"""
@@ -128,32 +138,32 @@ class pyparsing_unicode(unicode_set):
]
class BasicMultilingualPlane(unicode_set):
- "Unicode set for the Basic Multilingual Plane"
+ """Unicode set for the Basic Multilingual Plane"""
_ranges: UnicodeRangeList = [
(0x0020, 0xFFFF),
]
class Latin1(unicode_set):
- "Unicode set for Latin-1 Unicode Character Range"
+ """Unicode set for Latin-1 Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x0020, 0x007E),
(0x00A0, 0x00FF),
]
class LatinA(unicode_set):
- "Unicode set for Latin-A Unicode Character Range"
+ """Unicode set for Latin-A Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x0100, 0x017F),
]
class LatinB(unicode_set):
- "Unicode set for Latin-B Unicode Character Range"
+ """Unicode set for Latin-B Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x0180, 0x024F),
]
class Greek(unicode_set):
- "Unicode set for Greek Unicode Character Ranges"
+ """Unicode set for Greek Unicode Character Ranges"""
_ranges: UnicodeRangeList = [
(0x0342, 0x0345),
(0x0370, 0x0377),
@@ -193,7 +203,7 @@ class pyparsing_unicode(unicode_set):
]
class Cyrillic(unicode_set):
- "Unicode set for Cyrillic Unicode Character Range"
+ """Unicode set for Cyrillic Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x0400, 0x052F),
(0x1C80, 0x1C88),
@@ -206,7 +216,7 @@ class pyparsing_unicode(unicode_set):
]
class Chinese(unicode_set):
- "Unicode set for Chinese Unicode Character Range"
+ """Unicode set for Chinese Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x2E80, 0x2E99),
(0x2E9B, 0x2EF3),
@@ -229,7 +239,7 @@ class pyparsing_unicode(unicode_set):
]
class Japanese(unicode_set):
- "Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"
+ """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""
class Kanji(unicode_set):
"Unicode set for Kanji Unicode Character Range"
@@ -239,7 +249,7 @@ class pyparsing_unicode(unicode_set):
]
class Hiragana(unicode_set):
- "Unicode set for Hiragana Unicode Character Range"
+ """Unicode set for Hiragana Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x3041, 0x3096),
(0x3099, 0x30A0),
@@ -251,7 +261,7 @@ class pyparsing_unicode(unicode_set):
]
class Katakana(unicode_set):
- "Unicode set for Katakana Unicode Character Range"
+ """Unicode set for Katakana Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x3099, 0x309C),
(0x30A0, 0x30FF),
@@ -275,7 +285,7 @@ class pyparsing_unicode(unicode_set):
)
class Hangul(unicode_set):
- "Unicode set for Hangul (Korean) Unicode Character Range"
+ """Unicode set for Hangul (Korean) Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x1100, 0x11FF),
(0x302E, 0x302F),
@@ -297,17 +307,17 @@ class pyparsing_unicode(unicode_set):
Korean = Hangul
class CJK(Chinese, Japanese, Hangul):
- "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"
+ """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""
class Thai(unicode_set):
- "Unicode set for Thai Unicode Character Range"
+ """Unicode set for Thai Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x0E01, 0x0E3A),
(0x0E3F, 0x0E5B)
]
class Arabic(unicode_set):
- "Unicode set for Arabic Unicode Character Range"
+ """Unicode set for Arabic Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x0600, 0x061B),
(0x061E, 0x06FF),
@@ -315,7 +325,7 @@ class pyparsing_unicode(unicode_set):
]
class Hebrew(unicode_set):
- "Unicode set for Hebrew Unicode Character Range"
+ """Unicode set for Hebrew Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x0591, 0x05C7),
(0x05D0, 0x05EA),
@@ -329,7 +339,7 @@ class pyparsing_unicode(unicode_set):
]
class Devanagari(unicode_set):
- "Unicode set for Devanagari Unicode Character Range"
+ """Unicode set for Devanagari Unicode Character Range"""
_ranges: UnicodeRangeList = [
(0x0900, 0x097F),
(0xA8E0, 0xA8FF)
diff --git a/tests/test_unit.py b/tests/test_unit.py
index b50117b..34c2736 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -116,9 +116,7 @@ class TestCase(unittest.TestCase):
yield
if getattr(ar, "warning", None) is not None:
- print(
- f"Raised expected warning: {type(ar.warning).__name__}: {ar.warning}"
- )
+ print(f"Raised expected warning: {type(ar.warning).__name__}: {ar.warning}")
else:
print(f"Expected {expected_warning_type.__name__} warning not raised")
return ar
@@ -2931,7 +2929,10 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
with self.assertRaises(
TypeError, msg="ParserElement * (str, str) should raise error"
):
- expr = pp.Word(pp.alphas)("first") + pp.Word(pp.nums)("second") * ("2", "3")
+ expr = pp.Word(pp.alphas)("first") + pp.Word(pp.nums)("second") * (
+ "2",
+ "3",
+ )
def testParserElementMulByZero(self):
alpwd = pp.Word(pp.alphas)
@@ -2956,12 +2957,16 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
# ParserElement * str
with self.subTest():
- with self.assertRaises(TypeError, msg="ParserElement * str should raise error"):
+ with self.assertRaises(
+ TypeError, msg="ParserElement * str should raise error"
+ ):
expr = pp.Word(pp.alphas)("first") + pp.Word(pp.nums)("second") * "3"
# str * ParserElement
with self.subTest():
- with self.assertRaises(TypeError, msg="str * ParserElement should raise error"):
+ with self.assertRaises(
+ TypeError, msg="str * ParserElement should raise error"
+ ):
expr = pp.Word(pp.alphas)("first") + "3" * pp.Word(pp.nums)("second")
# ParserElement * int
@@ -8284,24 +8289,27 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
self.assertEqual("bool [, bool]...", str(bool_list2))
with self.subTest():
- street_address = pp.common.integer.set_name("integer") + pp.Word(pp.alphas)[1, ...].set_name("street_name")
+ street_address = pp.common.integer.set_name("integer") + pp.Word(pp.alphas)[
+ 1, ...
+ ].set_name("street_name")
self.assertEqual(
"{integer street_name} [, {integer street_name}]...",
- str(pp.delimitedList(street_address))
+ str(pp.delimitedList(street_address)),
)
with self.subTest():
operand = pp.Char(pp.alphas).set_name("var")
- math = pp.infixNotation(operand,
- [
- (pp.one_of("+ -"), 2, pp.opAssoc.LEFT),
- ])
+ math = pp.infixNotation(
+ operand,
+ [
+ (pp.one_of("+ -"), 2, pp.opAssoc.LEFT),
+ ],
+ )
self.assertEqual(
"Forward: + | - term [, Forward: + | - term]...",
- str(pp.delimitedList(math))
+ str(pp.delimitedList(math)),
)
-
def testDelimitedListOfStrLiterals(self):
expr = pp.delimitedList("ABC")
print(str(expr))
@@ -8333,9 +8341,9 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
def testDelimitedListParseActions1(self):
# from issue #408
- keyword = pp.Keyword('foobar')
+ keyword = pp.Keyword("foobar")
untyped_identifier = ~keyword + pp.Word(pp.alphas)
- dotted_vars = pp.delimited_list(untyped_identifier, delim='.')
+ dotted_vars = pp.delimited_list(untyped_identifier, delim=".")
lvalue = pp.Opt(dotted_vars)
# uncomment this line to see the problem
@@ -8344,27 +8352,29 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
# stmt = pp.Opt(dotted_vars)
def parse_identifier(toks):
- print('YAY!', toks)
+ print("YAY!", toks)
untyped_identifier.set_parse_action(parse_identifier)
save_stdout = StringIO()
with contextlib.redirect_stdout(save_stdout):
- dotted_vars.parse_string('B.C')
+ dotted_vars.parse_string("B.C")
self.assertEqual(
- dedent("""\
+ dedent(
+ """\
YAY! ['B']
YAY! ['C']
- """),
- save_stdout.getvalue()
+ """
+ ),
+ save_stdout.getvalue(),
)
def testDelimitedListParseActions2(self):
# from issue #408
- keyword = pp.Keyword('foobar')
+ keyword = pp.Keyword("foobar")
untyped_identifier = ~keyword + pp.Word(pp.alphas)
- dotted_vars = pp.delimited_list(untyped_identifier, delim='.')
+ dotted_vars = pp.delimited_list(untyped_identifier, delim=".")
lvalue = pp.Opt(dotted_vars)
# uncomment this line to see the problem
@@ -8373,27 +8383,29 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
# stmt = pp.Opt(dotted_vars)
def parse_identifier(toks):
- print('YAY!', toks)
+ print("YAY!", toks)
untyped_identifier.set_parse_action(parse_identifier)
save_stdout = StringIO()
with contextlib.redirect_stdout(save_stdout):
- dotted_vars.parse_string('B.C')
+ dotted_vars.parse_string("B.C")
self.assertEqual(
- dedent("""\
+ dedent(
+ """\
YAY! ['B']
YAY! ['C']
- """),
- save_stdout.getvalue()
+ """
+ ),
+ save_stdout.getvalue(),
)
def testDelimitedListParseActions3(self):
# from issue #408
- keyword = pp.Keyword('foobar')
+ keyword = pp.Keyword("foobar")
untyped_identifier = ~keyword + pp.Word(pp.alphas)
- dotted_vars = pp.delimited_list(untyped_identifier, delim='.')
+ dotted_vars = pp.delimited_list(untyped_identifier, delim=".")
lvalue = pp.Opt(dotted_vars)
# uncomment this line to see the problem
@@ -8402,20 +8414,22 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
stmt = pp.Opt(dotted_vars)
def parse_identifier(toks):
- print('YAY!', toks)
+ print("YAY!", toks)
untyped_identifier.set_parse_action(parse_identifier)
save_stdout = StringIO()
with contextlib.redirect_stdout(save_stdout):
- dotted_vars.parse_string('B.C')
+ dotted_vars.parse_string("B.C")
self.assertEqual(
- dedent("""\
+ dedent(
+ """\
YAY! ['B']
YAY! ['C']
- """),
- save_stdout.getvalue()
+ """
+ ),
+ save_stdout.getvalue(),
)
def testEnableDebugOnNamedExpressions(self):
@@ -8667,11 +8681,14 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
def testSetDebugRecursivelyWithForward(self):
expr = pp.Word(pp.alphas).set_name("innermost")
- contained = pp.infix_notation(expr, [
- ('NOT', 1, pp.opAssoc.RIGHT),
- ('AND', 2, pp.opAssoc.LEFT),
- ('OR', 2, pp.opAssoc.LEFT),
- ])
+ contained = pp.infix_notation(
+ expr,
+ [
+ ("NOT", 1, pp.opAssoc.RIGHT),
+ ("AND", 2, pp.opAssoc.LEFT),
+ ("OR", 2, pp.opAssoc.LEFT),
+ ],
+ )
contained.set_debug(recurse=True)
self.assertTrue(expr.debug)
@@ -8925,17 +8942,11 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
ppu = pp.pyparsing_unicode
latin_identifier = pp.Word(pp.identchars, pp.identbodychars)("latin*")
- japanese_identifier = pp.Word(
- ppu.Japanese.identchars, ppu.Japanese.identbodychars
- )("japanese*")
- cjk_identifier = pp.Word(ppu.CJK.identchars, ppu.CJK.identbodychars)("cjk*")
- greek_identifier = pp.Word(ppu.Greek.identchars, ppu.Greek.identbodychars)(
- "greek*"
- )
- cyrillic_identifier = pp.Word(
- ppu.Cyrillic.identchars, ppu.Cyrillic.identbodychars
- )("cyrillic*")
- thai_identifier = pp.Word(ppu.Thai.identchars, ppu.Thai.identbodychars)("thai*")
+ japanese_identifier = ppu.Japanese.identifier("japanese*")
+ cjk_identifier = ppu.CJK.identifier("cjk*")
+ greek_identifier = ppu.Greek.identifier("greek*")
+ cyrillic_identifier = ppu.Cyrillic.identifier("cyrillic*")
+ thai_identifier = ppu.Thai.identifier("thai*")
idents = (
latin_identifier
| japanese_identifier
@@ -9113,7 +9124,9 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
def testValidation(grmr, gnam, isValid):
try:
grmr.streamline()
- with self.assertWarns(DeprecationWarning, msg="failed to warn validate() is deprecated"):
+ with self.assertWarns(
+ DeprecationWarning, msg="failed to warn validate() is deprecated"
+ ):
grmr.validate()
self.assertTrue(isValid, "validate() accepted invalid grammar " + gnam)
except pp.RecursiveGrammarException as rge: