diff options
author | ptmcg <ptmcg@austin.rr.com> | 2023-01-13 05:24:37 -0600 |
---|---|---|
committer | ptmcg <ptmcg@austin.rr.com> | 2023-01-13 05:24:37 -0600 |
commit | cc94b5a6d608e7f25be15c4487cbab25f606e0d8 (patch) | |
tree | 224ad44ae49d3527383287bb5b900ed896842ac5 | |
parent | 318ec7e3b945068d36f5c98b1de81003c773c6c4 (diff) | |
download | pyparsing-git-cc94b5a6d608e7f25be15c4487cbab25f606e0d8.tar.gz |
Add pyparsing.unicode.identifier class property
-rw-r--r-- | CHANGES | 13 | ||||
-rw-r--r-- | pyparsing/__init__.py | 2 | ||||
-rw-r--r-- | pyparsing/unicode.py | 52 | ||||
-rw-r--r-- | tests/test_unit.py | 117 |
4 files changed, 110 insertions, 74 deletions
@@ -28,6 +28,19 @@ help from Devin J. Pohly in structuring the code to enable this peaceful transit Suggested by Antony Lee (issue #412), PR (#413) by Devin J. Pohly. +- Added new class property `identifier` to all Unicode set classes in `pyparsing.unicode`, + using the class's values for `cls.identchars` and `cls.identbodychars`. Now Unicode-aware + parsers that formerly wrote: + + ppu = pyparsing.unicode + ident = Word(ppu.Greek.identchars, ppu.Greek.identbodychars) + + can now write: + + ident = ppu.Greek.identifier + # or + # ident = ppu.Ελληνικά.identifier + - Reworked `delimited_list` function into the new `DelimitedList` class. `DelimitedList` has the same constructor interface as `delimited_list`, and in this release, `delimited_list` changes from a function to a synonym for diff --git a/pyparsing/__init__.py b/pyparsing/__init__.py index 18be13c..1557b60 100644 --- a/pyparsing/__init__.py +++ b/pyparsing/__init__.py @@ -121,7 +121,7 @@ class version_info(NamedTuple): __version_info__ = version_info(3, 0, 10, "final", 0) -__version_time__ = "22 Dec 2022 08:16 UTC" +__version_time__ = "13 Jan 2023 11:09 UTC" __version__ = __version_info__.__version__ __versionTime__ = __version_time__ __author__ = "Paul McGuire <ptmcg.gm+pyparsing@gmail.com>" diff --git a/pyparsing/unicode.py b/pyparsing/unicode.py index 9bc5e1d..b0a87b2 100644 --- a/pyparsing/unicode.py +++ b/pyparsing/unicode.py @@ -64,27 +64,27 @@ class unicode_set: @_lazyclassproperty def printables(cls): - "all non-whitespace characters in this range" + """all non-whitespace characters in this range""" return "".join(filterfalse(str.isspace, cls._chars_for_ranges)) @_lazyclassproperty def alphas(cls): - "all alphabetic characters in this range" + """all alphabetic characters in this range""" return "".join(filter(str.isalpha, cls._chars_for_ranges)) @_lazyclassproperty def nums(cls): - "all numeric digit characters in this range" + """all numeric digit characters in this range""" return "".join(filter(str.isdigit, cls._chars_for_ranges)) @_lazyclassproperty def alphanums(cls): - "all alphanumeric characters in this range" + """all alphanumeric characters in this range""" return cls.alphas + cls.nums @_lazyclassproperty def identchars(cls): - "all characters in this range that are valid identifier characters, plus underscore '_'" + """all characters in this range that are valid identifier characters, plus underscore '_'""" return "".join( sorted( set( @@ -114,6 +114,16 @@ class unicode_set: ) ) + @_lazyclassproperty + def identifier(cls): + """ + a pyparsing Word expression for an identifier using this range's definitions for + identchars and identbodychars + """ + from pyparsing import Word + + return Word(cls.identchars, cls.identbodychars) + class pyparsing_unicode(unicode_set): """ @@ -128,32 +138,32 @@ class pyparsing_unicode(unicode_set): ] class BasicMultilingualPlane(unicode_set): - "Unicode set for the Basic Multilingual Plane" + """Unicode set for the Basic Multilingual Plane""" _ranges: UnicodeRangeList = [ (0x0020, 0xFFFF), ] class Latin1(unicode_set): - "Unicode set for Latin-1 Unicode Character Range" + """Unicode set for Latin-1 Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x0020, 0x007E), (0x00A0, 0x00FF), ] class LatinA(unicode_set): - "Unicode set for Latin-A Unicode Character Range" + """Unicode set for Latin-A Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x0100, 0x017F), ] class LatinB(unicode_set): - "Unicode set for Latin-B Unicode Character Range" + """Unicode set for Latin-B Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x0180, 0x024F), ] class Greek(unicode_set): - "Unicode set for Greek Unicode Character Ranges" + """Unicode set for Greek Unicode Character Ranges""" _ranges: UnicodeRangeList = [ (0x0342, 0x0345), (0x0370, 0x0377), @@ -193,7 +203,7 @@ class pyparsing_unicode(unicode_set): ] class Cyrillic(unicode_set): - "Unicode set for Cyrillic Unicode Character Range" + """Unicode set for Cyrillic Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x0400, 0x052F), (0x1C80, 0x1C88), @@ -206,7 +216,7 @@ class pyparsing_unicode(unicode_set): ] class Chinese(unicode_set): - "Unicode set for Chinese Unicode Character Range" + """Unicode set for Chinese Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x2E80, 0x2E99), (0x2E9B, 0x2EF3), @@ -229,7 +239,7 @@ class pyparsing_unicode(unicode_set): ] class Japanese(unicode_set): - "Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges" + """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges""" class Kanji(unicode_set): "Unicode set for Kanji Unicode Character Range" @@ -239,7 +249,7 @@ class pyparsing_unicode(unicode_set): ] class Hiragana(unicode_set): - "Unicode set for Hiragana Unicode Character Range" + """Unicode set for Hiragana Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x3041, 0x3096), (0x3099, 0x30A0), @@ -251,7 +261,7 @@ class pyparsing_unicode(unicode_set): ] class Katakana(unicode_set): - "Unicode set for Katakana Unicode Character Range" + """Unicode set for Katakana Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x3099, 0x309C), (0x30A0, 0x30FF), @@ -275,7 +285,7 @@ class pyparsing_unicode(unicode_set): ) class Hangul(unicode_set): - "Unicode set for Hangul (Korean) Unicode Character Range" + """Unicode set for Hangul (Korean) Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x1100, 0x11FF), (0x302E, 0x302F), @@ -297,17 +307,17 @@ class pyparsing_unicode(unicode_set): Korean = Hangul class CJK(Chinese, Japanese, Hangul): - "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range" + """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range""" class Thai(unicode_set): - "Unicode set for Thai Unicode Character Range" + """Unicode set for Thai Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x0E01, 0x0E3A), (0x0E3F, 0x0E5B) ] class Arabic(unicode_set): - "Unicode set for Arabic Unicode Character Range" + """Unicode set for Arabic Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x0600, 0x061B), (0x061E, 0x06FF), @@ -315,7 +325,7 @@ class pyparsing_unicode(unicode_set): ] class Hebrew(unicode_set): - "Unicode set for Hebrew Unicode Character Range" + """Unicode set for Hebrew Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x0591, 0x05C7), (0x05D0, 0x05EA), @@ -329,7 +339,7 @@ class pyparsing_unicode(unicode_set): ] class Devanagari(unicode_set): - "Unicode set for Devanagari Unicode Character Range" + """Unicode set for Devanagari Unicode Character Range""" _ranges: UnicodeRangeList = [ (0x0900, 0x097F), (0xA8E0, 0xA8FF) diff --git a/tests/test_unit.py b/tests/test_unit.py index b50117b..34c2736 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -116,9 +116,7 @@ class TestCase(unittest.TestCase): yield if getattr(ar, "warning", None) is not None: - print( - f"Raised expected warning: {type(ar.warning).__name__}: {ar.warning}" - ) + print(f"Raised expected warning: {type(ar.warning).__name__}: {ar.warning}") else: print(f"Expected {expected_warning_type.__name__} warning not raised") return ar @@ -2931,7 +2929,10 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): with self.assertRaises( TypeError, msg="ParserElement * (str, str) should raise error" ): - expr = pp.Word(pp.alphas)("first") + pp.Word(pp.nums)("second") * ("2", "3") + expr = pp.Word(pp.alphas)("first") + pp.Word(pp.nums)("second") * ( + "2", + "3", + ) def testParserElementMulByZero(self): alpwd = pp.Word(pp.alphas) @@ -2956,12 +2957,16 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): # ParserElement * str with self.subTest(): - with self.assertRaises(TypeError, msg="ParserElement * str should raise error"): + with self.assertRaises( + TypeError, msg="ParserElement * str should raise error" + ): expr = pp.Word(pp.alphas)("first") + pp.Word(pp.nums)("second") * "3" # str * ParserElement with self.subTest(): - with self.assertRaises(TypeError, msg="str * ParserElement should raise error"): + with self.assertRaises( + TypeError, msg="str * ParserElement should raise error" + ): expr = pp.Word(pp.alphas)("first") + "3" * pp.Word(pp.nums)("second") # ParserElement * int @@ -8284,24 +8289,27 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): self.assertEqual("bool [, bool]...", str(bool_list2)) with self.subTest(): - street_address = pp.common.integer.set_name("integer") + pp.Word(pp.alphas)[1, ...].set_name("street_name") + street_address = pp.common.integer.set_name("integer") + pp.Word(pp.alphas)[ + 1, ... + ].set_name("street_name") self.assertEqual( "{integer street_name} [, {integer street_name}]...", - str(pp.delimitedList(street_address)) + str(pp.delimitedList(street_address)), ) with self.subTest(): operand = pp.Char(pp.alphas).set_name("var") - math = pp.infixNotation(operand, - [ - (pp.one_of("+ -"), 2, pp.opAssoc.LEFT), - ]) + math = pp.infixNotation( + operand, + [ + (pp.one_of("+ -"), 2, pp.opAssoc.LEFT), + ], + ) self.assertEqual( "Forward: + | - term [, Forward: + | - term]...", - str(pp.delimitedList(math)) + str(pp.delimitedList(math)), ) - def testDelimitedListOfStrLiterals(self): expr = pp.delimitedList("ABC") print(str(expr)) @@ -8333,9 +8341,9 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): def testDelimitedListParseActions1(self): # from issue #408 - keyword = pp.Keyword('foobar') + keyword = pp.Keyword("foobar") untyped_identifier = ~keyword + pp.Word(pp.alphas) - dotted_vars = pp.delimited_list(untyped_identifier, delim='.') + dotted_vars = pp.delimited_list(untyped_identifier, delim=".") lvalue = pp.Opt(dotted_vars) # uncomment this line to see the problem @@ -8344,27 +8352,29 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): # stmt = pp.Opt(dotted_vars) def parse_identifier(toks): - print('YAY!', toks) + print("YAY!", toks) untyped_identifier.set_parse_action(parse_identifier) save_stdout = StringIO() with contextlib.redirect_stdout(save_stdout): - dotted_vars.parse_string('B.C') + dotted_vars.parse_string("B.C") self.assertEqual( - dedent("""\ + dedent( + """\ YAY! ['B'] YAY! ['C'] - """), - save_stdout.getvalue() + """ + ), + save_stdout.getvalue(), ) def testDelimitedListParseActions2(self): # from issue #408 - keyword = pp.Keyword('foobar') + keyword = pp.Keyword("foobar") untyped_identifier = ~keyword + pp.Word(pp.alphas) - dotted_vars = pp.delimited_list(untyped_identifier, delim='.') + dotted_vars = pp.delimited_list(untyped_identifier, delim=".") lvalue = pp.Opt(dotted_vars) # uncomment this line to see the problem @@ -8373,27 +8383,29 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): # stmt = pp.Opt(dotted_vars) def parse_identifier(toks): - print('YAY!', toks) + print("YAY!", toks) untyped_identifier.set_parse_action(parse_identifier) save_stdout = StringIO() with contextlib.redirect_stdout(save_stdout): - dotted_vars.parse_string('B.C') + dotted_vars.parse_string("B.C") self.assertEqual( - dedent("""\ + dedent( + """\ YAY! ['B'] YAY! ['C'] - """), - save_stdout.getvalue() + """ + ), + save_stdout.getvalue(), ) def testDelimitedListParseActions3(self): # from issue #408 - keyword = pp.Keyword('foobar') + keyword = pp.Keyword("foobar") untyped_identifier = ~keyword + pp.Word(pp.alphas) - dotted_vars = pp.delimited_list(untyped_identifier, delim='.') + dotted_vars = pp.delimited_list(untyped_identifier, delim=".") lvalue = pp.Opt(dotted_vars) # uncomment this line to see the problem @@ -8402,20 +8414,22 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): stmt = pp.Opt(dotted_vars) def parse_identifier(toks): - print('YAY!', toks) + print("YAY!", toks) untyped_identifier.set_parse_action(parse_identifier) save_stdout = StringIO() with contextlib.redirect_stdout(save_stdout): - dotted_vars.parse_string('B.C') + dotted_vars.parse_string("B.C") self.assertEqual( - dedent("""\ + dedent( + """\ YAY! ['B'] YAY! ['C'] - """), - save_stdout.getvalue() + """ + ), + save_stdout.getvalue(), ) def testEnableDebugOnNamedExpressions(self): @@ -8667,11 +8681,14 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): def testSetDebugRecursivelyWithForward(self): expr = pp.Word(pp.alphas).set_name("innermost") - contained = pp.infix_notation(expr, [ - ('NOT', 1, pp.opAssoc.RIGHT), - ('AND', 2, pp.opAssoc.LEFT), - ('OR', 2, pp.opAssoc.LEFT), - ]) + contained = pp.infix_notation( + expr, + [ + ("NOT", 1, pp.opAssoc.RIGHT), + ("AND", 2, pp.opAssoc.LEFT), + ("OR", 2, pp.opAssoc.LEFT), + ], + ) contained.set_debug(recurse=True) self.assertTrue(expr.debug) @@ -8925,17 +8942,11 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): ppu = pp.pyparsing_unicode latin_identifier = pp.Word(pp.identchars, pp.identbodychars)("latin*") - japanese_identifier = pp.Word( - ppu.Japanese.identchars, ppu.Japanese.identbodychars - )("japanese*") - cjk_identifier = pp.Word(ppu.CJK.identchars, ppu.CJK.identbodychars)("cjk*") - greek_identifier = pp.Word(ppu.Greek.identchars, ppu.Greek.identbodychars)( - "greek*" - ) - cyrillic_identifier = pp.Word( - ppu.Cyrillic.identchars, ppu.Cyrillic.identbodychars - )("cyrillic*") - thai_identifier = pp.Word(ppu.Thai.identchars, ppu.Thai.identbodychars)("thai*") + japanese_identifier = ppu.Japanese.identifier("japanese*") + cjk_identifier = ppu.CJK.identifier("cjk*") + greek_identifier = ppu.Greek.identifier("greek*") + cyrillic_identifier = ppu.Cyrillic.identifier("cyrillic*") + thai_identifier = ppu.Thai.identifier("thai*") idents = ( latin_identifier | japanese_identifier @@ -9113,7 +9124,9 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): def testValidation(grmr, gnam, isValid): try: grmr.streamline() - with self.assertWarns(DeprecationWarning, msg="failed to warn validate() is deprecated"): + with self.assertWarns( + DeprecationWarning, msg="failed to warn validate() is deprecated" + ): grmr.validate() self.assertTrue(isValid, "validate() accepted invalid grammar " + gnam) except pp.RecursiveGrammarException as rge: |