Clean up the Lexicon.py generation script and use f-strings to prevent accidentally running it with older Python versions.

author: Stefan Behnel <stefan_ml@behnel.de> 2019-08-24 10:39:09 +0200
committer: Stefan Behnel <stefan_ml@behnel.de> 2019-08-24 11:32:01 +0200
commit: c9dfe708d3e7abad331c16bc866d25896d2735dd (patch)
tree: 8c5b395fbf2ee5d8a7c6431834670e9e2787629d
parent: 270bf960a25374d3eb19a6040c395fd4bf4d6a25 (diff)
download: cython-c9dfe708d3e7abad331c16bc866d25896d2735dd.tar.gz
2 files changed, 54 insertions, 70 deletions
diff --git a/Cython/Compiler/Lexicon.py b/Cython/Compiler/Lexicon.py
index b54dbee5e..30372b188 100644
--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -153,14 +153,16 @@ def make_lexicon():
         #debug_file = scanner_dump_file
         )
 
+
 # BEGIN GENERATED CODE
 # generated with:
- # cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC]
+# cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC]
 
 unicode_start_ch_any = (
     u"_ªµºˬˮͿΆΌՙەۿܐޱߺࠚࠤࠨऽॐলঽৎৼਫ਼ઽૐૹଽୱஃஜௐఽಀಽೞഽൎලาຄຊຍລວາຽໆༀဿၡႎჇჍቘዀៗៜᢪ"
     u"ᪧὙὛὝιⁱⁿℂℇℕℤΩℨⅎⴧⴭⵯꣻꧏꩺꪱꫀꫂיִמּﹱﹳﹷﹹﹻﹽ𐠈𐠼𐨀𐼧𑅄𑅶𑇚𑇜𑊈𑌽𑍐𑓇𑙄𑣿𑨀𑨺𑩐𑪝𑱀𑵆𑶘𖽐𝒢𝒻𝕆𞸤𞸧𞸹𞸻"
-    u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾")
+    u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾"
+)
 unicode_start_ch_range = (
     u"AZazÀÖØöøˁˆˑˠˤͰʹͶͷͻͽΈΊΎΡΣϵϷҁҊԯԱՖՠֈאתׯײؠيٮٯٱۓۥۦۮۯۺۼܒܯݍޥߊߪߴߵࠀࠕ"
     u"ࡀࡘࡠࡪࢠࢴࢶࢽऄहक़ॡॱঀঅঌএঐওনপরশহড়ঢ়য়ৡৰৱਅਊਏਐਓਨਪਰਲਲ਼ਵਸ਼ਸਹਖ਼ੜੲੴઅઍએઑઓનપરલળવહ"
@@ -177,11 +179,13 @@ unicode_start_ch_range = (
     u"𑈀𑈑𑈓𑈫𑊀𑊆𑊊𑊍𑊏𑊝𑊟𑊨𑊰𑋞𑌅𑌌𑌏𑌐𑌓𑌨𑌪𑌰𑌲𑌳𑌵𑌹𑍝𑍡𑐀𑐴𑑇𑑊𑒀𑒯𑓄𑓅𑖀𑖮𑗘𑗛𑘀𑘯𑚀𑚪𑜀𑜚𑠀𑠫𑢠𑣟𑨋𑨲𑩜𑪃𑪆𑪉𑫀𑫸𑰀𑰈"
     u"𑰊𑰮𑱲𑲏𑴀𑴆𑴈𑴉𑴋𑴰𑵠𑵥𑵧𑵨𑵪𑶉𑻠𑻲𒀀𒎙𒐀𒑮𒒀𒕃𓀀𓐮𔐀𔙆𖠀𖨸𖩀𖩞𖫐𖫭𖬀𖬯𖭀𖭃𖭣𖭷𖭽𖮏𖹀𖹿𖼀𖽄𖾓𖾟𖿠𖿡𗀀𘟱𘠀𘫲𛀀𛄞𛅰𛋻𛰀𛱪"
     u"𛱰𛱼𛲀𛲈𛲐𛲙𝐀𝑔𝑖𝒜𝒞𝒟𝒥𝒦𝒩𝒬𝒮𝒹𝒽𝓃𝓅𝔅𝔇𝔊𝔍𝔔𝔖𝔜𝔞𝔹𝔻𝔾𝕀𝕄𝕊𝕐𝕒𝚥𝚨𝛀𝛂𝛚𝛜𝛺𝛼𝜔𝜖𝜴𝜶𝝎𝝐𝝮𝝰𝞈𝞊𝞨𝞪𝟂𝟄𝟋"
-    u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠")
+    u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠"
+)
 
 
 unicode_continuation_ch_any = (
-    u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁＿𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄")
+    u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁＿𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄"
+)
 unicode_continuation_ch_range = (
     u"09ֽׁׂًؚ֑ׅ̀ͯ҃҇ׄؐ٩۪ۭۖۜ۟ۤۧۨ۰۹ܰ݊ަް߀߉࡙࡛࣓ࣣ߫߳ࠖ࠙ࠛࠣࠥࠧࠩ࠭࣡ःऺ़ाॏ॑ॗॢॣ०९ঁঃ"
     u"াৄেৈো্ৢৣ০৯ਁਃਾੂੇੈੋ੍੦ੱઁઃાૅેૉો્ૢૣ૦૯ૺ૿ଁଃାୄେୈୋ୍ୖୗୢୣ୦୯ாூெைொ்௦௯ఀఄాౄ"
@@ -192,7 +196,8 @@ unicode_continuation_ch_range = (
     u"︠︯︳︴﹍﹏０９ﾞﾟ𐍶𐍺𐒠𐒩𐨁𐨃𐨅𐨆𐨌𐨺𐫦𐨏𐨸𐫥𐴤𐴧𐴰𐴹𐽆𐽐𑀀𑀂𑀸𑁆𑁦𑁯𑁿𑂂𑂰𑂺𑃰𑃹𑄀𑄂𑄧𑄴𑄶𑄿𑅅𑅆𑆀𑆂𑆳𑇀𑇉𑇌𑇐𑇙𑈬𑈷"
     u"𑋟𑋪𑋰𑋹𑌀𑌃𑌻𑌼𑌾𑍄𑍇𑍈𑍋𑍍𑍢𑍣𑍦𑍬𑍰𑍴𑐵𑑆𑑐𑑙𑒰𑓃𑓐𑓙𑖯𑖵𑖸𑗀𑗜𑗝𑘰𑙀𑙐𑙙𑚫𑚷𑛀𑛉𑜝𑜫𑜰𑜹𑠬𑠺𑣠𑣩𑨁𑨊𑨳𑨹𑨻𑨾𑩑𑩛𑪊𑪙"
     u"𑰯𑰶𑰸𑰿𑱐𑱙𑲒𑲧𑲩𑲶𑴱𑴶𑴼𑴽𑴿𑵅𑵐𑵙𑶊𑶎𑶐𑶑𑶓𑶗𑶠𑶩𑻳𑻶𖩠𖩩𖫰𖫴𖬰𖬶𖭐𖭙𖽑𖽾𖾏𖾒𛲝𛲞𝅩𝅥𝅲𝅻𝆂𝆋𝅭𝆅𝆪𝆭𝉂𝉄𝟎𝟿𝨀𝨶𝨻𝩬"
-    u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙")
+    u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙"
+)
 
 
 # END GENERATED CODE
diff --git a/bin/cython-generate-lexicon.py b/bin/cython-generate-lexicon.py
index a66da53c1..e108ca9a0 100755
--- a/bin/cython-generate-lexicon.py
+++ b/bin/cython-generate-lexicon.py
@@ -11,21 +11,23 @@
 #    --overwrite    to update the existing Lexicon.py file
 #    --here         to create a copy of Lexicon.py in the current directory
 
+import functools
+import re
+import os
 import sys
 from io import StringIO
-import os
-import functools
 
 # Make sure we import the right Cython
 cythonpath, _ = os.path.split(os.path.realpath(__file__)) # bin directory
 cythonpath, _ = os.path.split(cythonpath)
-if os.path.exists(os.path.join(cythonpath,"Cython")):
+if os.path.exists(os.path.join(cythonpath, "Cython")):
     sys.path.insert(0, cythonpath)
     print("Found (and using) local cython directory")
 # else we aren't in a development directory
 
 from Cython.Compiler import Lexicon
 
+
 def main():
     arg = '--overwrite'
     if len(sys.argv) == 2:
@@ -37,35 +39,24 @@ def main():
 """)
         return
 
-    generated_code = StringIO()
-    print("# generated with:\n #", sys.implementation.name, sys.version, file=generated_code)
-    print(file=generated_code)
-    print(start_expression(), file=generated_code)
-    print(file=generated_code)
-    print(cont_expression(), file=generated_code)
-    print(file=generated_code)
-    generated_code = generated_code.getvalue()
-
-    output = StringIO()
-    mode = 0 # 1 when found generated section, 2 afterwards
+    generated_code = (
+        f"# generated with:\n"
+        f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
+        "\n"
+        f"{generate_character_sets()}\n"
+    )
+
     print("Reading file", Lexicon.__file__)
-    with open(Lexicon.__file__,'r') as f:
-        for line in f:
-            if mode != 1:
-                output.write(line)
-            else:
-                if line.strip() == "# END GENERATED CODE":
-                    mode = 2
-                    output.write(line)
-            if mode == 0:
-                if line.strip() == "# BEGIN GENERATED CODE":
-                    mode = 1
-                    output.write(generated_code)
-
-    if mode != 2:
+    with open(Lexicon.__file__, 'r') as f:
+        parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())
+
+    if len(parts) not in (4,5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]:
         print("Warning: generated code section not found - code not inserted")
         return
 
+    parts[2] = generated_code
+    output = "".join(parts)
+
     if arg == "--here":
         outfile = "Lexicon.py"
     else:
@@ -73,23 +64,26 @@ def main():
         outfile = Lexicon.__file__
 
     print("Writing to file", outfile)
-    with open(outfile,'w') as f:
-        f.write(output.getvalue())
+    with open(outfile, 'w') as f:
+        f.write(output)
 
 
 # The easiest way to generate an appropriate character set is just to use the str.isidentifier method
 # An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
-@functools.lru_cache(None)
+@functools.lru_cache()
 def get_start_characters_as_number():
     return [ i for i in range(sys.maxunicode) if str.isidentifier(chr(i)) ]
 
+
 def get_continue_characters_as_number():
     return [ i for i in range(sys.maxunicode) if str.isidentifier('a'+chr(i)) ]
 
+
 def get_continue_not_start_as_number():
     start = get_start_characters_as_number()
     cont = get_continue_characters_as_number()
-    return sorted(set(cont)-set(start))
+    return sorted(set(cont) - set(start))
+
 
 def to_ranges(char_num_list):
     # Convert the large lists of character digits to
@@ -106,47 +100,32 @@ def to_ranges(char_num_list):
             if first_good_val == char_num_list[n-1]:
                 single_chars.append(chr(char_num_list[n-1]))
             else:
-                ranges.append(chr(first_good_val)+chr(char_num_list[n-1]))
+                ranges.append(chr(first_good_val) + chr(char_num_list[n-1]))
             first_good_val = char_num_list[n]
-    return single_chars, ranges
-
-def make_split_strings(chars, splitby=60):
-    out = []
-    for i in range(0, len(chars), splitby):
-        out.append('u"{}"'.format("".join(chars[i:i+splitby])))
-    return "\n    ".join(out)
-
-def start_expression():
-    output = StringIO()
-    print("unicode_start_ch_any = (\n    ", end='', file=output)
 
-    single_chars, ranges = to_ranges(get_start_characters_as_number())
-    single_chars = "".join(single_chars)
-    ranges = "".join(ranges)
+    return ''.join(single_chars), ''.join(ranges)
 
-    print(make_split_strings(single_chars), end='', file=output)
-    print(")", file=output)
-    print("unicode_start_ch_range = (\n    ", end='', file=output)
-    print(make_split_strings(ranges), end='', file=output)
-    print(")", file=output)
 
-    return output.getvalue()
+def make_split_strings(chars, splitby=60, indent="    "):
+    lines = [f'u"{chars[i:i+splitby]}"' for i in range(0, len(chars), splitby)]
+    return indent + f"\n{indent}".join(lines)
 
-def cont_expression():
-    output = StringIO()
-    print("unicode_continuation_ch_any = (\n    ", end='', file=output)
 
-    single_chars, ranges = to_ranges(get_continue_not_start_as_number())
-    single_chars = "".join(single_chars)
-    ranges = "".join(ranges)
+def generate_character_sets():
+    declarations = []
+    for char_type, char_generator in [
+        ("unicode_start_ch", get_start_characters_as_number),
+        ("unicode_continuation_ch", get_continue_not_start_as_number),
+    ]:
+        for set_type, chars in zip(("any", "range"), to_ranges(char_generator())):
+            declarations.append(
+                f"{char_type}_{set_type} = (\n"
+                f"{make_split_strings(chars)}\n"
+                f")\n"
+            )
 
-    print(make_split_strings(single_chars), end='', file=output)
-    print(")", file=output)
-    print("unicode_continuation_ch_range = (\n    ", end='', file=output)
-    print(make_split_strings(ranges), end='', file=output)
-    print(")", file=output)
+    return "".join(declarations)
 
-    return output.getvalue()
 
 if __name__ == "__main__":
     main()
author	Stefan Behnel <stefan_ml@behnel.de>	2019-08-24 10:39:09 +0200
committer	Stefan Behnel <stefan_ml@behnel.de>	2019-08-24 11:32:01 +0200
commit	c9dfe708d3e7abad331c16bc866d25896d2735dd (patch)
tree	8c5b395fbf2ee5d8a7c6431834670e9e2787629d
parent	270bf960a25374d3eb19a6040c395fd4bf4d6a25 (diff)
download	cython-c9dfe708d3e7abad331c16bc866d25896d2735dd.tar.gz