diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2019-08-24 10:39:09 +0200 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2019-08-24 11:32:01 +0200 |
commit | c9dfe708d3e7abad331c16bc866d25896d2735dd (patch) | |
tree | 8c5b395fbf2ee5d8a7c6431834670e9e2787629d | |
parent | 270bf960a25374d3eb19a6040c395fd4bf4d6a25 (diff) | |
download | cython-c9dfe708d3e7abad331c16bc866d25896d2735dd.tar.gz |
Clean up the Lexicon.py generation script and use f-strings to prevent accidentally running it with older Python versions.
-rw-r--r-- | Cython/Compiler/Lexicon.py | 15 |
-rwxr-xr-x | bin/cython-generate-lexicon.py | 109 |
2 files changed, 54 insertions, 70 deletions
diff --git a/Cython/Compiler/Lexicon.py b/Cython/Compiler/Lexicon.py index b54dbee5e..30372b188 100644 --- a/Cython/Compiler/Lexicon.py +++ b/Cython/Compiler/Lexicon.py @@ -153,14 +153,16 @@ def make_lexicon(): #debug_file = scanner_dump_file ) + # BEGIN GENERATED CODE # generated with: - # cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC] +# cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC] unicode_start_ch_any = ( u"_ªµºˬˮͿΆΌՙەۿܐޱߺࠚࠤࠨऽॐলঽৎৼਫ਼ઽૐૹଽୱஃஜௐఽಀಽೞഽൎලาຄຊຍລວາຽໆༀဿၡႎჇჍቘዀៗៜᢪ" u"ᪧὙὛὝιⁱⁿℂℇℕℤΩℨⅎⴧⴭⵯꣻꧏꩺꪱꫀꫂיִמּﹱﹳﹷﹹﹻﹽ𐠈𐠼𐨀𐼧𑅄𑅶𑇚𑇜𑊈𑌽𑍐𑓇𑙄𑣿𑨀𑨺𑩐𑪝𑱀𑵆𑶘𖽐𝒢𝒻𝕆𞸤𞸧𞸹𞸻" - u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾") + u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾" +) unicode_start_ch_range = ( u"AZazÀÖØöøˁˆˑˠˤͰʹͶͷͻͽΈΊΎΡΣϵϷҁҊԯԱՖՠֈאתׯײؠيٮٯٱۓۥۦۮۯۺۼܒܯݍޥߊߪߴߵࠀࠕ" u"ࡀࡘࡠࡪࢠࢴࢶࢽऄहक़ॡॱঀঅঌএঐওনপরশহড়ঢ়য়ৡৰৱਅਊਏਐਓਨਪਰਲਲ਼ਵਸ਼ਸਹਖ਼ੜੲੴઅઍએઑઓનપરલળવહ" @@ -177,11 +179,13 @@ unicode_start_ch_range = ( u"𑈀𑈑𑈓𑈫𑊀𑊆𑊊𑊍𑊏𑊝𑊟𑊨𑊰𑋞𑌅𑌌𑌏𑌐𑌓𑌨𑌪𑌰𑌲𑌳𑌵𑌹𑍝𑍡𑐀𑐴𑑇𑑊𑒀𑒯𑓄𑓅𑖀𑖮𑗘𑗛𑘀𑘯𑚀𑚪𑜀𑜚𑠀𑠫𑢠𑣟𑨋𑨲𑩜𑪃𑪆𑪉𑫀𑫸𑰀𑰈" u"𑰊𑰮𑱲𑲏𑴀𑴆𑴈𑴉𑴋𑴰𑵠𑵥𑵧𑵨𑵪𑶉𑻠𑻲𒀀𒎙𒐀𒑮𒒀𒕃𓀀𓐮𔐀𔙆𖠀𖨸𖩀𖩞𖫐𖫭𖬀𖬯𖭀𖭃𖭣𖭷𖭽𖮏𖹀𖹿𖼀𖽄𖾓𖾟𖿠𖿡𗀀𘟱𘠀𘫲𛀀𛄞𛅰𛋻𛰀𛱪" u"𛱰𛱼𛲀𛲈𛲐𛲙𝐀𝑔𝑖𝒜𝒞𝒟𝒥𝒦𝒩𝒬𝒮𝒹𝒽𝓃𝓅𝔅𝔇𝔊𝔍𝔔𝔖𝔜𝔞𝔹𝔻𝔾𝕀𝕄𝕊𝕐𝕒𝚥𝚨𝛀𝛂𝛚𝛜𝛺𝛼𝜔𝜖𝜴𝜶𝝎𝝐𝝮𝝰𝞈𝞊𝞨𝞪𝟂𝟄𝟋" - u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠") + u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠" +) unicode_continuation_ch_any = ( - u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁_𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄") + u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁_𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄" +) unicode_continuation_ch_range = ( u"09ֽׁׂًؚ֑ׅ̀ͯ҃҇ׄؐ٩۪ۭۖۜ۟ۤۧۨ۰۹ܰ݊ަް߀߉࡙࡛࣓ࣣ߫߳ࠖ࠙ࠛࠣࠥࠧࠩ࠭࣡ःऺ़ाॏ॑ॗॢॣ०९ঁঃ" u"াৄেৈো্ৢৣ০৯ਁਃਾੂੇੈੋ੍੦ੱઁઃાૅેૉો્ૢૣ૦૯ૺ૿ଁଃାୄେୈୋ୍ୖୗୢୣ୦୯ாூெைொ்௦௯ఀఄాౄ" @@ -192,7 +196,8 @@ unicode_continuation_ch_range = ( u"︠︯︳︴﹍﹏09゙゚𐍶𐍺𐒠𐒩𐨁𐨃𐨅𐨆𐨌𐨺𐫦𐨏𐨸𐫥𐴤𐴧𐴰𐴹𐽆𐽐𑀀𑀂𑀸𑁆𑁦𑁯𑁿𑂂𑂰𑂺𑃰𑃹𑄀𑄂𑄧𑄴𑄶𑄿𑅅𑅆𑆀𑆂𑆳𑇀𑇉𑇌𑇐𑇙𑈬𑈷" u"𑋟𑋪𑋰𑋹𑌀𑌃𑌻𑌼𑌾𑍄𑍇𑍈𑍋𑍍𑍢𑍣𑍦𑍬𑍰𑍴𑐵𑑆𑑐𑑙𑒰𑓃𑓐𑓙𑖯𑖵𑖸𑗀𑗜𑗝𑘰𑙀𑙐𑙙𑚫𑚷𑛀𑛉𑜝𑜫𑜰𑜹𑠬𑠺𑣠𑣩𑨁𑨊𑨳𑨹𑨻𑨾𑩑𑩛𑪊𑪙" u"𑰯𑰶𑰸𑰿𑱐𑱙𑲒𑲧𑲩𑲶𑴱𑴶𑴼𑴽𑴿𑵅𑵐𑵙𑶊𑶎𑶐𑶑𑶓𑶗𑶠𑶩𑻳𑻶𖩠𖩩𖫰𖫴𖬰𖬶𖭐𖭙𖽑𖽾𖾏𖾒𛲝𛲞𝅩𝅥𝅲𝅻𝆂𝆋𝅭𝆅𝆪𝆭𝉂𝉄𝟎𝟿𝨀𝨶𝨻𝩬" - u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙") + u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙" +) # END GENERATED CODE diff --git a/bin/cython-generate-lexicon.py b/bin/cython-generate-lexicon.py index a66da53c1..e108ca9a0 100755 --- a/bin/cython-generate-lexicon.py +++ b/bin/cython-generate-lexicon.py @@ -11,21 +11,23 @@ # --overwrite to update the existing 
Lexicon.py file # --here to create a copy of Lexicon.py in the current directory +import functools +import re +import os import sys from io import StringIO -import os -import functools # Make sure we import the right Cython cythonpath, _ = os.path.split(os.path.realpath(__file__)) # bin directory cythonpath, _ = os.path.split(cythonpath) -if os.path.exists(os.path.join(cythonpath,"Cython")): +if os.path.exists(os.path.join(cythonpath, "Cython")): sys.path.insert(0, cythonpath) print("Found (and using) local cython directory") # else we aren't in a development directory from Cython.Compiler import Lexicon + def main(): arg = '--overwrite' if len(sys.argv) == 2: @@ -37,35 +39,24 @@ def main(): """) return - generated_code = StringIO() - print("# generated with:\n #", sys.implementation.name, sys.version, file=generated_code) - print(file=generated_code) - print(start_expression(), file=generated_code) - print(file=generated_code) - print(cont_expression(), file=generated_code) - print(file=generated_code) - generated_code = generated_code.getvalue() - - output = StringIO() - mode = 0 # 1 when found generated section, 2 afterwards + generated_code = ( + f"# generated with:\n" + f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n" + "\n" + f"{generate_character_sets()}\n" + ) + print("Reading file", Lexicon.__file__) - with open(Lexicon.__file__,'r') as f: - for line in f: - if mode != 1: - output.write(line) - else: - if line.strip() == "# END GENERATED CODE": - mode = 2 - output.write(line) - if mode == 0: - if line.strip() == "# BEGIN GENERATED CODE": - mode = 1 - output.write(generated_code) - - if mode != 2: + with open(Lexicon.__file__, 'r') as f: + parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read()) + + if len(parts) not in (4,5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]: print("Warning: generated code section not found - code not inserted") return + parts[2] = generated_code + output = 
"".join(parts) + if arg == "--here": outfile = "Lexicon.py" else: @@ -73,23 +64,26 @@ def main(): outfile = Lexicon.__file__ print("Writing to file", outfile) - with open(outfile,'w') as f: - f.write(output.getvalue()) + with open(outfile, 'w') as f: + f.write(output) # The easiest way to generate an appropriate character set is just to use the str.isidentifier method # An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412 -@functools.lru_cache(None) +@functools.lru_cache() def get_start_characters_as_number(): return [ i for i in range(sys.maxunicode) if str.isidentifier(chr(i)) ] + def get_continue_characters_as_number(): return [ i for i in range(sys.maxunicode) if str.isidentifier('a'+chr(i)) ] + def get_continue_not_start_as_number(): start = get_start_characters_as_number() cont = get_continue_characters_as_number() - return sorted(set(cont)-set(start)) + return sorted(set(cont) - set(start)) + def to_ranges(char_num_list): # Convert the large lists of character digits to @@ -106,47 +100,32 @@ def to_ranges(char_num_list): if first_good_val == char_num_list[n-1]: single_chars.append(chr(char_num_list[n-1])) else: - ranges.append(chr(first_good_val)+chr(char_num_list[n-1])) + ranges.append(chr(first_good_val) + chr(char_num_list[n-1])) first_good_val = char_num_list[n] - return single_chars, ranges - -def make_split_strings(chars, splitby=60): - out = [] - for i in range(0, len(chars), splitby): - out.append('u"{}"'.format("".join(chars[i:i+splitby]))) - return "\n ".join(out) - -def start_expression(): - output = StringIO() - print("unicode_start_ch_any = (\n ", end='', file=output) - single_chars, ranges = to_ranges(get_start_characters_as_number()) - single_chars = "".join(single_chars) - ranges = "".join(ranges) + return ''.join(single_chars), ''.join(ranges) - print(make_split_strings(single_chars), end='', file=output) - print(")", file=output) - print("unicode_start_ch_range = (\n ", end='', file=output) - 
print(make_split_strings(ranges), end='', file=output) - print(")", file=output) - return output.getvalue() +def make_split_strings(chars, splitby=60, indent=" "): + lines = [f'u"{chars[i:i+splitby]}"' for i in range(0, len(chars), splitby)] + return indent + f"\n{indent}".join(lines) -def cont_expression(): - output = StringIO() - print("unicode_continuation_ch_any = (\n ", end='', file=output) - single_chars, ranges = to_ranges(get_continue_not_start_as_number()) - single_chars = "".join(single_chars) - ranges = "".join(ranges) +def generate_character_sets(): + declarations = [] + for char_type, char_generator in [ + ("unicode_start_ch", get_start_characters_as_number), + ("unicode_continuation_ch", get_continue_not_start_as_number), + ]: + for set_type, chars in zip(("any", "range"), to_ranges(char_generator())): + declarations.append( + f"{char_type}_{set_type} = (\n" + f"{make_split_strings(chars)}\n" + f")\n" + ) - print(make_split_strings(single_chars), end='', file=output) - print(")", file=output) - print("unicode_continuation_ch_range = (\n ", end='', file=output) - print(make_split_strings(ranges), end='', file=output) - print(")", file=output) + return "".join(declarations) - return output.getvalue() if __name__ == "__main__": main() |