summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Stefan Behnel <stefan_ml@behnel.de> 2019-08-24 10:39:09 +0200
committer: Stefan Behnel <stefan_ml@behnel.de> 2019-08-24 11:32:01 +0200
commit: c9dfe708d3e7abad331c16bc866d25896d2735dd (patch)
tree: 8c5b395fbf2ee5d8a7c6431834670e9e2787629d
parent: 270bf960a25374d3eb19a6040c395fd4bf4d6a25 (diff)
download: cython-c9dfe708d3e7abad331c16bc866d25896d2735dd.tar.gz
Clean up the Lexicon.py generation script and use f-strings to prevent accidentally running it with older Python versions.
-rw-r--r-- Cython/Compiler/Lexicon.py 15
-rwxr-xr-x bin/cython-generate-lexicon.py 109
2 files changed, 54 insertions, 70 deletions
diff --git a/Cython/Compiler/Lexicon.py b/Cython/Compiler/Lexicon.py
index b54dbee5e..30372b188 100644
--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -153,14 +153,16 @@ def make_lexicon():
#debug_file = scanner_dump_file
)
+
# BEGIN GENERATED CODE
# generated with:
- # cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC]
+# cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC]
unicode_start_ch_any = (
u"_ªµºˬˮͿΆΌՙەۿܐޱߺࠚࠤࠨऽॐলঽৎৼਫ਼ઽૐૹଽୱஃஜௐఽಀಽೞഽൎලาຄຊຍລວາຽໆༀဿၡႎჇჍቘዀៗៜᢪ"
u"ᪧὙὛὝιⁱⁿℂℇℕℤΩℨⅎⴧⴭⵯꣻꧏꩺꪱꫀꫂיִמּﹱﹳﹷﹹﹻﹽ𐠈𐠼𐨀𐼧𑅄𑅶𑇚𑇜𑊈𑌽𑍐𑓇𑙄𑣿𑨀𑨺𑩐𑪝𑱀𑵆𑶘𖽐𝒢𝒻𝕆𞸤𞸧𞸹𞸻"
- u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾")
+ u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾"
+)
unicode_start_ch_range = (
u"AZazÀÖØöøˁˆˑˠˤͰʹͶͷͻͽΈΊΎΡΣϵϷҁҊԯԱՖՠֈאתׯײؠيٮٯٱۓۥۦۮۯۺۼܒܯݍޥߊߪߴߵࠀࠕ"
u"ࡀࡘࡠࡪࢠࢴࢶࢽऄहक़ॡॱঀঅঌএঐওনপরশহড়ঢ়য়ৡৰৱਅਊਏਐਓਨਪਰਲਲ਼ਵਸ਼ਸਹਖ਼ੜੲੴઅઍએઑઓનપરલળવહ"
@@ -177,11 +179,13 @@ unicode_start_ch_range = (
u"𑈀𑈑𑈓𑈫𑊀𑊆𑊊𑊍𑊏𑊝𑊟𑊨𑊰𑋞𑌅𑌌𑌏𑌐𑌓𑌨𑌪𑌰𑌲𑌳𑌵𑌹𑍝𑍡𑐀𑐴𑑇𑑊𑒀𑒯𑓄𑓅𑖀𑖮𑗘𑗛𑘀𑘯𑚀𑚪𑜀𑜚𑠀𑠫𑢠𑣟𑨋𑨲𑩜𑪃𑪆𑪉𑫀𑫸𑰀𑰈"
u"𑰊𑰮𑱲𑲏𑴀𑴆𑴈𑴉𑴋𑴰𑵠𑵥𑵧𑵨𑵪𑶉𑻠𑻲𒀀𒎙𒐀𒑮𒒀𒕃𓀀𓐮𔐀𔙆𖠀𖨸𖩀𖩞𖫐𖫭𖬀𖬯𖭀𖭃𖭣𖭷𖭽𖮏𖹀𖹿𖼀𖽄𖾓𖾟𖿠𖿡𗀀𘟱𘠀𘫲𛀀𛄞𛅰𛋻𛰀𛱪"
u"𛱰𛱼𛲀𛲈𛲐𛲙𝐀𝑔𝑖𝒜𝒞𝒟𝒥𝒦𝒩𝒬𝒮𝒹𝒽𝓃𝓅𝔅𝔇𝔊𝔍𝔔𝔖𝔜𝔞𝔹𝔻𝔾𝕀𝕄𝕊𝕐𝕒𝚥𝚨𝛀𝛂𝛚𝛜𝛺𝛼𝜔𝜖𝜴𝜶𝝎𝝐𝝮𝝰𝞈𝞊𝞨𝞪𝟂𝟄𝟋"
- u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠")
+ u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠"
+)
unicode_continuation_ch_any = (
- u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁_𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄")
+ u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁_𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄"
+)
unicode_continuation_ch_range = (
u"09ֽׁׂًؚ֑ׅ̀ͯ҃҇ׄؐ٩۪ۭۖۜ۟ۤۧۨ۰۹ܰ݊ަް߀߉࡙࡛࣓ࣣ߫߳ࠖ࠙ࠛࠣࠥࠧࠩ࠭࣡ःऺ़ाॏ॑ॗॢॣ०९ঁঃ"
u"াৄেৈো্ৢৣ০৯ਁਃਾੂੇੈੋ੍੦ੱઁઃાૅેૉો્ૢૣ૦૯ૺ૿ଁଃାୄେୈୋ୍ୖୗୢୣ୦୯ாூெைொ்௦௯ఀఄాౄ"
@@ -192,7 +196,8 @@ unicode_continuation_ch_range = (
u"︠︯︳︴﹍﹏09゙゚𐍶𐍺𐒠𐒩𐨁𐨃𐨅𐨆𐨌𐨺𐫦𐨏𐨸𐫥𐴤𐴧𐴰𐴹𐽆𐽐𑀀𑀂𑀸𑁆𑁦𑁯𑁿𑂂𑂰𑂺𑃰𑃹𑄀𑄂𑄧𑄴𑄶𑄿𑅅𑅆𑆀𑆂𑆳𑇀𑇉𑇌𑇐𑇙𑈬𑈷"
u"𑋟𑋪𑋰𑋹𑌀𑌃𑌻𑌼𑌾𑍄𑍇𑍈𑍋𑍍𑍢𑍣𑍦𑍬𑍰𑍴𑐵𑑆𑑐𑑙𑒰𑓃𑓐𑓙𑖯𑖵𑖸𑗀𑗜𑗝𑘰𑙀𑙐𑙙𑚫𑚷𑛀𑛉𑜝𑜫𑜰𑜹𑠬𑠺𑣠𑣩𑨁𑨊𑨳𑨹𑨻𑨾𑩑𑩛𑪊𑪙"
u"𑰯𑰶𑰸𑰿𑱐𑱙𑲒𑲧𑲩𑲶𑴱𑴶𑴼𑴽𑴿𑵅𑵐𑵙𑶊𑶎𑶐𑶑𑶓𑶗𑶠𑶩𑻳𑻶𖩠𖩩𖫰𖫴𖬰𖬶𖭐𖭙𖽑𖽾𖾏𖾒𛲝𛲞𝅩𝅥𝅲𝅻𝆂𝆋𝅭𝆅𝆪𝆭𝉂𝉄𝟎𝟿𝨀𝨶𝨻𝩬"
- u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙")
+ u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙"
+)
# END GENERATED CODE
diff --git a/bin/cython-generate-lexicon.py b/bin/cython-generate-lexicon.py
index a66da53c1..e108ca9a0 100755
--- a/bin/cython-generate-lexicon.py
+++ b/bin/cython-generate-lexicon.py
@@ -11,21 +11,23 @@
# --overwrite to update the existing Lexicon.py file
# --here to create a copy of Lexicon.py in the current directory
+import functools
+import re
+import os
import sys
from io import StringIO
-import os
-import functools
# Make sure we import the right Cython
cythonpath, _ = os.path.split(os.path.realpath(__file__)) # bin directory
cythonpath, _ = os.path.split(cythonpath)
-if os.path.exists(os.path.join(cythonpath,"Cython")):
+if os.path.exists(os.path.join(cythonpath, "Cython")):
sys.path.insert(0, cythonpath)
print("Found (and using) local cython directory")
# else we aren't in a development directory
from Cython.Compiler import Lexicon
+
def main():
arg = '--overwrite'
if len(sys.argv) == 2:
@@ -37,35 +39,24 @@ def main():
""")
return
- generated_code = StringIO()
- print("# generated with:\n #", sys.implementation.name, sys.version, file=generated_code)
- print(file=generated_code)
- print(start_expression(), file=generated_code)
- print(file=generated_code)
- print(cont_expression(), file=generated_code)
- print(file=generated_code)
- generated_code = generated_code.getvalue()
-
- output = StringIO()
- mode = 0 # 1 when found generated section, 2 afterwards
+ generated_code = (
+ f"# generated with:\n"
+ f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
+ "\n"
+ f"{generate_character_sets()}\n"
+ )
+
print("Reading file", Lexicon.__file__)
- with open(Lexicon.__file__,'r') as f:
- for line in f:
- if mode != 1:
- output.write(line)
- else:
- if line.strip() == "# END GENERATED CODE":
- mode = 2
- output.write(line)
- if mode == 0:
- if line.strip() == "# BEGIN GENERATED CODE":
- mode = 1
- output.write(generated_code)
-
- if mode != 2:
+ with open(Lexicon.__file__, 'r') as f:
+ parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())
+
+ if len(parts) not in (4,5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]:
print("Warning: generated code section not found - code not inserted")
return
+ parts[2] = generated_code
+ output = "".join(parts)
+
if arg == "--here":
outfile = "Lexicon.py"
else:
@@ -73,23 +64,26 @@ def main():
outfile = Lexicon.__file__
print("Writing to file", outfile)
- with open(outfile,'w') as f:
- f.write(output.getvalue())
+ with open(outfile, 'w') as f:
+ f.write(output)
# The easiest way to generate an appropriate character set is just to use the str.isidentifier method
# An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
-@functools.lru_cache(None)
+@functools.lru_cache()
def get_start_characters_as_number():
return [ i for i in range(sys.maxunicode) if str.isidentifier(chr(i)) ]
+
def get_continue_characters_as_number():
return [ i for i in range(sys.maxunicode) if str.isidentifier('a'+chr(i)) ]
+
def get_continue_not_start_as_number():
start = get_start_characters_as_number()
cont = get_continue_characters_as_number()
- return sorted(set(cont)-set(start))
+ return sorted(set(cont) - set(start))
+
def to_ranges(char_num_list):
# Convert the large lists of character digits to
@@ -106,47 +100,32 @@ def to_ranges(char_num_list):
if first_good_val == char_num_list[n-1]:
single_chars.append(chr(char_num_list[n-1]))
else:
- ranges.append(chr(first_good_val)+chr(char_num_list[n-1]))
+ ranges.append(chr(first_good_val) + chr(char_num_list[n-1]))
first_good_val = char_num_list[n]
- return single_chars, ranges
-
-def make_split_strings(chars, splitby=60):
- out = []
- for i in range(0, len(chars), splitby):
- out.append('u"{}"'.format("".join(chars[i:i+splitby])))
- return "\n ".join(out)
-
-def start_expression():
- output = StringIO()
- print("unicode_start_ch_any = (\n ", end='', file=output)
- single_chars, ranges = to_ranges(get_start_characters_as_number())
- single_chars = "".join(single_chars)
- ranges = "".join(ranges)
+ return ''.join(single_chars), ''.join(ranges)
- print(make_split_strings(single_chars), end='', file=output)
- print(")", file=output)
- print("unicode_start_ch_range = (\n ", end='', file=output)
- print(make_split_strings(ranges), end='', file=output)
- print(")", file=output)
- return output.getvalue()
+def make_split_strings(chars, splitby=60, indent=" "):
+ lines = [f'u"{chars[i:i+splitby]}"' for i in range(0, len(chars), splitby)]
+ return indent + f"\n{indent}".join(lines)
-def cont_expression():
- output = StringIO()
- print("unicode_continuation_ch_any = (\n ", end='', file=output)
- single_chars, ranges = to_ranges(get_continue_not_start_as_number())
- single_chars = "".join(single_chars)
- ranges = "".join(ranges)
+def generate_character_sets():
+ declarations = []
+ for char_type, char_generator in [
+ ("unicode_start_ch", get_start_characters_as_number),
+ ("unicode_continuation_ch", get_continue_not_start_as_number),
+ ]:
+ for set_type, chars in zip(("any", "range"), to_ranges(char_generator())):
+ declarations.append(
+ f"{char_type}_{set_type} = (\n"
+ f"{make_split_strings(chars)}\n"
+ f")\n"
+ )
- print(make_split_strings(single_chars), end='', file=output)
- print(")", file=output)
- print("unicode_continuation_ch_range = (\n ", end='', file=output)
- print(make_split_strings(ranges), end='', file=output)
- print(")", file=output)
+ return "".join(declarations)
- return output.getvalue()
if __name__ == "__main__":
main()