scripts/generate_identifier_pattern.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77

#!/usr/bin/env python3
import itertools
import os
import re
import sys

# Guard against interpreters older than Python 3.
if sys.version_info.major < 3:
    raise RuntimeError('This needs to run on Python 3.')


def get_characters():
    r"""Find every Unicode character that is valid in a Python `identifier`_ but
    is not matched by the regex ``\w`` group.

    ``\w`` matches some characters that aren't valid in identifiers, but
    :meth:`str.isidentifier` will catch that later in lexing.

    All start characters are valid continue characters, so we only test for
    continue characters.

    :return: A generator yielding one-character strings in increasing code
        point order, covering ``0`` through :data:`sys.maxunicode`.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Compile once and bind the matcher locally: the loop below runs for
    # every code point, so a per-iteration ``re.match`` lookup is wasteful.
    is_word_char = re.compile(r'\w').match

    for cp in range(sys.maxunicode + 1):
        s = chr(cp)

        # Prefix with 'a' so that ``s`` is checked as a continuation
        # character rather than as the first character of an identifier.
        if ('a' + s).isidentifier() and not is_word_char(s):
            yield s


def collapse_ranges(data):
    """Turn a sorted sequence of unique characters into runs of consecutive
    code points.

    Each character is paired with its position; within a run of sequential
    code points the difference between code point and position is constant,
    so grouping on that difference isolates each run.

    Source: https://stackoverflow.com/a/4629241/400617

    :param data: Iterable of single characters, sorted and without
        duplicates.
    :return: A generator of ``(first, last)`` character tuples, one per run.
    """
    def offset(pair):
        # Constant for as long as the code points keep increasing by one.
        index, char = pair
        return ord(char) - index

    for _, run in itertools.groupby(enumerate(data), key=offset):
        members = [char for _, char in run]
        yield members[0], members[-1]


def build_pattern(ranges):
    """Render character ranges as a single regex pattern string.

    A range covering one character is written as that character, a range
    covering two adjacent characters is written as both characters, and
    anything longer is written as ``first-last``.

    :param ranges: Iterable of ``(first, last)`` character pairs.
    :return: The concatenated pattern text for all ranges, in order.
    """
    def render(start, end):
        if start == end:
            # A lone character needs no range syntax.
            return start

        if ord(end) - ord(start) == 1:
            # Two neighbours: listing both is as short as a range.
            return start + end

        return f'{start}-{end}'

    return ''.join(render(start, end) for start, end in ranges)


def main():
    """Generate the identifier pattern and store it as a Python module.

    The pattern is produced by feeding :func:`get_characters` through
    :func:`collapse_ranges` and :func:`build_pattern`. It is then written to
    :file:`jinja2/_identifier.py`, located one directory above the directory
    containing this script.
    """
    pattern = build_pattern(collapse_ranges(get_characters()))

    script_dir = os.path.dirname(__file__)
    filename = os.path.abspath(
        os.path.join(script_dir, '..', 'jinja2', '_identifier.py')
    )

    with open(filename, 'w', encoding='utf8') as f:
        f.writelines((
            '# generated by scripts/generate_identifier_pattern.py\n',
            f'pattern = \'{pattern}\'\n',
        ))


# Regenerate the pattern file only when this module is executed as a
# script; importing it has no side effects beyond the version check.
if __name__ == '__main__':
    main()