regen/mk_invlists.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337

#!perl -w
use 5.015;
use strict;
use warnings;
use Unicode::UCD qw(prop_invlist prop_invmap);
require 'regen/regen_lib.pl';
require 'regen/charset_translations.pl';

# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# Thus, the lists it contains are essentially pre-compiled, and need only a
# light-weight fast wrapper to make them usable at run-time.

# As such, this code knows about the internal structure of these lists, and
# any change made to that has to be done here as well.  A random number stored
# in the headers is used to minimize the possibility of things getting
# out-of-sync, or the wrong data structure being passed.  Currently that
# random number is:
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;

my $out_fh = open_new('charclass_invlists.h', '>',
		      {style => '*', by => $0,
                      from => "Unicode::UCD"});

my $is_in_ifndef_ext_re = 0;

print $out_fh "/* See the generating file for comments */\n\n";

my %include_in_ext_re = ( NonL1_Perl_Non_Final_Folds => 1 );

sub end_ifndef_ext_re {
    if ($is_in_ifndef_ext_re) {
        print $out_fh "\n#endif\t/* #ifndef PERL_IN_XSUB_RE */\n";
        $is_in_ifndef_ext_re = 0;
    }
}

sub output_invlist ($$;$) {
    my $name = shift;
    my $invlist = shift;     # Reference to inversion list array
    my $charset = shift // "";  # name of character set for comment

    die "No inversion list for $name" unless defined $invlist
                                             && ref $invlist eq 'ARRAY'
                                             && @$invlist;

    # Output the inversion list $invlist using the name $name for it.
    # It is output in the exact internal form for inversion lists.

    # Is the last element of the header 0, or 1 ?
    my $zero_or_one = 0;
    if ($invlist->[0] != 0) {
        unshift @$invlist, 0;
        $zero_or_one = 1;
    }
    my $count = @$invlist;

    if ($is_in_ifndef_ext_re) {
        if (exists $include_in_ext_re{$name}) {
            end_ifndef_ext_re;
        }
    }
    elsif (! exists $include_in_ext_re{$name}) {
        print $out_fh "\n#ifndef PERL_IN_XSUB_RE\n" unless exists $include_in_ext_re{$name};
        $is_in_ifndef_ext_re = 1;
    }

    print $out_fh "\nstatic const UV ${name}_invlist[] = {";
    print $out_fh " /* for $charset */" if $charset;
    print $out_fh "\n";

    print $out_fh "\t$count,\t/* Number of elements */\n";
    print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
    print $out_fh "\t", $zero_or_one,
                  ",\t/* 0 if the list starts at 0;",
                  "\n\t\t   1 if it starts at the element beyond 0 */\n";

    # The main body are the UVs passed in to this routine.  Do the final
    # element separately
    for my $i (0 .. @$invlist - 1) {
        printf $out_fh "\t0x%X", $invlist->[$i];
        print $out_fh "," if $i < @$invlist - 1;
        print $out_fh "\n";
    }

    print $out_fh "};\n";
}

sub mk_invlist_from_cp_list {

    # Returns an inversion list constructed from the sorted input array of
    # code points

    my $list_ref = shift;

    # Initialize to just the first element
    my @invlist = ( $list_ref->[0], $list_ref->[0] + 1);

    # For each succeeding element, if it extends the previous range, adjust
    # up, otherwise add it.
    for my $i (1 .. @$list_ref - 1) {
        if ($invlist[-1] == $list_ref->[$i]) {
            $invlist[-1]++;
        }
        else {
            push @invlist, $list_ref->[$i], $list_ref->[$i] + 1;
        }
    }
    return @invlist;
}

# Read in the Case Folding rules, and construct arrays of code points for the
# properties we need.
my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
die "Could not find inversion map for Case_Folding" unless defined $format;
die "Incorrect format '$format' for Case_Folding inversion map"
                                                    unless $format eq 'al';
my @has_multi_char_fold;
my @is_non_final_fold;

for my $i (0 .. @$folds_ref - 1) {
    next unless ref $folds_ref->[$i];   # Skip single-char folds
    push @has_multi_char_fold, $cp_ref->[$i];

    # Add to the non-finals list each code point that is in a non-final
    # position
    for my $j (0 .. @{$folds_ref->[$i]} - 2) {
        push @is_non_final_fold, $folds_ref->[$i][$j]
                unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
    }
}

sub _Perl_Non_Final_Folds {
    @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
    return mk_invlist_from_cp_list(\@is_non_final_fold);
}

sub UpperLatin1 {
    return mk_invlist_from_cp_list([ 128 .. 255 ]);
}

output_invlist("Latin1", [ 0, 256 ]);
output_invlist("AboveLatin1", [ 256 ]);

end_ifndef_ext_re;

# We construct lists for all the POSIX and backslash sequence character
# classes in two forms:
#   1) ones which match only in the ASCII range
#   2) ones which match either in the Latin1 range, or the entire Unicode range
#
# These get compiled in, and hence affect the memory footprint of every Perl
# program, even those not using Unicode.  To minimize the size, currently
# the Latin1 version is generated for the beyond ASCII range except for those
# lists that are quite small for the entire range, such as for \s, which is 22
# UVs long plus 4 UVs (currently) for the header.
#
# To save even more memory, the ASCII versions could be derived from the
# larger ones at runtime, saving some memory (minus the expense of the machine
# instructions to do so), but these are all small anyway, so their total is
# about 100 UVs.
#
# In the list of properties below that get generated, the L1 prefix is a fake
# property that means just the Latin1 range of the full property (whose name
# has an X prefix instead of L1).
#
# An initial & means to use the subroutine from this file instead of an
# official inversion list.

for my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    my @a2n = @{get_a2n($charset)};
                             # Ignore non-alpha in sort
    for my $prop (sort {     lc ($a =~ s/[[:^alpha:]]//gr)
                         cmp lc ($b =~ s/[[:^alpha:]]//gr)
                       } qw(
                             ASCII
                             Cased
                             VertSpace
                             XPerlSpace
                             XPosixAlnum
                             XPosixAlpha
                             XPosixBlank
                             XPosixCntrl
                             XPosixDigit
                             XPosixGraph
                             XPosixLower
                             XPosixPrint
                             XPosixPunct
                             XPosixSpace
                             XPosixUpper
                             XPosixWord
                             XPosixXDigit
                             _Perl_Any_Folds
                             &NonL1_Perl_Non_Final_Folds
                             _Perl_Folds_To_Multi_Char
                             &UpperLatin1
                             _Perl_IDStart
                             _Perl_IDCont
                           )
    ) {

        # For the Latin1 properties, we change to use the eXtended version of the
        # base property, then go through the result and get rid of everything not
        # in Latin1 (above 255).  Actually, we retain the element for the range
        # that crosses the 255/256 boundary if it is one that matches the
        # property.  For example, in the Word property, there is a range of code
        # points that start at U+00F8 and goes through U+02C1.  Instead of
        # artificially cutting that off at 256 because 256 is the first code point
        # above Latin1, we let the range go to its natural ending.  That gives us
        # extra information with no added space taken.  But if the range that
        # crosses the boundary is one that doesn't match the property, we don't
        # start a new range above 255, as that could be construed as going to
        # infinity.  For example, the Upper property doesn't include the character
        # at 255, but does include the one at 256.  We don't include the 256 one.
        my $prop_name = $prop;
        my $is_local_sub = $prop_name =~ s/^&//;
        my $lookup_prop = $prop_name;
        my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
                       or $lookup_prop =~ s/^L1//);
        my $nonl1_only = 0;
        $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;

        my @invlist;
        if ($is_local_sub) {
            @invlist = eval $lookup_prop;
        }
        else {
            @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok');
        }
        die "Could not find inversion list for '$lookup_prop'" unless @invlist;

        # Re-order the Unicode code points to native ones for this platform;
        # only needed for code points below 256, and only if the first range
        # doesn't span the whole of 0..256 (256 not 255 because a re-ordering
        # could cause 256 to need to be in the same range as 255.)
        if (! $nonl1_only || ($invlist[0] < 256
                              && ! ($invlist[0] == 0 && $invlist[1] > 256)))
        {

            # Look at all the ranges that start before 257.
            my @latin1_list;
            while (@invlist) {
                last if $invlist[0] > 256;
                my $upper = @invlist > 1
                            ? $invlist[1] - 1      # In range

                              # To infinity.  You may want to stop much much
                              # earlier; going this high may expose perl
                              # deficiencies with very large numbers.
                            : $Unicode::UCD::MAX_CP;
                for my $j ($invlist[0] .. $upper) {
                    if ($j < 256) {
                        push @latin1_list, $a2n[$j];
                    }
                    else {
                        push @latin1_list, $j;
                    }
                }

                shift @invlist; # Shift off the range that's in the list
                shift @invlist; # Shift off the range not in the list
            }

            # Here @invlist contains all the ranges in the original that start
            # at code points above 256, and @latin1_list contains all the
            # native code points for ranges that start with a Unicode code
            # point below 257.  We sort the latter and convert it to inversion
            # list format.  Then simply prepend it to the list of the higher
            # code points.
            @latin1_list = sort { $a <=> $b } @latin1_list;
            @latin1_list = mk_invlist_from_cp_list(\@latin1_list);
            unshift @invlist, @latin1_list;
        }

        if ($l1_only) {
            for my $i (0 .. @invlist - 1 - 1) {
                if ($invlist[$i] > 255) {

                    # In an inversion list, even-numbered elements give the code
                    # points that begin ranges that match the property;
                    # odd-numbered give ones that begin ranges that don't match.
                    # If $i is odd, we are at the first code point above 255 that
                    # doesn't match, which means the range it is ending does
                    # match, and crosses the 255/256 boundary.  We want to include
                    # this ending point, so increment $i, so the splice below
                    # includes it.  Conversely, if $i is even, it is the first
                    # code point above 255 that matches, which means there was no
                    # matching range that crossed the boundary, and we don't want
                    # to include this code point, so splice before it.
                    $i++ if $i % 2 != 0;

                    # Remove everything past this.
                    splice @invlist, $i;
                    last;
                }
            }
        }
        elsif ($nonl1_only) {
            my $found_nonl1 = 0;
            for my $i (0 .. @invlist - 1 - 1) {
                next if $invlist[$i] < 256;

                # Here, we have the first element in the array that indicates an
                # element above Latin1.  Get rid of all previous ones.
                splice @invlist, 0, $i;

                # If this one's index is not divisible by 2, it means that this
                # element is inverting away from being in the list, which means
                # all code points from 256 to this one are in this list.
                unshift @invlist, 256 if $i % 2 != 0;
                $found_nonl1 = 1;
                last;
            }
            die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
        }

        output_invlist($prop_name, \@invlist, $charset);
    }
    end_ifndef_ext_re;
    print $out_fh "\n" . get_conditional_compile_line_end();
}

my @sources = ($0, "lib/Unicode/UCD.pm");
{
    # Depend on mktables’ own sources.  It’s a shorter list of files than
    # those that Unicode::UCD uses.
    open my $mktables_list, "lib/unicore/mktables.lst"
        or die "$0 cannot open lib/unicore/mktables.lst: $!";
    while(<$mktables_list>) {
        last if /===/;
        chomp;
        push @sources, "lib/unicore/$_" if /^[^#]/;
    }
}
read_only_bottom_close_and_rename($out_fh, \@sources)