diff options
Diffstat (limited to 'glib/gen-unicode-tables.pl')
-rwxr-xr-x | glib/gen-unicode-tables.pl | 160 |
1 files changed, 155 insertions, 5 deletions
diff --git a/glib/gen-unicode-tables.pl b/glib/gen-unicode-tables.pl index d111d910c..6e83db6db 100755 --- a/glib/gen-unicode-tables.pl +++ b/glib/gen-unicode-tables.pl @@ -153,6 +153,23 @@ $FOLDING_MAPPING = 2; 'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE" ); +%grapheme_break_mappings = + ( + 'XX' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_OTHER", + 'CR' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_CR", + 'LF' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_LF", + 'Control' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_CONTROL", + 'SpacingMark' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_SPACING_MARK", + 'Extend' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_EXTEND", + 'Prepend' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_PREPEND", + 'Regional_Indicator' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_REGIONAL_INDICATOR", + 'L' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_L", + 'V' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_V", + 'T' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_T", + 'LV' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_LV", + 'LVT' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_LVT" + ); + # Title case mappings. %title_to_lower = (); %title_to_upper = (); @@ -179,10 +196,10 @@ elsif (@ARGV && $ARGV[0] eq '-both') if (@ARGV != 2) { $0 =~ s@.*/@@; - die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt\n\n"; + die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt, auxiliary/GraphemeBreakProperty.txt\n\n"; } -my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt); +my ($unicodedatatxt, $linebreaktxt, $graphemebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt); my $d = $ARGV[1]; opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n"; @@ -195,8 +212,16 @@ for my $f (readdir ($dir)) $compositionexclusionstxt = "$d/$f" if ($f =~ /^CompositionExclusions.*\.txt/); } +$d = "$ARGV[1]/auxiliary"; +opendir ($dir, $d) or die "Cannot open Unicode auxiliary data dir $d: $!\n"; +for my $f (readdir ($dir)) +{ + $graphemebreaktxt = "$d/$f" if ($f =~ /^GraphemeBreakProperty.*\.txt/); +} + defined $unicodedatatxt or die "Did not find UnicodeData file"; defined $linebreaktxt or die "Did not find LineBreak file"; +defined $graphemebreaktxt or die "Did not find GraphemeBreakProperty file"; defined $specialcasingtxt or die "Did not find SpecialCasing file"; defined $casefoldingtxt or die "Did not find CaseFolding file"; defined $compositionexclusionstxt or die "Did not find CompositionExclusions file"; @@ -362,6 +387,57 @@ for (++$last_code; $last_code <= 0x10FFFF; ++$last_code) print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF); +print "Creating grapheme break table\n"; + +print "Line grapheme break data from $graphemebreaktxt\n"; + +open (INPUT, "< $graphemebreaktxt") || exit 1; + +while (<INPUT>) +{ + my ($start_code, $end_code); + + chop; + + next if /^#/; + next if /^$/; + + s/\s*#.*//; + + @fields = split (';', $_, 30); + if ($#fields != 1) + { + printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields); + next; + } + + $fields[$CODE] =~ s/\s+//; + if ($fields[$CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/) + { + $start_code = hex ($1); + $end_code = hex ($2); + } else { + $start_code = $end_code = hex ($fields[$CODE]); + } + + $fields[$BREAK_PROPERTY] =~ s/\s+//; + + for ($last_code = $start_code; $last_code <= $end_code; $last_code++) + { + $grapheme_break_props[$last_code] = $fields[$BREAK_PROPERTY]; + } + + $last_code = $end_code; +} + +close INPUT; + +# All other characters have value Other (XX) +for ($last_code = 0x0; $last_code <= 0x10FFFF; ++$last_code) +{ + $grapheme_break_props[$last_code] = 'XX' unless defined($grapheme_break_props[$last_code]); +} + print "Reading special-casing table for case conversion\n"; open (INPUT, "< $specialcasingtxt") || exit 1; @@ -491,6 +567,8 @@ while (<INPUT>) close INPUT; +$last_code = 0x10FFFF; + if ($do_props) { &print_tables ($last_code) } @@ -501,6 +579,8 @@ if ($do_decomp) { &print_line_break ($last_code); +&print_grapheme_break ($last_code); + exit 0; @@ -971,6 +1051,72 @@ sub print_line_break printf STDERR "Generated %d bytes in break tables\n", $bytes_out; } +sub print_grapheme_break +{ + my ($last) = @_; + my ($outfile) = "gunigraphemebreak.h"; + + local ($bytes_out) = 0; + + print "Writing $outfile...\n"; + + open (OUT, "> $outfile"); + + print OUT "/* This file is automatically generated. DO NOT EDIT!\n"; + print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n"; + + print OUT "#ifndef G_UNIGRAPHEMEBREAKTABLES_H\n"; + print OUT "#define G_UNIGRAPHEMEBREAKTABLES_H\n\n"; + + print OUT "#include <glib/gtypes.h>\n"; + print OUT "#include <glib/gunicode.h>\n\n"; + + print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n"; + + printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last; + + printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n"; + + my $last_part1 = ($pages_before_e0000 * 256) - 1; + printf OUT "/* the last code point that should be looked up in grapheme_break_property_table_part1 */\n"; + printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1; + + $table_index = 0; + printf OUT "static const gint8 grapheme_break_property_data[][256] = {\n"; + for ($count = 0; $count <= $last; $count += 256) + { + $row[$count / 256] = &print_row ($count, 1, \&fetch_grapheme_break_type); + } + printf OUT "\n};\n\n"; + + printf OUT "/* U+0000 through U+%04X */\n", $last_part1; + print OUT "static const gint16 grapheme_break_property_table_part1[$pages_before_e0000] = {\n"; + for ($count = 0; $count <= $last_part1; $count += 256) + { + print OUT ",\n" if $count > 0; + print OUT " ", $row[$count / 256]; + $bytes_out += 2; + } + print OUT "\n};\n\n"; + + printf OUT "/* U+E0000 through U+%04X */\n", $last; + print OUT "static const gint16 grapheme_break_property_table_part2[768] = {\n"; + for ($count = 0xE0000; $count <= $last; $count += 256) + { + print OUT ",\n" if $count > 0xE0000; + print OUT " ", $row[$count / 256]; + $bytes_out += 2; + } + print OUT "\n};\n\n"; + + + print OUT "#endif /* G_UNIGRAPHEMEBREAKTABLES_H */\n"; + + close (OUT); + + printf STDERR "Generated %d bytes in grapheme break tables\n", $bytes_out; +} + # A fetch function for the break properties table. sub fetch_break_type @@ -979,6 +1125,13 @@ sub fetch_break_type return $break_mappings{$break_props[$index]}; } +# A fetch function for the grapheme break properties table. +sub fetch_grapheme_break_type +{ + my ($index) = @_; + return $grapheme_break_mappings{$grapheme_break_props[$index]}; +} + # Fetcher for combining class. sub fetch_cclass { @@ -1335,6 +1488,3 @@ EOT my $recordlen = (2+$casefoldlen+1) & ~1; printf "Generated %d bytes for casefold table\n", $recordlen * @casefold; } - - - |