summaryrefslogtreecommitdiff
path: root/glib/gen-unicode-tables.pl
diff options
context:
space:
mode:
Diffstat (limited to 'glib/gen-unicode-tables.pl')
-rwxr-xr-xglib/gen-unicode-tables.pl160
1 files changed, 155 insertions, 5 deletions
diff --git a/glib/gen-unicode-tables.pl b/glib/gen-unicode-tables.pl
index d111d910c..6e83db6db 100755
--- a/glib/gen-unicode-tables.pl
+++ b/glib/gen-unicode-tables.pl
@@ -153,6 +153,23 @@ $FOLDING_MAPPING = 2;
'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE"
);
+# Maps a grapheme cluster break property value, as stored per code point in
+# @grapheme_break_props, to the G_UNICODE_GRAPHEME_CLUSTER_BREAK_* identifier
+# that fetch_grapheme_break_type returns for it.  'XX' is the value this
+# script assigns to every code point the data file does not list.
+%grapheme_break_mappings =
+ (
+ 'XX' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_OTHER",
+ 'CR' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_CR",
+ 'LF' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_LF",
+ 'Control' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_CONTROL",
+ 'SpacingMark' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_SPACING_MARK",
+ 'Extend' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_EXTEND",
+ 'Prepend' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_PREPEND",
+ 'Regional_Indicator' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_REGIONAL_INDICATOR",
+ 'L' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_L",
+ 'V' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_V",
+ 'T' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_T",
+ 'LV' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_LV",
+ 'LVT' => "G_UNICODE_GRAPHEME_CLUSTER_BREAK_HANGUL_SYLLABLE_LVT"
+ );
+
# Title case mappings.
%title_to_lower = ();
%title_to_upper = ();
@@ -179,10 +196,10 @@ elsif (@ARGV && $ARGV[0] eq '-both')
if (@ARGV != 2) {
$0 =~ s@.*/@@;
- die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt\n\n";
+ die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt, auxiliary/GraphemeBreakProperty.txt\n\n";
}
-my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt);
+my ($unicodedatatxt, $linebreaktxt, $graphemebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt);
my $d = $ARGV[1];
opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n";
@@ -195,8 +212,16 @@ for my $f (readdir ($dir))
$compositionexclusionstxt = "$d/$f" if ($f =~ /^CompositionExclusions.*\.txt/);
}
+# GraphemeBreakProperty.txt is looked up in the "auxiliary" subdirectory of
+# the Unicode data directory given on the command line.  If several entries
+# match, the last one returned by readdir wins.
+$d = "$ARGV[1]/auxiliary";
+opendir ($dir, $d) or die "Cannot open Unicode auxiliary data dir $d: $!\n";
+foreach my $entry (readdir ($dir))
+{
+  next unless $entry =~ /^GraphemeBreakProperty.*\.txt/;
+  $graphemebreaktxt = "$d/$entry";
+}
+
defined $unicodedatatxt or die "Did not find UnicodeData file";
defined $linebreaktxt or die "Did not find LineBreak file";
+defined $graphemebreaktxt or die "Did not find GraphemeBreakProperty file";
defined $specialcasingtxt or die "Did not find SpecialCasing file";
defined $casefoldingtxt or die "Did not find CaseFolding file";
defined $compositionexclusionstxt or die "Did not find CompositionExclusions file";
@@ -362,6 +387,57 @@ for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF);
+# Read GraphemeBreakProperty.txt into @grapheme_break_props, one property
+# value per code point.  Each data line has the form
+#   CODE ; PROPERTY            or            START..END ; PROPERTY
+# optionally followed by a '#' comment.  $CODE and $BREAK_PROPERTY (defined
+# elsewhere in this script) are the indices of those two fields.
+print "Creating grapheme break table\n";
+
+print "Line grapheme break data from $graphemebreaktxt\n";
+
+# Three-argument open so the path can never be read as an open mode.  On
+# failure say why before exiting with the same status (1) as before.
+if (!open (INPUT, "<", $graphemebreaktxt))
+{
+  print STDERR "Cannot open $graphemebreaktxt: $!\n";
+  exit 1;
+}
+
+while (<INPUT>)
+{
+  my ($start_code, $end_code);
+
+  # chomp rather than chop: a final line without a trailing newline must
+  # not lose its last data character.  Also drop the CR of CRLF files.
+  chomp;
+  s/\r$//;
+
+  next if /^#/;
+  next if /^$/;
+
+  # Strip a trailing comment together with the whitespace before it.
+  s/\s*#.*//;
+
+  @fields = split (';', $_, 30);
+  if ($#fields != 1)
+  {
+    printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
+    next;
+  }
+
+  # Remove every run of whitespace, not just the first, so both leading
+  # and trailing padding around the field are gone.
+  $fields[$CODE] =~ s/\s+//g;
+  if ($fields[$CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/)
+  {
+    $start_code = hex ($1);
+    $end_code = hex ($2);
+  }
+  else
+  {
+    $start_code = $end_code = hex ($fields[$CODE]);
+  }
+
+  $fields[$BREAK_PROPERTY] =~ s/\s+//g;
+
+  # A lexical loop variable keeps the global $last_code untouched here; it
+  # is set unconditionally by the fill loop below.
+  for my $code ($start_code .. $end_code)
+  {
+    $grapheme_break_props[$code] = $fields[$BREAK_PROPERTY];
+  }
+}
+
+close INPUT;
+
+# All other characters have value Other (XX)
+for ($last_code = 0x0; $last_code <= 0x10FFFF; ++$last_code)
+{
+  $grapheme_break_props[$last_code] = 'XX' unless defined($grapheme_break_props[$last_code]);
+}
+
print "Reading special-casing table for case conversion\n";
open (INPUT, "< $specialcasingtxt") || exit 1;
@@ -491,6 +567,8 @@ while (<INPUT>)
close INPUT;
+$last_code = 0x10FFFF;
+
if ($do_props) {
&print_tables ($last_code)
}
@@ -501,6 +579,8 @@ if ($do_decomp) {
&print_line_break ($last_code);
+&print_grapheme_break ($last_code);
+
exit 0;
@@ -971,6 +1051,72 @@ sub print_line_break
printf STDERR "Generated %d bytes in break tables\n", $bytes_out;
}
+# Write gunigraphemebreak.h, which holds the grapheme cluster break
+# property tables: the per-page data array followed by two index tables
+# (code points up to G_UNICODE_LAST_CHAR_PART1, and U+E0000 onwards).
+#
+# Argument: the last code point to cover.
+# Dies if the output file cannot be opened or fully written.
+#
+# Relies on names defined elsewhere in this script: $pages_before_e0000,
+# @row, $table_index and print_row.  NOTE(review): print_row presumably
+# writes one 256-entry page to OUT and returns the text stored in the index
+# tables -- confirm against its definition.
+sub print_grapheme_break
+{
+  my ($last) = @_;
+  my ($outfile) = "gunigraphemebreak.h";
+
+  # Kept as a dynamically scoped "local" (not "my") so that subs called
+  # from here see this counter rather than an outer one.
+  local ($bytes_out) = 0;
+
+  print "Writing $outfile...\n";
+
+  # An unchecked open would let every print below fail silently and leave
+  # no header behind; stop with a diagnostic instead.
+  open (OUT, ">", $outfile) or die "Cannot open $outfile for writing: $!\n";
+
+  print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
+  print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";
+
+  print OUT "#ifndef G_UNIGRAPHEMEBREAKTABLES_H\n";
+  print OUT "#define G_UNIGRAPHEMEBREAKTABLES_H\n\n";
+
+  print OUT "#include <glib/gtypes.h>\n";
+  print OUT "#include <glib/gunicode.h>\n\n";
+
+  print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";
+
+  printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last;
+
+  printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
+
+  my $last_part1 = ($pages_before_e0000 * 256) - 1;
+  printf OUT "/* the last code point that should be looked up in grapheme_break_property_table_part1 */\n";
+  printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
+
+  # First level: the page data.  The value print_row returns for each
+  # 256-code-point page is remembered in @row for the index tables below.
+  $table_index = 0;
+  printf OUT "static const gint8 grapheme_break_property_data[][256] = {\n";
+  for ($count = 0; $count <= $last; $count += 256)
+  {
+    $row[$count / 256] = &print_row ($count, 1, \&fetch_grapheme_break_type);
+  }
+  printf OUT "\n};\n\n";
+
+  # Second level, part 1: one gint16 entry per page below the part-1 limit.
+  printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
+  print OUT "static const gint16 grapheme_break_property_table_part1[$pages_before_e0000] = {\n";
+  for ($count = 0; $count <= $last_part1; $count += 256)
+  {
+    print OUT ",\n" if $count > 0;
+    print OUT " ", $row[$count / 256];
+    $bytes_out += 2;
+  }
+  print OUT "\n};\n\n";
+
+  # Second level, part 2: pages from U+E0000 up to $last.  The declared
+  # size 768 equals (0x10FFFF + 1 - 0xE0000) / 256.
+  printf OUT "/* U+E0000 through U+%04X */\n", $last;
+  print OUT "static const gint16 grapheme_break_property_table_part2[768] = {\n";
+  for ($count = 0xE0000; $count <= $last; $count += 256)
+  {
+    print OUT ",\n" if $count > 0xE0000;
+    print OUT " ", $row[$count / 256];
+    $bytes_out += 2;
+  }
+  print OUT "\n};\n\n";
+
+
+  print OUT "#endif /* G_UNIGRAPHEMEBREAKTABLES_H */\n";
+
+  # Buffered write errors (e.g. a full disk) only surface at close.
+  close (OUT) or die "Error writing $outfile: $!\n";
+
+  printf STDERR "Generated %d bytes in grapheme break tables\n", $bytes_out;
+}
+
# A fetch function for the break properties table.
sub fetch_break_type
@@ -979,6 +1125,13 @@ sub fetch_break_type
return $break_mappings{$break_props[$index]};
}
+# Fetcher for the grapheme break table: looks up the property value recorded
+# for code point $index in @grapheme_break_props and returns the identifier
+# that %grapheme_break_mappings associates with it.
+sub fetch_grapheme_break_type
+{
+  my ($index) = @_;
+  my $property = $grapheme_break_props[$index];
+  return $grapheme_break_mappings{$property};
+}
+
# Fetcher for combining class.
sub fetch_cclass
{
@@ -1335,6 +1488,3 @@ EOT
my $recordlen = (2+$casefoldlen+1) & ~1;
printf "Generated %d bytes for casefold table\n", $recordlen * @casefold;
}
-
-
-