summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaiki Ueno <ueno@gnu.org>2015-06-24 16:09:03 +0900
committerDaiki Ueno <ueno@gnu.org>2015-06-24 16:43:28 +0900
commit9e26a8ba2e3c40e41d8f639544c63ceb248c8629 (patch)
tree7c978c83da4bb83ec9989de94c7e72fef68a93d3
parent4b0cfb0e39796400149767bdeb6097927895635a (diff)
downloadlibunistring-9e26a8ba2e3c40e41d8f639544c63ceb248c8629.tar.gz
maint: Add scripts to update Unicode data files
-rw-r--r--Admin/README.update59
-rwxr-xr-xAdmin/containing110
-rwxr-xr-xAdmin/dependent120
-rwxr-xr-xAdmin/gen-uni-tables160
-rwxr-xr-xAdmin/gen-uninames116
5 files changed, 565 insertions, 0 deletions
diff --git a/Admin/README.update b/Admin/README.update
new file mode 100644
index 0000000..2e178f9
--- /dev/null
+++ b/Admin/README.update
@@ -0,0 +1,59 @@
+-*- outline -*-
+
+This file attempts to describe the procedure of updating Gnulib to the
+latest Unicode standard.
+
+First, read the Unicode release note carefully and understand any
+changes which might affect Gnulib. In particular, changes in the text
+segmentation algorithms (provided by unilbrk.h, uniwbrk.h, unigbrk.h)
+often require manual adjustment of files.
+
+* Regenerating the Gnulib source code
+
+The tables in Gnulib are updated using a couple of tools:
+'lib/gen-uni-tables.c' and 'lib/uniname/gen-uninames.lisp'. To make
+things easier, wrapper scripts are included in this directory:
+'gen-uni-tables' and 'gen-uninames'.
+
+Those scripts respect the GNULIB_SRCDIR and UCD environment variables.
+GNULIB_SRCDIR points to the absolute path of the Gnulib checkout, and
+UCD points to the directory containing the Unicode UCD.
+
+ $ export GNULIB_SRCDIR=..
+ $ export UCD=..
+ $ ./gen-uni-tables
+ $ ./gen-uninames
+
+Note that 'gen-uni-tables' sometimes fails when a large portion of
+Unicode data changes. In that case, errors are reported as assertion
+failures and you will need to adjust 'lib/gen-uni-tables.c'.
+
+Also note that the files under 'lib/uniwidth' and 'tests/uniwidth' are
+not updated automatically. You need to merge the generated *.part
+file by hand.
+
+* Updating the version numbers of modules
+
+The 'libunistring-optional' module in Gnulib allows you to use the
+system libunistring when possible (i.e., when certain modules are
+equal to or older than the system libunistring). This is done by
+checking the version specified with gl_LIBUNISTRING_MODULE invocation
+in modules:
+
+ $ cat modules/uniwidth/width
+ ...
+ configure.ac:
+ gl_LIBUNISTRING_MODULE([0.9.6], [uniwidth/width])
+ ...
+
+If you update Gnulib to the new Unicode standard, you also need to
+bump the version of affected modules. To identify affected modules,
+you can use the following command:
+
+ $ (cd $GNULIB_SRCDIR && git show --oneline --name-only $COMMIT | tail -n+2) \
+ | ./containing | LC_ALL=C sort | uniq \
+ | ./dependent | LC_ALL=C sort | uniq
+
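+where 'containing' and 'dependent' are scripts included here, and
+$COMMIT is the Gnulib commit whose changed files should be examined.
+Both scripts locate the module descriptions through GNULIB_SRCDIR.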
+
+Daiki Ueno <ueno@gnu.org>
diff --git a/Admin/containing b/Admin/containing
new file mode 100755
index 0000000..2ab1dd2
--- /dev/null
+++ b/Admin/containing
@@ -0,0 +1,110 @@
+#!/bin/sh
+#
+# Copyright (C) 2015 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+progname=$0
+
+# func_exit STATUS
+# terminates the script with exit status STATUS.
+# Call this instead of a bare 'exit' whenever a 'trap' handler that looks
+# at $? is installed.
+func_exit ()
+{
+  # Run 'exit' in a subshell first so that $? already holds STATUS by the
+  # time the real 'exit' below is executed.
+  (exit $1)
+  exit $1
+}
+
+# func_tmpdir
+# creates a temporary directory.
+# Input:
+# - progname name of this program
+# Sets variable
+# - tmp pathname of freshly created temporary directory
+func_tmpdir ()
+{
+ # Use the environment variable TMPDIR, falling back to /tmp. This allows
+ # users to specify a different temporary directory, for example, if their
+ # /tmp is filled up or too small.
+ : ${TMPDIR=/tmp}
+ {
+ # Use the mktemp program if available. If not available, hide the error
+ # message.
+ tmp=`(umask 077 && mktemp -d "$TMPDIR/glXXXXXX") 2>/dev/null` &&
+ test -n "$tmp" && test -d "$tmp"
+ } ||
+ {
+ # Use a simple mkdir command. It is guaranteed to fail if the directory
+ # already exists. $RANDOM is bash specific and expands to empty in shells
+ # other than bash, ksh and zsh. Its use does not increase security;
+ # rather, it minimizes the probability of failure in a very cluttered /tmp
+ # directory.
+ tmp=$TMPDIR/gl$$-$RANDOM
+ (umask 077 && mkdir "$tmp")
+ } ||
+ {
+ echo "$progname: cannot create a temporary directory in $TMPDIR" >&2
+ func_exit 1
+ }
+}
+
+func_tmpdir
+trap 'exit_status=$?
+ if test "$signal" != 0; then
+ echo "caught signal $signal" >&2
+ fi
+ rm -rf "$tmp"
+ exit $exit_status' 0
+for signal in 1 2 3 13 15; do
+ trap '{ signal='$signal'; func_exit 1; }' $signal
+done
+signal=0
+
+sed_literal_to_basic_regex='s/\\/\\\\/g
+s/\[/\\[/g
+s/\^/\\^/g
+s/\([.*$]\)/[\1]/g'
+
+while read file; do
+ file_regex=`echo "$file" | sed -e "$sed_literal_to_basic_regex"`
+ if grep "^$file_regex " "$tmp/cache" > "$tmp/result" 2>/dev/null; then
+ sed -n -e 's,.* \([^ ]*\)$,\1,p' < "$tmp/result"
+ else
+ (cd "$GNULIB_SRCDIR" \
+ && find modules -type f -print | while read modfile; do
+ sed -n -e '/^Files:/,/^$/p' < "$modfile" \
+ | sed -e '/^Files:/d' -e '/^$/d' > "$tmp/files"
+ module=$(echo $modfile | sed -e 's,modules/,,')
+ module_regex=`echo "$module" | sed -e "$sed_literal_to_basic_regex"`
+ if test ! -f "$tmp/cache" \
+ || ! grep " $module_regex\$" "$tmp/cache" >/dev/null 2>&1; then
+ sed -e "s,\$, $module," < "$tmp/files" >> "$tmp/cache"
+ fi
+ grep "^$file_regex$" "$tmp/files" > /dev/null 2>&1 && echo "$module"
+ done)
+ fi
+done
+
+rm -rf "$tmp"
+
+# Undo the effect of the previous 'trap' command. Some shellology:
+# We cannot use "trap - 0 1 2 3 13 15", because Solaris sh would attempt to
+# execute the command "-". "trap '' ..." is fine only for signal 0 (= normal
+# exit); for the others we need to call 'exit' explicitly. The value of $? is
+# 128 + signal number and is set before the trap-registered command is run.
+trap '' 0
+trap 'func_exit $?' 1 2 3 13 15
+
+exit 0
diff --git a/Admin/dependent b/Admin/dependent
new file mode 100755
index 0000000..67893ae
--- /dev/null
+++ b/Admin/dependent
@@ -0,0 +1,120 @@
+#!/bin/sh
+#
+# Copyright (C) 2015 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+progname=$0
+
+# func_exit STATUS
+# terminates the script with exit status STATUS.
+# Call this instead of a bare 'exit' whenever a 'trap' handler that looks
+# at $? is installed.
+func_exit ()
+{
+  # Run 'exit' in a subshell first so that $? already holds STATUS by the
+  # time the real 'exit' below is executed.
+  (exit $1)
+  exit $1
+}
+
+# func_tmpdir
+# creates a temporary directory.
+# Input:
+# - progname name of this program
+# Sets variable
+# - tmp pathname of freshly created temporary directory
+func_tmpdir ()
+{
+ # Use the environment variable TMPDIR, falling back to /tmp. This allows
+ # users to specify a different temporary directory, for example, if their
+ # /tmp is filled up or too small.
+ : ${TMPDIR=/tmp}
+ {
+ # Use the mktemp program if available. If not available, hide the error
+ # message.
+ tmp=`(umask 077 && mktemp -d "$TMPDIR/glXXXXXX") 2>/dev/null` &&
+ test -n "$tmp" && test -d "$tmp"
+ } ||
+ {
+ # Use a simple mkdir command. It is guaranteed to fail if the directory
+ # already exists. $RANDOM is bash specific and expands to empty in shells
+ # other than bash, ksh and zsh. Its use does not increase security;
+ # rather, it minimizes the probability of failure in a very cluttered /tmp
+ # directory.
+ tmp=$TMPDIR/gl$$-$RANDOM
+ (umask 077 && mkdir "$tmp")
+ } ||
+ {
+ echo "$progname: cannot create a temporary directory in $TMPDIR" >&2
+ func_exit 1
+ }
+}
+
+func_tmpdir
+trap 'exit_status=$?
+ if test "$signal" != 0; then
+ echo "caught signal $signal" >&2
+ fi
+ rm -rf "$tmp"
+ exit $exit_status' 0
+for signal in 1 2 3 13 15; do
+ trap '{ signal='$signal'; func_exit 1; }' $signal
+done
+signal=0
+
+sed_literal_to_basic_regex='s/\\/\\\\/g
+s/\[/\\[/g
+s/\^/\\^/g
+s/\([.*$]\)/[\1]/g'
+
+# Populate the cache
+(cd "$GNULIB_SRCDIR" \
+ && find modules -type f -print | while read modfile; do
+ sed -n -e '/^Depends-on:/,/^$/p' < "$modfile" \
+ | sed -e '/^Depends-on:/d' -e '/^$/d' > "$tmp/dependencies"
+ dependent=$(echo $modfile | sed -e 's,modules/,,')
+ dependent_regex=`echo "$dependent" | sed -e "$sed_literal_to_basic_regex"`
+ if test ! -f "$tmp/cache" \
+ || ! grep " $dependent_regex\$" "$tmp/cache" >/dev/null 2>&1; then
+ sed -e "s,\$, $dependent," < "$tmp/dependencies" >> "$tmp/cache"
+ fi
+ done)
+
+# func_get_dependent MODULE
+# prints, one per line, the modules that depend on MODULE, either directly
+# or through other modules. A name may be printed more than once.
+# Input:
+# - tmp pathname of the temporary directory
+# - sed_literal_to_basic_regex sed script quoting a literal for grep
+# - "$tmp/cache" lines that start with a dependency and end with the
+# module depending on it
+# Uses "$tmp/visited" to remember which modules were already expanded.
+func_get_dependent ()
+{
+  # Expand every module only once. This keeps the recursion finite if the
+  # dependencies contain a cycle and avoids walking shared parts of the
+  # graph again and again. The set of printed names is unchanged.
+  if grep -F -x -e "$1" "$tmp/visited" >/dev/null 2>&1; then
+    return 0
+  fi
+  printf '%s\n' "$1" >> "$tmp/visited"
+
+  module_regex=`printf '%s\n' "$1" | sed -e "$sed_literal_to_basic_regex"`
+
+  # Feed grep straight into sed rather than going through a shared
+  # "$tmp/result" file: a nested call would overwrite that file while the
+  # outer call is still reading from it.
+  grep "^$module_regex " "$tmp/cache" 2>/dev/null \
+    | sed -n -e 's,.* \([^ ]*\)$,\1,p' \
+    | while read dependent; do
+        echo "$dependent"
+        func_get_dependent "$dependent"
+      done
+}
+
+# Echo every module name read from standard input, followed by whatever
+# func_get_dependent prints for it.
+while read start; do
+  echo "$start"
+  func_get_dependent "$start"
+done
+
+# Regular completion: the scratch directory is no longer needed.
+rm -rf "$tmp"
+
+# Neutralize the handlers installed earlier. "trap - 0 1 2 3 13 15" is not
+# an option, because Solaris sh would try to run a command named "-".
+# An empty handler is fine only for signal 0 (= normal exit); for the real
+# signals 'exit' has to be called explicitly. At that point $? is
+# 128 + signal number; it is set before the handler is run.
+trap '' 0
+trap 'func_exit $?' 1 2 3 13 15
+
+exit 0
diff --git a/Admin/gen-uni-tables b/Admin/gen-uni-tables
new file mode 100755
index 0000000..21d05a9
--- /dev/null
+++ b/Admin/gen-uni-tables
@@ -0,0 +1,160 @@
+#!/bin/sh
+#
+# Copyright (C) 2015 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+progname=$0
+
+# func_exit STATUS
+# terminates the script with exit status STATUS.
+# Call this instead of a bare 'exit' whenever a 'trap' handler that looks
+# at $? is installed.
+func_exit ()
+{
+  # Run 'exit' in a subshell first so that $? already holds STATUS by the
+  # time the real 'exit' below is executed.
+  (exit $1)
+  exit $1
+}
+
+# func_fatal_error message
+# writes MESSAGE and a "Stop." line to stderr, each prefixed with the
+# program name, and then terminates the program with status 1.
+# Input:
+# - progname name of this program
+func_fatal_error ()
+{
+  {
+    echo "$progname: *** $1"
+    echo "$progname: *** Stop."
+  } 1>&2
+  func_exit 1
+}
+
+# func_tmpdir
+# creates a temporary directory.
+# Input:
+# - progname name of this program
+# Sets variable
+# - tmp pathname of freshly created temporary directory
+func_tmpdir ()
+{
+ # Use the environment variable TMPDIR, falling back to /tmp. This allows
+ # users to specify a different temporary directory, for example, if their
+ # /tmp is filled up or too small.
+ : ${TMPDIR=/tmp}
+ {
+ # Use the mktemp program if available. If not available, hide the error
+ # message.
+ tmp=`(umask 077 && mktemp -d "$TMPDIR/glXXXXXX") 2>/dev/null` &&
+ test -n "$tmp" && test -d "$tmp"
+ } ||
+ {
+ # Use a simple mkdir command. It is guaranteed to fail if the directory
+ # already exists. $RANDOM is bash specific and expands to empty in shells
+ # other than bash, ksh and zsh. Its use does not increase security;
+ # rather, it minimizes the probability of failure in a very cluttered /tmp
+ # directory.
+ tmp=$TMPDIR/gl$$-$RANDOM
+ (umask 077 && mkdir "$tmp")
+ } ||
+ {
+ echo "$progname: cannot create a temporary directory in $TMPDIR" >&2
+ func_exit 1
+ }
+}
+
+test -n "$GNULIB_SRCDIR" || \
+ func_fatal_error "$progname: GNULIB_SRCDIR is not set"
+
+test -n "$UCD" || \
+ func_fatal_error "$progname: UCD is not set"
+
+func_tmpdir
+trap 'exit_status=$?
+ if test "$signal" != 0; then
+ echo "caught signal $signal" >&2
+ fi
+ rm -rf "$tmp"
+ exit $exit_status' 0
+for signal in 1 2 3 13 15; do
+ trap '{ signal='$signal'; func_exit 1; }' $signal
+done
+signal=0
+
+# Compile lib/gen-uni-tables.c in a temporary directory
+: ${CC=gcc}
+"$CC" -O0 -g -Wall "$GNULIB_SRCDIR/lib/gen-uni-tables.c" \
+ -I"$GNULIB_SRCDIR/lib/unictype" -o "$tmp/gen-uni-tables" \
+ || func_fatal_error "cannot compile gen-uni-tables.c"
+
+: ${WGET=wget}
+"$WGET" -q http://www.unicode.org/Public/3.0-Update1/PropList-3.0.1.txt \
+ -O "$tmp/PropList-3.0.1.txt" \
+ || func_fatal_error "cannot fetch PropList-3.0.1.txt"
+
+ver=`sed -n -e 's/.*Version \([0-9.]*\).*/\1/p' < "$UCD/ReadMe.txt"`
+
+(cd "$GNULIB_SRCDIR/lib" \
+ && "$tmp/gen-uni-tables" "$UCD/UnicodeData.txt" \
+ "$UCD/PropList.txt" \
+ "$UCD/DerivedCoreProperties.txt" \
+ "$UCD/ArabicShaping.txt" \
+ "$UCD/Scripts.txt" \
+ "$UCD/Blocks.txt" \
+ "$tmp/PropList-3.0.1.txt" \
+ "$UCD/EastAsianWidth.txt" \
+ "$UCD/LineBreak.txt" \
+ "$UCD/auxiliary/WordBreakProperty.txt" \
+ "$UCD/auxiliary/GraphemeBreakProperty.txt" \
+ "$UCD/CompositionExclusions.txt" \
+ "$UCD/SpecialCasing.txt" \
+ "$UCD/CaseFolding.txt" \
+ "$ver") \
+ || func_fatal_error "error running gen-uni-tables"
+
+: ${DIFF=diff}
+"$DIFF" "$GNULIB_SRCDIR/lib/unilbrk/lbrkprop_org.txt" \
+ "$GNULIB_SRCDIR/lib/unilbrk/lbrkprop.txt" \
+ || func_fatal_error "lbrkprop is not updated properly"
+
+"$DIFF" "$GNULIB_SRCDIR/lib/uniwbrk/wbrkprop_org.txt" \
+ "$GNULIB_SRCDIR/lib/uniwbrk/wbrkprop.txt" \
+ || func_fatal_error "wbrkprop is not updated properly"
+
+# Copy necessary files from UCD
+for dstfile in "$GNULIB_SRCDIR/tests/uninorm/NormalizationTest.txt" \
+ "$GNULIB_SRCDIR/tests/uniwbrk/WordBreakTest.txt" \
+ "$GNULIB_SRCDIR/tests/unigbrk/GraphemeBreakTest.txt" \
+ "$GNULIB_SRCDIR/tests/uniname/UnicodeData.txt" \
+ "$GNULIB_SRCDIR/tests/uniname/NameAliases.txt" ; do
+ srcfile=`expr "$dstfile" : '.*/\(.*\)'`
+ if test -f "$UCD/$srcfile"; then
+ srcfile="$UCD/$srcfile"
+ elif test -f "$UCD/auxiliary/$srcfile"; then
+ srcfile="$UCD/auxiliary/$srcfile"
+ else
+ func_fatal_error "cannot find $srcfile"
+ fi
+
+ sed 's/ *$//' < "$srcfile" > "$dstfile"
+done
+
+rm -rf "$tmp"
+
+# Undo the effect of the previous 'trap' command. Some shellology:
+# We cannot use "trap - 0 1 2 3 13 15", because Solaris sh would attempt to
+# execute the command "-". "trap '' ..." is fine only for signal 0 (= normal
+# exit); for the others we need to call 'exit' explicitly. The value of $? is
+# 128 + signal number and is set before the trap-registered command is run.
+trap '' 0
+trap 'func_exit $?' 1 2 3 13 15
+
+exit 0
diff --git a/Admin/gen-uninames b/Admin/gen-uninames
new file mode 100755
index 0000000..2b0771a
--- /dev/null
+++ b/Admin/gen-uninames
@@ -0,0 +1,116 @@
+#!/bin/sh
+#
+# Copyright (C) 2015 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+progname=$0
+
+# func_exit STATUS
+# terminates the script with exit status STATUS.
+# Call this instead of a bare 'exit' whenever a 'trap' handler that looks
+# at $? is installed.
+func_exit ()
+{
+  # Run 'exit' in a subshell first so that $? already holds STATUS by the
+  # time the real 'exit' below is executed.
+  (exit $1)
+  exit $1
+}
+
+# func_fatal_error message
+# writes MESSAGE and a "Stop." line to stderr, each prefixed with the
+# program name, and then terminates the program with status 1.
+# Input:
+# - progname name of this program
+func_fatal_error ()
+{
+  {
+    echo "$progname: *** $1"
+    echo "$progname: *** Stop."
+  } 1>&2
+  func_exit 1
+}
+
+# func_tmpdir
+# creates a temporary directory.
+# Input:
+# - progname name of this program
+# Sets variable
+# - tmp pathname of freshly created temporary directory
+func_tmpdir ()
+{
+ # Use the environment variable TMPDIR, falling back to /tmp. This allows
+ # users to specify a different temporary directory, for example, if their
+ # /tmp is filled up or too small.
+ : ${TMPDIR=/tmp}
+ {
+ # Use the mktemp program if available. If not available, hide the error
+ # message.
+ tmp=`(umask 077 && mktemp -d "$TMPDIR/glXXXXXX") 2>/dev/null` &&
+ test -n "$tmp" && test -d "$tmp"
+ } ||
+ {
+ # Use a simple mkdir command. It is guaranteed to fail if the directory
+ # already exists. $RANDOM is bash specific and expands to empty in shells
+ # other than bash, ksh and zsh. Its use does not increase security;
+ # rather, it minimizes the probability of failure in a very cluttered /tmp
+ # directory.
+ tmp=$TMPDIR/gl$$-$RANDOM
+ (umask 077 && mkdir "$tmp")
+ } ||
+ {
+ echo "$progname: cannot create a temporary directory in $TMPDIR" >&2
+ func_exit 1
+ }
+}
+
+test -n "$GNULIB_SRCDIR" || \
+ func_fatal_error "$progname: GNULIB_SRCDIR is not set"
+
+test -n "$UCD" || \
+ func_fatal_error "$progname: UCD is not set"
+
+func_tmpdir
+trap 'exit_status=$?
+ if test "$signal" != 0; then
+ echo "caught signal $signal" >&2
+ fi
+ rm -rf "$tmp"
+ exit $exit_status' 0
+for signal in 1 2 3 13 15; do
+ trap '{ signal='$signal'; func_exit 1; }' $signal
+done
+signal=0
+
+: ${CLISP=clisp}
+("$CLISP" --version) >/dev/null 2>/dev/null \
+ || { echo "$progname: *** clisp not found; skipping uniname" 1>&2; exit 0; }
+
+sed -e '/^[0-9A-F]*;</d' "$UCD/UnicodeData.txt" > "$tmp/UnicodeDataNames.txt"
+
+sed -e 's/ *$//' -e '/^#/d' -e '/^$/d' < "$UCD/NameAliases.txt" \
+ > "$tmp/NameAliases.txt"
+
+"$CLISP" "$GNULIB_SRCDIR/lib/uniname/gen-uninames.lisp" \
+ "$tmp/UnicodeDataNames.txt" \
+ "$GNULIB_SRCDIR/lib/uniname/uninames.h" \
+ "$tmp/NameAliases.txt"
+
+rm -rf "$tmp"
+
+# Undo the effect of the previous 'trap' command. Some shellology:
+# We cannot use "trap - 0 1 2 3 13 15", because Solaris sh would attempt to
+# execute the command "-". "trap '' ..." is fine only for signal 0 (= normal
+# exit); for the others we need to call 'exit' explicitly. The value of $? is
+# 128 + signal number and is set before the trap-registered command is run.
+trap '' 0
+trap 'func_exit $?' 1 2 3 13 15
+
+exit 0