From 9e26a8ba2e3c40e41d8f639544c63ceb248c8629 Mon Sep 17 00:00:00 2001
From: Daiki Ueno
Date: Wed, 24 Jun 2015 16:09:03 +0900
Subject: maint: Add scripts to update Unicode data files

---
 Admin/README.update  |  59 +++++++++++++++++++
 Admin/containing     | 110 +++++++++++++++++++++++++++++++++++
 Admin/dependent      | 120 ++++++++++++++++++++++++++++++++++++++
 Admin/gen-uni-tables | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++
 Admin/gen-uninames   | 116 +++++++++++++++++++++++++++++++++++++
 5 files changed, 565 insertions(+)
 create mode 100644 Admin/README.update
 create mode 100755 Admin/containing
 create mode 100755 Admin/dependent
 create mode 100755 Admin/gen-uni-tables
 create mode 100755 Admin/gen-uninames

diff --git a/Admin/README.update b/Admin/README.update
new file mode 100644
index 0000000..2e178f9
--- /dev/null
+++ b/Admin/README.update
@@ -0,0 +1,59 @@
+-*- outline -*-
+
+This file attempts to describe the procedure of updating Gnulib to the
+latest Unicode standard.
+
+First, read the Unicode release note carefully and understand any
+changes which might affect Gnulib. In particular, changes in the text
+segmentation algorithms (provided by unilbrk.h, uniwbrk.h, unigbrk.h)
+often require manual adjustment of files.
+
+* Regenerating the Gnulib source code
+
+The tables in Gnulib are updated using a couple of tools:
+'lib/gen-uni-tables.c' and 'lib/uniname/gen-uninames.lisp'. To make
+things easier, wrapper scripts are included in this directory:
+'gen-uni-tables' and 'gen-uninames'.
+
+Those scripts respect the GNULIB_SRCDIR and UCD environment variables.
+GNULIB_SRCDIR points to the absolute path of the Gnulib checkout, and
+UCD points to the directory containing the Unicode UCD.
+
+  $ export GNULIB_SRCDIR=..
+  $ export UCD=..
+  $ ./gen-uni-tables
+  $ ./gen-uninames
+
+Note that 'gen-uni-tables' sometimes fails when a large portion of
+Unicode data changes. In that case, errors are reported as assertion
+failures and you will need to adjust 'lib/gen-uni-tables.c'.
+
+Also note that the files under 'lib/uniwidth' and 'tests/uniwidth' are
+not updated automatically. You need to merge the generated *.part
+file by hand.
+
+* Updating the version numbers of modules
+
+The 'libunistring-optional' module in Gnulib allows you to use the
+system libunistring when possible (i.e., when certain modules are
+equal to or older than the system libunistring). This is done by
+checking the version specified with the gl_LIBUNISTRING_MODULE invocation
+in modules:
+
+  $ cat modules/uniwidth/width
+  ...
+  configure.ac:
+  gl_LIBUNISTRING_MODULE([0.9.6], [uniwidth/width])
+  ...
+
+If you update Gnulib to the new Unicode standard, you also need to
+bump the version of affected modules. To identify affected modules,
+you can use the following command:
+
+  $ (cd $GNULIB_SRCDIR && git show --oneline --name-only $COMMIT | tail -n+2) \
+    | ./containing | LC_ALL=C sort | uniq \
+    | ./dependent | LC_ALL=C sort | uniq
+
+where 'containing' and 'dependent' are scripts included here.
+
+Daiki Ueno
diff --git a/Admin/containing b/Admin/containing
new file mode 100755
index 0000000..2ab1dd2
--- /dev/null
+++ b/Admin/containing
@@ -0,0 +1,110 @@
+#!/bin/sh
+#
+# Copyright (C) 2015 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+progname=$0
+
+# func_exit STATUS
+# exits with a given status.
+# This function needs to be used, rather than 'exit', when a 'trap' handler is
+# in effect that refers to $?.
+func_exit ()
+{
+  (exit $1); exit $1
+}
+
+# func_tmpdir
+# creates a temporary directory.
+# Input:
+# - progname name of this program
+# Sets variable
+# - tmp pathname of freshly created temporary directory
+func_tmpdir ()
+{
+  # Use the environment variable TMPDIR, falling back to /tmp. This allows
+  # users to specify a different temporary directory, for example, if their
+  # /tmp is filled up or too small.
+  : ${TMPDIR=/tmp}
+  {
+    # Use the mktemp program if available. If not available, hide the error
+    # message.
+    tmp=`(umask 077 && mktemp -d "$TMPDIR/glXXXXXX") 2>/dev/null` &&
+    test -n "$tmp" && test -d "$tmp"
+  } ||
+  {
+    # Use a simple mkdir command. It is guaranteed to fail if the directory
+    # already exists. $RANDOM is bash specific and expands to empty in shells
+    # other than bash, ksh and zsh. Its use does not increase security;
+    # rather, it minimizes the probability of failure in a very cluttered /tmp
+    # directory.
+    tmp=$TMPDIR/gl$$-$RANDOM
+    (umask 077 && mkdir "$tmp")
+  } ||
+  {
+    echo "$progname: cannot create a temporary directory in $TMPDIR" >&2
+    func_exit 1
+  }
+}
+
+: "${GNULIB_SRCDIR:?must be set to the path of the Gnulib checkout}"
+func_tmpdir
+trap 'exit_status=$?
+      if test "$signal" != 0; then
+        echo "caught signal $signal" >&2
+      fi
+      rm -rf "$tmp"
+      exit $exit_status' 0
+for signal in 1 2 3 13 15; do
+  trap '{ signal='$signal'; func_exit 1; }' $signal
+done
+signal=0
+
+sed_literal_to_basic_regex='s/\\/\\\\/g
+s/\[/\\[/g
+s/\^/\\^/g
+s/\([.*$]\)/[\1]/g'
+
+# Print the modules whose "Files:" section lists each file name read from stdin.
+while IFS= read -r file; do
+  file_regex=`echo "$file" | sed -e "$sed_literal_to_basic_regex"`
+  if grep "^$file_regex " "$tmp/cache" > "$tmp/result" 2>/dev/null; then
+    sed -n -e 's,.* \([^ ]*\)$,\1,p' < "$tmp/result"
+  else
+    (cd "$GNULIB_SRCDIR" \
+     && find modules -type f -print | while IFS= read -r modfile; do
+      sed -n -e '/^Files:/,/^$/p' < "$modfile" \
+        | sed -e '/^Files:/d' -e '/^$/d' > "$tmp/files"
+      module=${modfile#modules/}
+      module_regex=`echo "$module" | sed -e "$sed_literal_to_basic_regex"`
+      if ! grep " $module_regex\$" "$tmp/cache" >/dev/null 2>&1; then
+        sed -e "s,\$, $module," < "$tmp/files" >> "$tmp/cache"
+      fi
+      grep "^$file_regex$" "$tmp/files" > /dev/null 2>&1 && echo "$module"
+    done)
+  fi
+done
+
+rm -rf "$tmp"
+
+# Undo the effect of the previous 'trap' command. Some shellology: we cannot
+# use "trap - 0 1 2 3 13 15", because Solaris sh would attempt to execute the
+# command "-". "trap '' ..." is fine only for signal 0 (= normal exit); the
+# others must call 'exit' explicitly with $?, which is 128 + signal number.
+trap '' 0
+trap 'func_exit $?' 1 2 3 13 15
+
+exit 0
diff --git a/Admin/dependent b/Admin/dependent
new file mode 100755
index 0000000..67893ae
--- /dev/null
+++ b/Admin/dependent
@@ -0,0 +1,120 @@
+#!/bin/sh
+#
+# Copyright (C) 2015 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+progname=$0
+
+# func_exit STATUS
+# exits with a given status.
+# This function needs to be used, rather than 'exit', when a 'trap' handler is
+# in effect that refers to $?.
+func_exit ()
+{
+  (exit $1); exit $1
+}
+
+# func_tmpdir
+# creates a temporary directory.
+# Input:
+# - progname name of this program
+# Sets variable
+# - tmp pathname of freshly created temporary directory
+func_tmpdir ()
+{
+  # Use the environment variable TMPDIR, falling back to /tmp. This allows
+  # users to specify a different temporary directory, for example, if their
+  # /tmp is filled up or too small.
+  : ${TMPDIR=/tmp}
+  {
+    # Use the mktemp program if available. If not available, hide the error
+    # message.
+    tmp=`(umask 077 && mktemp -d "$TMPDIR/glXXXXXX") 2>/dev/null` &&
+    test -n "$tmp" && test -d "$tmp"
+  } ||
+  {
+    # Use a simple mkdir command. It is guaranteed to fail if the directory
+    # already exists. $RANDOM is bash specific and expands to empty in shells
+    # other than bash, ksh and zsh. Its use does not increase security;
+    # rather, it minimizes the probability of failure in a very cluttered /tmp
+    # directory.
+    tmp=$TMPDIR/gl$$-$RANDOM
+    (umask 077 && mkdir "$tmp")
+  } ||
+  {
+    echo "$progname: cannot create a temporary directory in $TMPDIR" >&2
+    func_exit 1
+  }
+}
+
+: "${GNULIB_SRCDIR:?must be set to the path of the Gnulib checkout}"
+func_tmpdir
+trap 'exit_status=$?
+      if test "$signal" != 0; then
+        echo "caught signal $signal" >&2
+      fi
+      rm -rf "$tmp"
+      exit $exit_status' 0
+for signal in 1 2 3 13 15; do
+  trap '{ signal='$signal'; func_exit 1; }' $signal
+done
+signal=0
+
+sed_literal_to_basic_regex='s/\\/\\\\/g
+s/\[/\\[/g
+s/\^/\\^/g
+s/\([.*$]\)/[\1]/g'
+
+# Populate the cache: each Depends-on line of a module, followed by its name.
+(cd "$GNULIB_SRCDIR" \
+ && find modules -type f -print | while IFS= read -r modfile; do
+    sed -n -e '/^Depends-on:/,/^$/p' < "$modfile" \
+      | sed -e '/^Depends-on:/d' -e '/^$/d' > "$tmp/dependencies"
+    dependent=${modfile#modules/}
+    sed -e "s,\$, $dependent," < "$tmp/dependencies" >> "$tmp/cache"
+  done)
+
+# func_get_dependent MODULE
+# prints the modules that depend on MODULE, directly or indirectly, one per
+# line. Duplicates are possible and there is no protection against cycles.
+# grep feeds sed through a pipe, because a shared "$tmp/result" file would be
+# truncated by the nested calls while the outer loop is still reading it.
+func_get_dependent ()
+{
+  module_regex=`echo "$1" | sed -e "$sed_literal_to_basic_regex"`
+  grep "^$module_regex " "$tmp/cache" 2>/dev/null \
+    | sed -n -e 's,.* \([^ ]*\)$,\1,p' | while IFS= read -r dependent; do
+      echo "$dependent"
+      func_get_dependent "$dependent"
+    done
+}
+
+# Print each module name read from stdin, followed by its dependents.
+while IFS= read -r module; do
+  echo "$module"
+  func_get_dependent "$module"
+done
+
+rm -rf "$tmp"
+
+# Undo the effect of the previous 'trap' command. Some shellology: we cannot
+# use "trap - 0 1 2 3 13 15", because Solaris sh would attempt to execute the
+# command "-". "trap '' ..." is fine only for signal 0 (= normal exit); the
+# others must call 'exit' explicitly with $?, which is 128 + signal number.
+trap '' 0
+trap 'func_exit $?' 1 2 3 13 15
+
+exit 0
diff --git a/Admin/gen-uni-tables b/Admin/gen-uni-tables
new file mode 100755
index 0000000..21d05a9
--- /dev/null
+++ b/Admin/gen-uni-tables
@@ -0,0 +1,160 @@
+#!/bin/sh
+#
+# Copyright (C) 2015 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+progname=$0
+
+# func_exit STATUS
+# exits with a given status.
+# This function needs to be used, rather than 'exit', when a 'trap' handler is
+# in effect that refers to $?.
+func_exit ()
+{
+  (exit $1); exit $1
+}
+
+# func_fatal_error message
+# outputs to stderr a fatal error message, and terminates the program.
+# Input:
+# - progname name of this program
+func_fatal_error ()
+{
+  echo "$progname: *** $1" 1>&2
+  echo "$progname: *** Stop." 1>&2
+  func_exit 1
+}
+
+# func_tmpdir
+# creates a temporary directory.
+# Input:
+# - progname name of this program
+# Sets variable
+# - tmp pathname of freshly created temporary directory
+func_tmpdir ()
+{
+  # Use the environment variable TMPDIR, falling back to /tmp. This allows
+  # users to specify a different temporary directory, for example, if their
+  # /tmp is filled up or too small.
+  : ${TMPDIR=/tmp}
+  {
+    # Use the mktemp program if available. If not available, hide the error
+    # message.
+    tmp=`(umask 077 && mktemp -d "$TMPDIR/glXXXXXX") 2>/dev/null` &&
+    test -n "$tmp" && test -d "$tmp"
+  } ||
+  {
+    # Use a simple mkdir command. It is guaranteed to fail if the directory
+    # already exists. $RANDOM is bash specific and expands to empty in shells
+    # other than bash, ksh and zsh. Its use does not increase security;
+    # rather, it minimizes the probability of failure in a very cluttered /tmp
+    # directory.
+    tmp=$TMPDIR/gl$$-$RANDOM
+    (umask 077 && mkdir "$tmp")
+  } ||
+  {
+    echo "$progname: cannot create a temporary directory in $TMPDIR" >&2
+    func_exit 1
+  }
+}
+
+test -n "$GNULIB_SRCDIR" || func_fatal_error "GNULIB_SRCDIR is not set"
+test -n "$UCD" || func_fatal_error "UCD is not set"
+# UCD is used below after changing directory, so make it absolute.
+case $UCD in /*) ;; *) UCD=`pwd`/$UCD ;; esac
+
+func_tmpdir
+trap 'exit_status=$?
+      if test "$signal" != 0; then
+        echo "caught signal $signal" >&2
+      fi
+      rm -rf "$tmp"
+      exit $exit_status' 0
+for signal in 1 2 3 13 15; do
+  trap '{ signal='$signal'; func_exit 1; }' $signal
+done
+signal=0
+
+# Compile lib/gen-uni-tables.c in a temporary directory
+: ${CC=gcc}
+"$CC" -O0 -g -Wall "$GNULIB_SRCDIR/lib/gen-uni-tables.c" \
+  -I"$GNULIB_SRCDIR/lib/unictype" -o "$tmp/gen-uni-tables" \
+  || func_fatal_error "cannot compile gen-uni-tables.c"
+
+: ${WGET=wget}
+"$WGET" -q https://www.unicode.org/Public/3.0-Update1/PropList-3.0.1.txt \
+  -O "$tmp/PropList-3.0.1.txt" \
+  || func_fatal_error "cannot fetch PropList-3.0.1.txt"
+
+ver=`sed -n -e 's/.*Version \([0-9.]*\).*/\1/p' < "$UCD/ReadMe.txt"`
+test -n "$ver" || func_fatal_error "no Unicode version found in ReadMe.txt"
+
+(cd "$GNULIB_SRCDIR/lib" \
+ && "$tmp/gen-uni-tables" "$UCD/UnicodeData.txt" \
+      "$UCD/PropList.txt" \
+      "$UCD/DerivedCoreProperties.txt" \
+      "$UCD/ArabicShaping.txt" \
+      "$UCD/Scripts.txt" \
+      "$UCD/Blocks.txt" \
+      "$tmp/PropList-3.0.1.txt" \
+      "$UCD/EastAsianWidth.txt" \
+      "$UCD/LineBreak.txt" \
+      "$UCD/auxiliary/WordBreakProperty.txt" \
+      "$UCD/auxiliary/GraphemeBreakProperty.txt" \
+      "$UCD/CompositionExclusions.txt" \
+      "$UCD/SpecialCasing.txt" \
+      "$UCD/CaseFolding.txt" \
+      "$ver") \
+  || func_fatal_error "error running gen-uni-tables"
+
+: ${DIFF=diff}
+"$DIFF" "$GNULIB_SRCDIR/lib/unilbrk/lbrkprop_org.txt" \
+  "$GNULIB_SRCDIR/lib/unilbrk/lbrkprop.txt" \
+  || func_fatal_error "lbrkprop is not updated properly"
+
+"$DIFF" "$GNULIB_SRCDIR/lib/uniwbrk/wbrkprop_org.txt" \
+  "$GNULIB_SRCDIR/lib/uniwbrk/wbrkprop.txt" \
+  || func_fatal_error "wbrkprop is not updated properly"
+
+# Copy necessary files from UCD
+for dstfile in "$GNULIB_SRCDIR/tests/uninorm/NormalizationTest.txt" \
+               "$GNULIB_SRCDIR/tests/uniwbrk/WordBreakTest.txt" \
+               "$GNULIB_SRCDIR/tests/unigbrk/GraphemeBreakTest.txt" \
+               "$GNULIB_SRCDIR/tests/uniname/UnicodeData.txt" \
+               "$GNULIB_SRCDIR/tests/uniname/NameAliases.txt" ; do
+  srcfile=`expr "$dstfile" : '.*/\(.*\)'`
+  if test -f "$UCD/$srcfile"; then
+    srcfile="$UCD/$srcfile"
+  elif test -f "$UCD/auxiliary/$srcfile"; then
+    srcfile="$UCD/auxiliary/$srcfile"
+  else
+    func_fatal_error "cannot find $srcfile"
+  fi
+
+  sed 's/ *$//' < "$srcfile" > "$dstfile" \
+    || func_fatal_error "cannot write $dstfile"
+done
+
+rm -rf "$tmp"
+
+# Undo the effect of the previous 'trap' command. Some shellology: we cannot
+# use "trap - 0 1 2 3 13 15", because Solaris sh would attempt to execute the
+# command "-". "trap '' ..." is fine only for signal 0 (= normal exit); the
+# others must call 'exit' explicitly with $?, which is 128 + signal number.
+trap '' 0
+trap 'func_exit $?' 1 2 3 13 15
+
+exit 0
diff --git a/Admin/gen-uninames b/Admin/gen-uninames
new file mode 100755
index 0000000..2b0771a
--- /dev/null
+++ b/Admin/gen-uninames
@@ -0,0 +1,116 @@
+#!/bin/sh
+#
+# Copyright (C) 2015 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.
+# If not, see <http://www.gnu.org/licenses/>.
+#
+
+progname=$0
+
+# func_exit STATUS
+# exits with a given status.
+# This function needs to be used, rather than 'exit', when a 'trap' handler is
+# in effect that refers to $?.
+func_exit ()
+{
+  (exit $1); exit $1
+}
+
+# func_fatal_error message
+# outputs to stderr a fatal error message, and terminates the program.
+# Input:
+# - progname name of this program
+func_fatal_error ()
+{
+  echo "$progname: *** $1" 1>&2
+  echo "$progname: *** Stop." 1>&2
+  func_exit 1
+}
+
+# func_tmpdir
+# creates a temporary directory.
+# Input:
+# - progname name of this program
+# Sets variable
+# - tmp pathname of freshly created temporary directory
+func_tmpdir ()
+{
+  # Use the environment variable TMPDIR, falling back to /tmp. This allows
+  # users to specify a different temporary directory, for example, if their
+  # /tmp is filled up or too small.
+  : ${TMPDIR=/tmp}
+  {
+    # Use the mktemp program if available. If not available, hide the error
+    # message.
+    tmp=`(umask 077 && mktemp -d "$TMPDIR/glXXXXXX") 2>/dev/null` &&
+    test -n "$tmp" && test -d "$tmp"
+  } ||
+  {
+    # Use a simple mkdir command. It is guaranteed to fail if the directory
+    # already exists. $RANDOM is bash specific and expands to empty in shells
+    # other than bash, ksh and zsh. Its use does not increase security;
+    # rather, it minimizes the probability of failure in a very cluttered /tmp
+    # directory.
+    tmp=$TMPDIR/gl$$-$RANDOM
+    (umask 077 && mkdir "$tmp")
+  } ||
+  {
+    echo "$progname: cannot create a temporary directory in $TMPDIR" >&2
+    func_exit 1
+  }
+}
+
+test -n "$GNULIB_SRCDIR" || func_fatal_error "GNULIB_SRCDIR is not set"
+test -n "$UCD" || func_fatal_error "UCD is not set"
+
+func_tmpdir
+trap 'exit_status=$?
+      if test "$signal" != 0; then
+        echo "caught signal $signal" >&2
+      fi
+      rm -rf "$tmp"
+      exit $exit_status' 0
+for signal in 1 2 3 13 15; do
+  trap '{ signal='$signal'; func_exit 1; }' $signal
+done
+signal=0
+
+: ${CLISP=clisp}
+("$CLISP" --version) >/dev/null 2>/dev/null \
+  || { echo "$progname: *** clisp not found; skipping uniname" 1>&2; exit 0; }
+
+# Drop the entries whose name field starts with '<' (e.g. <control>).
+# NOTE(review): command restored after extraction damage; confirm upstream.
+sed -e '/^[0-9A-F]*;</d' < "$UCD/UnicodeData.txt" > "$tmp/UnicodeDataNames.txt"
+
+sed -e 's/ *$//' -e '/^#/d' -e '/^$/d' < "$UCD/NameAliases.txt" \
+  > "$tmp/NameAliases.txt"
+
+"$CLISP" "$GNULIB_SRCDIR/lib/uniname/gen-uninames.lisp" \
+  "$tmp/UnicodeDataNames.txt" \
+  "$GNULIB_SRCDIR/lib/uniname/uninames.h" \
+  "$tmp/NameAliases.txt" \
+  || func_fatal_error "error running gen-uninames.lisp"
+
+rm -rf "$tmp"
+
+# Undo the effect of the previous 'trap' command. Some shellology: we cannot
+# use "trap - 0 1 2 3 13 15", because Solaris sh would attempt to execute the
+# command "-". "trap '' ..." is fine only for signal 0 (= normal exit); the
+# others must call 'exit' explicitly with $?, which is 128 + signal number.
+trap '' 0
+trap 'func_exit $?' 1 2 3 13 15
+
+exit 0
-- 
cgit v1.2.1