From 6ec70a06da21e2bdceb9814fe6fde6be46f890cd Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Tue, 29 Mar 2011 23:10:57 +0200 Subject: Add grapheme cluster break functions. --- AUTHORS | 10 +++++++++- ChangeLog | 21 +++++++++++++++++++++ NEWS | 4 ++++ README | 1 + autogen.sh | 13 +++++++++++++ doc/Makefile.am | 2 +- doc/libunistring.texi | 18 +++++++++--------- doc/unigbrk.texi | 29 ++++++++++++++++++++++------- doc/uniwbrk.texi | 2 ++ gnulib-local/Makefile.am | 3 ++- gnulib-local/lib/unigbrk.in.h.diff | 19 +++++++++++++++++++ lib/Makefile.am | 4 +++- 12 files changed, 106 insertions(+), 20 deletions(-) create mode 100644 gnulib-local/lib/unigbrk.in.h.diff diff --git a/AUTHORS b/AUTHORS index 8bedd79..c690b65 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1 +1,9 @@ -Bruno Haible +Authors of GNU libunistring + +The following contributions warranted legal paper exchanges with the +Free Software Foundation. See also the ChangeLog and THANKS files in this +package and the ChangeLog file in gnulib (where most of the code is imported +from). + +Bruno Haible all files +Ben Pfaff unigbrk diff --git a/ChangeLog b/ChangeLog index e5a7edd..4d69c82 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +2011-03-29 Bruno Haible + + Add grapheme cluster break functions. + * autogen.sh (GNULIB_MODULES): Add unigbrk/*. + * gnulib-local/lib/unigbrk.in.h.diff: New file. + * gnulib-local/Makefile.am (EXTRA_DIST): Add lib/unigbrk.in.h.diff. + * lib/Makefile.am (nobase_include_HEADERS, HEADERS_WITH_EXTERNS): Add + unigbrk.h. + * doc/Makefile.am (libunistring_TEXINFOS): Move unigbrk.texi before + uniwbrk.texi. + * doc/libunistring.texi: Move chapter unigbrk.h before chapter + uniwbrk.h. + * doc/unigbrk.texi (unigbrk.h): Add more index entries. Enhance the + explanation of what a grapheme cluster is. + (Grapheme cluster break property): Add missing quotes. Avoid imperative + form. + * doc/uniwbrk.texi (uniwbrk.h): Add more index entries. + * AUTHORS: List Ben Pfaff. 
+ * README: Mention the new include file unigbrk.h. + * NEWS: Likewise. + 2011-03-28 Ben Pfaff Document grapheme cluster break functions. diff --git a/NEWS b/NEWS index d5aa25a..fbcc453 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,10 @@ New in 0.9.4: * The data tables and line breaking algorithm have been updated to Unicode version 6.0.0. +* A new include file unigbrk.h is provided. It declares functions for + grapheme cluster breaking, that is, determining the boundaries between + graphemes. See the documentation chapter "Grapheme cluster breaks in strings" + for details. * In the include file unictype.h, constants are defined for the group of general categories LC ("Cased Letter"). * In the include file unictype.h, functions for associating canonical diff --git a/README b/README index c52e56d..e4bc981 100644 --- a/README +++ b/README @@ -11,6 +11,7 @@ It consists of the following parts: uniname.h character names unictype.h character classification and properties uniwidth.h string width when using nonproportional fonts + unigbrk.h grapheme cluster breaks uniwbrk.h word breaks unilbrk.h line breaking algorithm uninorm.h normalization (composition and decomposition) diff --git a/autogen.sh b/autogen.sh index bde7758..9be9365 100755 --- a/autogen.sh +++ b/autogen.sh @@ -314,6 +314,19 @@ if test $skip_gnulib = false; then uniwidth/u32-strwidth uniwidth/u32-width uniwidth/width + unigbrk/base + unigbrk/u8-grapheme-breaks + unigbrk/u8-grapheme-next + unigbrk/u8-grapheme-prev + unigbrk/u16-grapheme-breaks + unigbrk/u16-grapheme-next + unigbrk/u16-grapheme-prev + unigbrk/u32-grapheme-breaks + unigbrk/u32-grapheme-next + unigbrk/u32-grapheme-prev + unigbrk/uc-gbrk-prop + unigbrk/uc-is-grapheme-break + unigbrk/ulc-grapheme-breaks uniwbrk/base uniwbrk/u8-wordbreaks uniwbrk/u16-wordbreaks diff --git a/doc/Makefile.am b/doc/Makefile.am index cd5c514..c470f6c 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -33,7 +33,7 @@ info_TEXINFOS = libunistring.texi # List of texinfo 
sources @included by libunistring.texi, excluding version.texi. libunistring_TEXINFOS = \ unitypes.texi unistr.texi uniconv.texi unistdio.texi uniname.texi \ - unictype.texi uniwidth.texi uniwbrk.texi unilbrk.texi unigbrk.texi \ + unictype.texi uniwidth.texi unigbrk.texi uniwbrk.texi unilbrk.texi \ uninorm.texi unicase.texi uniregex.texi \ gpl.texi lgpl.texi fdl.texi diff --git a/doc/libunistring.texi b/doc/libunistring.texi index 32209ab..a6f9c8f 100644 --- a/doc/libunistring.texi +++ b/doc/libunistring.texi @@ -158,9 +158,9 @@ A copy of the license is included in @ref{GNU GPL}. * uniname.h:: Names of Unicode characters * unictype.h:: Unicode character classification and properties * uniwidth.h:: Display width +* unigbrk.h:: Grapheme cluster breaking * uniwbrk.h:: Word breaks in strings * unilbrk.h:: Line breaking -* unigbrk.h:: Grapheme cluster breaking * uninorm.h:: Normalization forms * unicase.h:: Case mappings * uniregex.h:: Regular expressions @@ -217,16 +217,16 @@ Properties * Properties as objects:: * Properties as functions:: -uniwbrk.h - -* Word breaks in a string:: -* Word break property:: - unigbrk.h * Grapheme cluster breaks in a string:: * Grapheme cluster break property:: +uniwbrk.h + +* Word breaks in a string:: +* Word break property:: + uninorm.h * Decomposition of characters:: @@ -281,12 +281,12 @@ character names character classification and properties @item string width when using nonproportional fonts +@item +grapheme cluster breaks @item word breaks @item line breaking algorithm -@item -grapheme cluster breaks @item normalization (composition and decomposition) @item @@ -763,9 +763,9 @@ NULL is returned and @code{errno} is set. 
@include uniname.texi @include unictype.texi @include uniwidth.texi +@include unigbrk.texi @include uniwbrk.texi @include unilbrk.texi -@include unigbrk.texi @include uninorm.texi @include unicase.texi @include uniregex.texi diff --git a/doc/unigbrk.texi b/doc/unigbrk.texi index db4df6a..196bd9f 100644 --- a/doc/unigbrk.texi +++ b/doc/unigbrk.texi @@ -2,11 +2,18 @@ @chapter Grapheme cluster breaks in strings @code{<unigbrk.h>} @cindex grapheme cluster breaks +@cindex grapheme cluster boundaries @cindex breaks, grapheme cluster +@cindex boundaries, between grapheme clusters This include file declares functions for determining where in a string ``grapheme clusters'' start and end. A ``grapheme cluster'' is an approximation to a user-perceived character, which sometimes -corresponds to multiple Unicode characters. The letter @samp{@'e}, +corresponds to multiple Unicode characters. Editing operations such as +mouse selection, cursor movement, and backspacing often operate on +grapheme clusters as units, not on individual characters. + +Some grapheme clusters are built from a base character and a combining +character. The letter @samp{@'e}, for example, is most commonly represented in Unicode as a single character U+00E8 @sc{LATIN SMALL LETTER E WITH ACUTE}. It is, however, equally valid to use the pair of characters U+0065 @sc{LATIN @@ -14,6 +21,12 @@ SMALL LETTER E} followed by U+0301 @sc{COMBINING ACUTE ACCENT}. Since the user would perceive this pair of characters as a single character, they would be grouped into a single grapheme cluster. +But there are also grapheme clusters that consist of several base characters. +For example, a Devanagari letter and a Devanagari vowel sign that follows it +may form a grapheme cluster. Similarly, some pairs of Thai characters and +Hangul syllables (formed by two or three Hangul characters) are grapheme +clusters. 
+ @menu * Grapheme cluster breaks in a string:: * Grapheme cluster break property:: @@ -65,10 +78,11 @@ grapheme cluster break at start of text. @node Grapheme cluster break property @section Grapheme cluster break property -This is a more low-level API. The grapheme cluster break property is a property defined -in Unicode Standard Annex #29, section ``Grapheme Cluster Boundaries, see -@url{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}.@texnl{} It is -used for determining the grapheme cluster breaks in a string. +This is a more low-level API. The grapheme cluster break property is a +property defined in Unicode Standard Annex #29, section ``Grapheme Cluster +Boundaries'', see +@url{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}.@texnl{} +It is used for determining the grapheme cluster breaks in a string. The following are the possible values of the grapheme cluster break property. More values may be added in the future. @@ -87,7 +101,8 @@ property. More values may be added in the future. @deftypevrx Constant int GBP_LVT @end deftypevr -The following function looks up the grapheme cluster break property of a character. +The following function looks up the grapheme cluster break property of a +character. @deftypefun int uc_graphemeclusterbreak_property (ucs4_t @var{uc}) Returns the Grapheme_Cluster_Break property of a Unicode character. @@ -102,7 +117,7 @@ Returns true if there is an grapheme cluster boundary between Unicode characters @var{a} and @var{b}. There is always a grapheme cluster break at the start or end of text. -Specify zero for @var{a} or @var{b} to indicate start of text or end +You can specify zero for @var{a} or @var{b} to indicate start of text or end of text, respectively. 
This implements the extended (not legacy) grapheme cluster rules diff --git a/doc/uniwbrk.texi b/doc/uniwbrk.texi index 6f06b92..08c273c 100644 --- a/doc/uniwbrk.texi +++ b/doc/uniwbrk.texi @@ -2,7 +2,9 @@ @chapter Word breaks in strings @code{<uniwbrk.h>} @cindex word breaks +@cindex word boundaries @cindex breaks, word +@cindex boundaries, between words This include file declares functions for determining where in a string ``words'' start and end. Here ``words'' are not necessarily the same as entities that can be looked up in dictionaries, but rather groups of diff --git a/gnulib-local/Makefile.am b/gnulib-local/Makefile.am index d011b8d..3758f6d 100644 --- a/gnulib-local/Makefile.am +++ b/gnulib-local/Makefile.am @@ -1,5 +1,5 @@ ## Makefile for the gnulib-local directory of GNU libunistring -## Copyright (C) 2006-2010 Free Software Foundation, Inc. +## Copyright (C) 2006-2011 Free Software Foundation, Inc. ## ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -22,6 +22,7 @@ EXTRA_DIST = \ lib/unicase.in.h.diff \ lib/uniconv.in.h.diff \ lib/unictype.in.h.diff \ +lib/unigbrk.in.h.diff \ lib/unilbrk.in.h.diff \ lib/uninorm.in.h.diff \ lib/unistr.in.h.diff \ diff --git a/gnulib-local/lib/unigbrk.in.h.diff b/gnulib-local/lib/unigbrk.in.h.diff new file mode 100644 index 0000000..ed6c7e3 --- /dev/null +++ b/gnulib-local/lib/unigbrk.in.h.diff @@ -0,0 +1,19 @@ +*** unigbrk.in.h.orig 2011-01-07 17:40:15.000000000 +0100 +--- unigbrk.in.h 2011-03-29 23:36:45.000000000 +0200 +*************** +*** 19,25 **** + #define _UNIGBRK_H + + /* Get bool. */ +! #include <stdbool.h> + + /* Get size_t. */ + #include <stddef.h> +--- 19,25 ---- + #define _UNIGBRK_H + + /* Get bool. */ +! #include <unistring/stdbool.h> + + /* Get size_t. */ + #include <stddef.h> diff --git a/lib/Makefile.am b/lib/Makefile.am index 495b134..917ace1 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -1,5 +1,5 @@ ## Makefile for the lib subdirectory of GNU libunistring. 
-## Copyright (C) 2009-2010 Free Software Foundation, Inc. +## Copyright (C) 2009-2011 Free Software Foundation, Inc. ## ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -36,6 +36,7 @@ nobase_include_HEADERS = \ uniname.h \ unictype.h \ uniwidth.h \ + unigbrk.h \ uniwbrk.h \ unilbrk.h \ uninorm.h \ @@ -200,6 +201,7 @@ HEADERS_WITH_EXTERNS = \ uniname.h \ unictype.h \ uniwidth.h \ + unigbrk.h \ uniwbrk.h \ unilbrk.h \ uninorm.h \ -- cgit v1.2.1