summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Bruno Haible <bruno@clisp.org> 2011-03-29 23:10:57 +0200
committer Bruno Haible <bruno@clisp.org> 2011-03-29 23:47:50 +0200
commit 6ec70a06da21e2bdceb9814fe6fde6be46f890cd (patch)
tree 1eb310cb7adb756f54392eeea8423af19b1e6600
parent 820590c2b81686f64c50d22022aeb49ff3c6e3ad (diff)
downloadlibunistring-6ec70a06da21e2bdceb9814fe6fde6be46f890cd.tar.gz
Add grapheme cluster break functions.
-rw-r--r--AUTHORS10
-rw-r--r--ChangeLog21
-rw-r--r--NEWS4
-rw-r--r--README1
-rwxr-xr-xautogen.sh13
-rw-r--r--doc/Makefile.am2
-rw-r--r--doc/libunistring.texi18
-rw-r--r--doc/unigbrk.texi29
-rw-r--r--doc/uniwbrk.texi2
-rw-r--r--gnulib-local/Makefile.am3
-rw-r--r--gnulib-local/lib/unigbrk.in.h.diff19
-rw-r--r--lib/Makefile.am4
12 files changed, 106 insertions, 20 deletions
diff --git a/AUTHORS b/AUTHORS
index 8bedd79..c690b65 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1 +1,9 @@
-Bruno Haible <bruno@clisp.org>
+Authors of GNU libunistring
+
+The following contributions warranted legal paper exchanges with the
+Free Software Foundation. See also the ChangeLog and THANKS files in this
+package and the ChangeLog file in gnulib (where most of the code is imported
+from).
+
+Bruno Haible <bruno@clisp.org> all files
+Ben Pfaff <blp@cs.stanford.edu> unigbrk
diff --git a/ChangeLog b/ChangeLog
index e5a7edd..4d69c82 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2011-03-29 Bruno Haible <bruno@clisp.org>
+
+ Add grapheme cluster break functions.
+ * autogen.sh (GNULIB_MODULES): Add unigbrk/*.
+ * gnulib-local/lib/unigbrk.in.h.diff: New file.
+ * gnulib-local/Makefile.am (EXTRA_DIST): Add lib/unigbrk.in.h.diff.
+ * lib/Makefile.am (nobase_include_HEADERS, HEADERS_WITH_EXTERNS): Add
+ unigbrk.h.
+ * doc/Makefile.am (libunistring_TEXINFOS): Move unigbrk.texi before
+ uniwbrk.texi.
+ * doc/libunistring.texi: Move chapter unigbrk.h before chapter
+ uniwbrk.h.
+ * doc/unigbrk.texi (unigbrk.h): Add more index entries. Enhance the
+ explanation of what a grapheme cluster is.
+ (Grapheme cluster break property): Add missing quotes. Avoid imperative
+ form.
+ * doc/uniwbrk.texi (uniwbrk.h): Add more index entries.
+ * AUTHORS: List Ben Pfaff.
+ * README: Mention the new include file unigbrk.h.
+ * NEWS: Likewise.
+
2011-03-28 Ben Pfaff <blp@cs.stanford.edu>
Document grapheme cluster break functions.
diff --git a/NEWS b/NEWS
index d5aa25a..fbcc453 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,10 @@
New in 0.9.4:
* The data tables and line breaking algorithm have been updated to Unicode
version 6.0.0.
+* A new include file unigbrk.h is provided. It declares functions for
+ grapheme cluster breaking, that is, determining the boundaries between
+ grapheme clusters. See the documentation chapter "Grapheme cluster breaks
+ in strings" for details.
* In the include file unictype.h, constants are defined for the group of
general categories LC ("Cased Letter").
* In the include file unictype.h, functions for associating canonical
diff --git a/README b/README
index c52e56d..e4bc981 100644
--- a/README
+++ b/README
@@ -11,6 +11,7 @@ It consists of the following parts:
uniname.h character names
unictype.h character classification and properties
uniwidth.h string width when using nonproportional fonts
+ unigbrk.h grapheme cluster breaks
uniwbrk.h word breaks
unilbrk.h line breaking algorithm
uninorm.h normalization (composition and decomposition)
diff --git a/autogen.sh b/autogen.sh
index bde7758..9be9365 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -314,6 +314,19 @@ if test $skip_gnulib = false; then
uniwidth/u32-strwidth
uniwidth/u32-width
uniwidth/width
+ unigbrk/base
+ unigbrk/u8-grapheme-breaks
+ unigbrk/u8-grapheme-next
+ unigbrk/u8-grapheme-prev
+ unigbrk/u16-grapheme-breaks
+ unigbrk/u16-grapheme-next
+ unigbrk/u16-grapheme-prev
+ unigbrk/u32-grapheme-breaks
+ unigbrk/u32-grapheme-next
+ unigbrk/u32-grapheme-prev
+ unigbrk/uc-gbrk-prop
+ unigbrk/uc-is-grapheme-break
+ unigbrk/ulc-grapheme-breaks
uniwbrk/base
uniwbrk/u8-wordbreaks
uniwbrk/u16-wordbreaks
diff --git a/doc/Makefile.am b/doc/Makefile.am
index cd5c514..c470f6c 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -33,7 +33,7 @@ info_TEXINFOS = libunistring.texi
# List of texinfo sources @included by libunistring.texi, excluding version.texi.
libunistring_TEXINFOS = \
unitypes.texi unistr.texi uniconv.texi unistdio.texi uniname.texi \
- unictype.texi uniwidth.texi uniwbrk.texi unilbrk.texi unigbrk.texi \
+ unictype.texi uniwidth.texi unigbrk.texi uniwbrk.texi unilbrk.texi \
uninorm.texi unicase.texi uniregex.texi \
gpl.texi lgpl.texi fdl.texi
diff --git a/doc/libunistring.texi b/doc/libunistring.texi
index 32209ab..a6f9c8f 100644
--- a/doc/libunistring.texi
+++ b/doc/libunistring.texi
@@ -158,9 +158,9 @@ A copy of the license is included in @ref{GNU GPL}.
* uniname.h:: Names of Unicode characters
* unictype.h:: Unicode character classification and properties
* uniwidth.h:: Display width
+* unigbrk.h:: Grapheme cluster breaking
* uniwbrk.h:: Word breaks in strings
* unilbrk.h:: Line breaking
-* unigbrk.h:: Grapheme cluster breaking
* uninorm.h:: Normalization forms
* unicase.h:: Case mappings
* uniregex.h:: Regular expressions
@@ -217,16 +217,16 @@ Properties
* Properties as objects::
* Properties as functions::
-uniwbrk.h
-
-* Word breaks in a string::
-* Word break property::
-
unigbrk.h
* Grapheme cluster breaks in a string::
* Grapheme cluster break property::
+uniwbrk.h
+
+* Word breaks in a string::
+* Word break property::
+
uninorm.h
* Decomposition of characters::
@@ -281,12 +281,12 @@ character names
character classification and properties
@item <uniwidth.h>
string width when using nonproportional fonts
+@item <unigbrk.h>
+grapheme cluster breaks
@item <uniwbrk.h>
word breaks
@item <unilbrk.h>
line breaking algorithm
-@item <unigbrk.h>
-grapheme cluster breaks
@item <uninorm.h>
normalization (composition and decomposition)
@item <unicase.h>
@@ -763,9 +763,9 @@ NULL is returned and @code{errno} is set.
@include uniname.texi
@include unictype.texi
@include uniwidth.texi
+@include unigbrk.texi
@include uniwbrk.texi
@include unilbrk.texi
-@include unigbrk.texi
@include uninorm.texi
@include unicase.texi
@include uniregex.texi
diff --git a/doc/unigbrk.texi b/doc/unigbrk.texi
index db4df6a..196bd9f 100644
--- a/doc/unigbrk.texi
+++ b/doc/unigbrk.texi
@@ -2,11 +2,18 @@
@chapter Grapheme cluster breaks in strings @code{<unigbrk.h>}
@cindex grapheme cluster breaks
+@cindex grapheme cluster boundaries
@cindex breaks, grapheme cluster
+@cindex boundaries, between grapheme clusters
This include file declares functions for determining where in a string
``grapheme clusters'' start and end. A ``grapheme cluster'' is an
approximation to a user-perceived character, which sometimes
-corresponds to multiple Unicode characters. The letter @samp{@'e},
+corresponds to multiple Unicode characters. Editing operations such as
+mouse selection, cursor movement, and backspacing often operate on
+grapheme clusters as units, not on individual characters.
+
+Some grapheme clusters are built from a base character and a combining
+character. The letter @samp{@'e},
for example, is most commonly represented in Unicode as a single
character U+00E9 @sc{LATIN SMALL LETTER E WITH ACUTE}. It is,
however, equally valid to use the pair of characters U+0065 @sc{LATIN
@@ -14,6 +21,12 @@ SMALL LETTER E} followed by U+0301 @sc{COMBINING ACUTE ACCENT}. Since
the user would perceive this pair of characters as a single character,
they would be grouped into a single grapheme cluster.
+But there are also grapheme clusters that consist of several base characters.
+For example, a Devanagari letter and a Devanagari vowel sign that follows it
+may form a grapheme cluster. Similarly, some pairs of Thai characters and
+Hangul syllables (formed by two or three Hangul characters) are grapheme
+clusters.
+
@menu
* Grapheme cluster breaks in a string::
* Grapheme cluster break property::
@@ -65,10 +78,11 @@ grapheme cluster break at start of text.
@node Grapheme cluster break property
@section Grapheme cluster break property
-This is a more low-level API. The grapheme cluster break property is a property defined
-in Unicode Standard Annex #29, section ``Grapheme Cluster Boundaries, see
-@url{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}.@texnl{} It is
-used for determining the grapheme cluster breaks in a string.
+This is a more low-level API. The grapheme cluster break property is a
+property defined in Unicode Standard Annex #29, section ``Grapheme Cluster
+Boundaries'', see
+@url{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}.@texnl{}
+It is used for determining the grapheme cluster breaks in a string.
The following are the possible values of the grapheme cluster break
property. More values may be added in the future.
@@ -87,7 +101,8 @@ property. More values may be added in the future.
@deftypevrx Constant int GBP_LVT
@end deftypevr
-The following function looks up the grapheme cluster break property of a character.
+The following function looks up the grapheme cluster break property of a
+character.
@deftypefun int uc_graphemeclusterbreak_property (ucs4_t @var{uc})
Returns the Grapheme_Cluster_Break property of a Unicode character.
@@ -102,7 +117,7 @@ Returns true if there is a grapheme cluster boundary between Unicode
characters @var{a} and @var{b}.
There is always a grapheme cluster break at the start or end of text.
-Specify zero for @var{a} or @var{b} to indicate start of text or end
+You can specify zero for @var{a} or @var{b} to indicate start of text or end
of text, respectively.
This implements the extended (not legacy) grapheme cluster rules
diff --git a/doc/uniwbrk.texi b/doc/uniwbrk.texi
index 6f06b92..08c273c 100644
--- a/doc/uniwbrk.texi
+++ b/doc/uniwbrk.texi
@@ -2,7 +2,9 @@
@chapter Word breaks in strings @code{<uniwbrk.h>}
@cindex word breaks
+@cindex word boundaries
@cindex breaks, word
+@cindex boundaries, between words
This include file declares functions for determining where in a string
``words'' start and end. Here ``words'' are not necessarily the same as
entities that can be looked up in dictionaries, but rather groups of
diff --git a/gnulib-local/Makefile.am b/gnulib-local/Makefile.am
index d011b8d..3758f6d 100644
--- a/gnulib-local/Makefile.am
+++ b/gnulib-local/Makefile.am
@@ -1,5 +1,5 @@
## Makefile for the gnulib-local directory of GNU libunistring
-## Copyright (C) 2006-2010 Free Software Foundation, Inc.
+## Copyright (C) 2006-2011 Free Software Foundation, Inc.
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@@ -22,6 +22,7 @@ EXTRA_DIST = \
lib/unicase.in.h.diff \
lib/uniconv.in.h.diff \
lib/unictype.in.h.diff \
+lib/unigbrk.in.h.diff \
lib/unilbrk.in.h.diff \
lib/uninorm.in.h.diff \
lib/unistr.in.h.diff \
diff --git a/gnulib-local/lib/unigbrk.in.h.diff b/gnulib-local/lib/unigbrk.in.h.diff
new file mode 100644
index 0000000..ed6c7e3
--- /dev/null
+++ b/gnulib-local/lib/unigbrk.in.h.diff
@@ -0,0 +1,19 @@
+*** unigbrk.in.h.orig 2011-01-07 17:40:15.000000000 +0100
+--- unigbrk.in.h 2011-03-29 23:36:45.000000000 +0200
+***************
+*** 19,25 ****
+ #define _UNIGBRK_H
+
+ /* Get bool. */
+! #include <stdbool.h>
+
+ /* Get size_t. */
+ #include <stddef.h>
+--- 19,25 ----
+ #define _UNIGBRK_H
+
+ /* Get bool. */
+! #include <unistring/stdbool.h>
+
+ /* Get size_t. */
+ #include <stddef.h>
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 495b134..917ace1 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -1,5 +1,5 @@
## Makefile for the lib subdirectory of GNU libunistring.
-## Copyright (C) 2009-2010 Free Software Foundation, Inc.
+## Copyright (C) 2009-2011 Free Software Foundation, Inc.
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@@ -36,6 +36,7 @@ nobase_include_HEADERS = \
uniname.h \
unictype.h \
uniwidth.h \
+ unigbrk.h \
uniwbrk.h \
unilbrk.h \
uninorm.h \
@@ -200,6 +201,7 @@ HEADERS_WITH_EXTERNS = \
uniname.h \
unictype.h \
uniwidth.h \
+ unigbrk.h \
uniwbrk.h \
unilbrk.h \
uninorm.h \