From 820590c2b81686f64c50d22022aeb49ff3c6e3ad Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 28 Mar 2011 21:33:05 -0700 Subject: Document grapheme cluster break functions. --- ChangeLog | 8 ++++ doc/Makefile.am | 6 +-- doc/libunistring.texi | 9 ++++ doc/unigbrk.texi | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 doc/unigbrk.texi diff --git a/ChangeLog b/ChangeLog index 0ebbe3f..e5a7edd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2011-03-28 Ben Pfaff + + Document grapheme cluster break functions. + * doc/Makefile.am (libunistring_TEXINFOS): Add unigbrk.texi. + * doc/libunistring.texi: Include unigbrk.texi and refer to it from + the text and tables of content. + * doc/unigbrk.texi: New file. + 2011-03-26 Bruno Haible Allow omitting spaces in property names. diff --git a/doc/Makefile.am b/doc/Makefile.am index ac3480c..cd5c514 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -1,5 +1,5 @@ ## Makefile for the doc subdirectory of GNU libunistring. -## Copyright (C) 2009 Free Software Foundation, Inc. +## Copyright (C) 2009, 2011 Free Software Foundation, Inc. ## ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -33,8 +33,8 @@ info_TEXINFOS = libunistring.texi # List of texinfo sources @included by libunistring.texi, excluding version.texi. libunistring_TEXINFOS = \ unitypes.texi unistr.texi uniconv.texi unistdio.texi uniname.texi \ - unictype.texi uniwidth.texi uniwbrk.texi unilbrk.texi uninorm.texi \ - unicase.texi uniregex.texi \ + unictype.texi uniwidth.texi uniwbrk.texi unilbrk.texi unigbrk.texi \ + uninorm.texi unicase.texi uniregex.texi \ gpl.texi lgpl.texi fdl.texi # The dependencies of stamp-vti generated by automake are incomplete. 
diff --git a/doc/libunistring.texi b/doc/libunistring.texi index 2024a9c..32209ab 100644 --- a/doc/libunistring.texi +++ b/doc/libunistring.texi @@ -160,6 +160,7 @@ A copy of the license is included in @ref{GNU GPL}. * uniwidth.h:: Display width * uniwbrk.h:: Word breaks in strings * unilbrk.h:: Line breaking +* unigbrk.h:: Grapheme cluster breaking * uninorm.h:: Normalization forms * unicase.h:: Case mappings * uniregex.h:: Regular expressions @@ -221,6 +222,11 @@ uniwbrk.h * Word breaks in a string:: * Word break property:: +unigbrk.h + +* Grapheme cluster breaks in a string:: +* Grapheme cluster break property:: + uninorm.h * Decomposition of characters:: @@ -279,6 +285,8 @@ string width when using nonproportional fonts word breaks @item line breaking algorithm +@item +grapheme cluster breaks @item normalization (composition and decomposition) @item @@ -757,6 +765,7 @@ NULL is returned and @code{errno} is set. @include uniwidth.texi @include uniwbrk.texi @include unilbrk.texi +@include unigbrk.texi @include uninorm.texi @include unicase.texi @include uniregex.texi diff --git a/doc/unigbrk.texi b/doc/unigbrk.texi new file mode 100644 index 0000000..db4df6a --- /dev/null +++ b/doc/unigbrk.texi @@ -0,0 +1,111 @@ +@node unigbrk.h +@chapter Grapheme cluster breaks in strings @code{<unigbrk.h>} + +@cindex grapheme cluster breaks +@cindex breaks, grapheme cluster +This include file declares functions for determining where in a string +``grapheme clusters'' start and end. A ``grapheme cluster'' is an +approximation to a user-perceived character, which sometimes +corresponds to multiple Unicode characters. The letter @samp{@'e}, +for example, is most commonly represented in Unicode as a single +character U+00E9 @sc{LATIN SMALL LETTER E WITH ACUTE}. It is, +however, equally valid to use the pair of characters U+0065 @sc{LATIN +SMALL LETTER E} followed by U+0301 @sc{COMBINING ACUTE ACCENT}. 
Since +the user would perceive this pair of characters as a single character, +they would be grouped into a single grapheme cluster. + +@menu +* Grapheme cluster breaks in a string:: +* Grapheme cluster break property:: +@end menu + +@node Grapheme cluster breaks in a string +@section Grapheme cluster breaks in a string + +The following functions find a single boundary between grapheme +clusters in a string. + +@deftypefun {const uint8_t *} u8_grapheme_next (const uint8_t *@var{s}, const uint8_t *@var{end}) +@deftypefunx {const uint16_t *} u16_grapheme_next (const uint16_t *@var{s}, const uint16_t *@var{end}) +@deftypefunx {const uint32_t *} u32_grapheme_next (const uint32_t *@var{s}, const uint32_t *@var{end}) +Returns the start of the next grapheme cluster following @var{s}, +or @var{end} if no grapheme cluster break is encountered before it. +Returns NULL if and only if @code{@var{s} == @var{end}}. +@end deftypefun + +@deftypefun {const uint8_t *} u8_grapheme_prev (const uint8_t *@var{s}, const uint8_t *@var{start}) +@deftypefunx {const uint16_t *} u16_grapheme_prev (const uint16_t *@var{s}, const uint16_t *@var{start}) +@deftypefunx {const uint32_t *} u32_grapheme_prev (const uint32_t *@var{s}, const uint32_t *@var{start}) +Returns the start of the grapheme cluster preceding @var{s}, or +@var{start} if no grapheme cluster break is encountered before it. +Returns NULL if and only if @code{@var{s} == @var{start}}. +@end deftypefun + +The following functions determine all of the grapheme cluster +boundaries in a string. + +@deftypefun void u8_grapheme_breaks (const uint8_t *@var{s}, size_t @var{n}, char *@var{p}) +@deftypefunx void u16_grapheme_breaks (const uint16_t *@var{s}, size_t @var{n}, char *@var{p}) +@deftypefunx void u32_grapheme_breaks (const uint32_t *@var{s}, size_t @var{n}, char *@var{p}) +@deftypefunx void ulc_grapheme_breaks (const char *@var{s}, size_t @var{n}, char *@var{p}) +Determines the grapheme cluster break points in @var{s}, an array of +@var{n} units, and stores the result at @code{@var{p}[0..@var{n}-1]}. 
+@table @asis +@item @code{@var{p}[i] = 1} +means that there is a grapheme cluster boundary between +@code{@var{s}[i-1]} and @code{@var{s}[i]}. +@item @code{@var{p}[i] = 0} +means that @code{@var{s}[i-1]} and @code{@var{s}[i]} are part of the +same grapheme cluster. +@end table +@code{@var{p}[0]} is always set to 1, because there is always a +grapheme cluster break at the start of text. +@end deftypefun + +@node Grapheme cluster break property +@section Grapheme cluster break property + +This is a more low-level API. The grapheme cluster break property is a property defined +in Unicode Standard Annex #29, section ``Grapheme Cluster Boundaries'', see +@url{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}.@texnl{} It is +used for determining the grapheme cluster breaks in a string. + +The following are the possible values of the grapheme cluster break +property. More values may be added in the future. + +@deftypevr Constant int GBP_OTHER +@deftypevrx Constant int GBP_CR +@deftypevrx Constant int GBP_LF +@deftypevrx Constant int GBP_CONTROL +@deftypevrx Constant int GBP_EXTEND +@deftypevrx Constant int GBP_PREPEND +@deftypevrx Constant int GBP_SPACINGMARK +@deftypevrx Constant int GBP_L +@deftypevrx Constant int GBP_V +@deftypevrx Constant int GBP_T +@deftypevrx Constant int GBP_LV +@deftypevrx Constant int GBP_LVT +@end deftypevr + +The following function looks up the grapheme cluster break property of a character. + +@deftypefun int uc_graphemeclusterbreak_property (ucs4_t @var{uc}) +Returns the Grapheme_Cluster_Break property of a Unicode character. +@end deftypefun + +The following function determines whether there is a grapheme cluster +break between two Unicode characters. It is the primitive upon which +the higher-level functions in the previous section are directly based. + +@deftypefun bool uc_is_grapheme_break (ucs4_t @var{a}, ucs4_t @var{b}) +Returns true if there is a grapheme cluster boundary between Unicode +characters @var{a} and @var{b}. 
+ +There is always a grapheme cluster break at the start or end of text. +Specify zero for @var{a} or @var{b} to indicate start of text or end +of text, respectively. + +This implements the extended (not legacy) grapheme cluster rules +described in the Unicode standard, because the standard says that they +are preferred. +@end deftypefun -- cgit v1.2.1