summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorBruno Haible <bruno@clisp.org>2009-02-08 20:56:37 +0100
committerBruno Haible <bruno@clisp.org>2009-02-08 20:56:37 +0100
commit5956ed4c1444f84c9cd620ea9e918f376f3c3ed4 (patch)
treed99e41da02ad9110e6c91102c63df0987794a1c5 /lib
parent38db1b35bf51a7a738f27d2660a9fdd3b46447be (diff)
downloadgnulib-5956ed4c1444f84c9cd620ea9e918f376f3c3ed4.tar.gz
New module 'uniwbrk/u8-wordbreaks'.
Diffstat (limited to 'lib')
-rw-r--r--lib/uniwbrk/u-wordbreaks.h127
-rw-r--r--lib/uniwbrk/u8-wordbreaks.c124
2 files changed, 251 insertions, 0 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
new file mode 100644
index 0000000000..5ef4e8c1a9
--- /dev/null
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -0,0 +1,127 @@
+/* Word breaks in UTF-8/UTF-16/UTF-32 strings.
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+void
+FUNC (const UNIT *s, size_t n, char *p)
+{
+ if (n > 0)
+ {
+ const UNIT *s_end = s + n;
+
+ /* Word break property of the last character.
+ -1 at the very beginning of the string. */
+ int last_char_prop = -1;
+
+ /* Format and Extend characters are ignored; this means, the mostly used
+ unit is the complex character (= character with subsequent ignored
+ characters).
+ Word break property of the last complex character.
+ -1 at the very beginning of the string. */
+ int last_compchar_prop = -1;
+ char *last_compchar_ptr = NULL;
+
+ /* For recognizing rules involving 3 complex characters:
+ Word break property of the second-to-last complex character.
+ -1 at the very beginning of the string. */
+ int secondlast_compchar_prop = -1;
+
+ /* Don't break inside multibyte characters. */
+ memset (p, 0, n);
+
+ while (s < s_end)
+ {
+ ucs4_t uc;
+ int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
+ int prop = uc_wordbreak_property (uc);
+
+ /* No break at the start of the string. */
+ if (last_char_prop >= 0)
+ {
+ /* No break between CR and LF. */
+ if (last_char_prop == WBP_CR && prop == WBP_LF)
+ /* *p = 0 */;
+ /* Break before and after newlines. */
+ else if (last_char_prop >= WBP_NEWLINE
+ /* same as:
+ last_char_prop == WBP_CR
+ || last_char_prop == WBP_LF
+ || last_char_prop == WBP_NEWLINE */
+ || prop >= WBP_NEWLINE
+ /* same as:
+ prop == WBP_CR
+ || prop == WBP_LF
+ || prop == WBP_NEWLINE */)
+ *p = 1;
+ /* Ignore Format and Extend characters. */
+ else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
+ {
+ /* No break in these situations (see UAX #29):
+
+ secondlast last current
+
+ ALetter (MidLetter | MidNumLet) × ALetter (WB7)
+ ALetter × (MidLetter | MidNumLet) ALetter (WB6)
+ Numeric (MidNum | MidNumLet) × Numeric (WB11)
+ Numeric × (MidNum | MidNumLet) Numeric (WB12)
+ ALetter × ALetter (WB5)
+ ALetter × Numeric (WB9)
+ Numeric × ALetter (WB10)
+ Numeric × Numeric (WB8)
+ Katakana × Katakana (WB13)
+ (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
+ ExtendNumLet × ExtendNumLet (WB13a)
+ ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
+ */
+ /* No break across certain punctuation. Also, disable word
+ breaks that were recognized earlier (due to lookahead of
+ only one complex character). */
+ if ((prop == WBP_ALETTER
+ && (last_compchar_prop == WBP_MIDLETTER
+ || last_compchar_prop == WBP_MIDNUMLET)
+ && secondlast_compchar_prop == WBP_ALETTER)
+ || (prop == WBP_NUMERIC
+ && (last_compchar_prop == WBP_MIDNUM
+ || last_compchar_prop == WBP_MIDNUMLET)
+ && secondlast_compchar_prop == WBP_NUMERIC))
+ {
+ *last_compchar_ptr = 0;
+ /* *p = 0; */
+ }
+ else
+ {
+ /* Perform a single table lookup. */
+ if (uniwbrk_table[last_compchar_prop][prop])
+ *p = 1;
+ /* else *p = 0; */
+ }
+ }
+ }
+
+ last_char_prop = prop;
+ /* Ignore Format and Extend characters, except at the start of the string. */
+ if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
+ {
+ secondlast_compchar_prop = last_compchar_prop;
+ last_compchar_prop = prop;
+ last_compchar_ptr = p;
+ }
+
+ s += count;
+ p += count;
+ }
+ }
+}
diff --git a/lib/uniwbrk/u8-wordbreaks.c b/lib/uniwbrk/u8-wordbreaks.c
new file mode 100644
index 0000000000..59d2076de7
--- /dev/null
+++ b/lib/uniwbrk/u8-wordbreaks.c
@@ -0,0 +1,124 @@
+/* Word breaks in UTF-8 strings.
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* Specification. */
+#include "uniwbrk.h"
+
+#include <string.h>
+
+#include "unistr.h"
+#include "uniwbrk/wbrktable.h"
+
+#define FUNC u8_wordbreaks
+#define UNIT uint8_t
+#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe
+#include "u-wordbreaks.h"
+
+
+#ifdef TEST
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Read the contents of an input stream, and return it, terminated with a NUL
+ byte. */
+char *
+read_file (FILE *stream)
+{
+#define BUFSIZE 4096
+ char *buf = NULL;
+ int alloc = 0;
+ int size = 0;
+ int count;
+
+ while (! feof (stream))
+ {
+ if (size + BUFSIZE > alloc)
+ {
+ alloc = alloc + alloc / 2;
+ if (alloc < size + BUFSIZE)
+ alloc = size + BUFSIZE;
+ buf = realloc (buf, alloc);
+ if (buf == NULL)
+ {
+ fprintf (stderr, "out of memory\n");
+ exit (1);
+ }
+ }
+ count = fread (buf + size, 1, BUFSIZE, stream);
+ if (count == 0)
+ {
+ if (ferror (stream))
+ {
+ perror ("fread");
+ exit (1);
+ }
+ }
+ else
+ size += count;
+ }
+ buf = realloc (buf, size + 1);
+ if (buf == NULL)
+ {
+ fprintf (stderr, "out of memory\n");
+ exit (1);
+ }
+ buf[size] = '\0';
+ return buf;
+#undef BUFSIZE
+}
+
+int
+main (int argc, char * argv[])
+{
+ if (argc == 1)
+ {
+ /* Display all the word breaks in the input string. */
+ char *input = read_file (stdin);
+ int length = strlen (input);
+ char *breaks = malloc (length);
+ int i;
+
+ u8_wordbreaks ((uint8_t *) input, length, breaks);
+
+ for (i = 0; i < length; i++)
+ {
+ switch (breaks[i])
+ {
+ case 1:
+ /* U+2027 in UTF-8 encoding */
+ putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
+ break;
+ case 0:
+ break;
+ default:
+ abort ();
+ }
+ putc (input[i], stdout);
+ }
+
+ free (breaks);
+
+ return 0;
+ }
+ else
+ return 1;
+}
+
+#endif /* TEST */