1 files changed, 1163 insertions, 0 deletions
diff --git a/src/myspell/hyphen.c b/src/myspell/hyphen.c
index e265efe..1c10c22 100644
--- a/src/myspell/hyphen.c
+++ b/src/myspell/hyphen.c
@@ -6,6 +6,1169 @@
  * Copyright (C) 1998 Raph Levien, 
  * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 
  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
+ *           (C) 2006, 2007, 2008, 2010 L)szl8 Nmeth (nemeth at OOo)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the 
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 
+ * Boston, MA  02111-1307  USA.
+*/
+
+/*
+ * The contents of this file are subject to the Mozilla Public License
+ * Version 1.0 (the "MPL"); you may not use this file except in
+ * compliance with the MPL.  You may obtain a copy of the MPL at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the MPL is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
+ * for the specific language governing rights and limitations under the
+ * MPL.
+ *
+ */
+#include <stdlib.h> /* for NULL, malloc */
+#include <stdio.h>  /* for fprintf */
+#include <string.h> /* for strdup */
+
+#ifdef UNX
+#include <unistd.h> /* for exit */
+#endif
+
+#define noVERBOSE
+
+/* calculate hyphenmin values with long ligature length (2 or 3 characters
+ * instead of 1 or 2) for comparison with hyphenation without ligatures */
+#define noLONG_LIGATURE
+
+#ifdef LONG_LIGATURE
+#define LIG_xx	1
+#define LIG_xxx	2
+#else
+#define LIG_xx	0
+#define LIG_xxx	1
+#endif
+
+#include "hnjalloc.h"
+#include "hyphen.h"
+
+static char *
+hnj_strdup (const char *s)
+{
+  char *new;
+  int l;
+
+  l = strlen (s);
+  new = hnj_malloc (l + 1);
+  memcpy (new, s, l);
+  new[l] = 0;
+  return new;
+}
+/* get a pointer to the nth 8-bit or UTF-8 character of the word */
+char * hindex(char * word, int n, int utf8) {
+	int j = 0;
+	while (j < n) {
+		j++;
+		word++;
+		while (utf8 && ((((unsigned char) *word) >> 6) == 2)) word++;
+	}
+	return word;
+}
+
+/* list possible hyphenations with -dd option (example for the usage of the hyphenate2() function) */
+void single_hyphenations(char * word, char * hyphen, char ** rep, int * pos, int * cut, int utf8) {
+	int i, k, j = 0;
+	char r;
+	for (i = 0; (i + 1) < strlen(word); i++) {
+		if (utf8 && ((((unsigned char) word[i]) >> 6) == 2)) continue;
+		if ((hyphen[j] & 1)) {
+			if (rep && rep[j]) {
+				k = hindex(word, j - pos[j] + 1, utf8) - word;
+				r = word[k];
+				word[k] = 0;
+				printf(" - %s%s", word, rep[j]);
+				word[k] = r;
+				printf("%s\n", hindex(word + k, cut[j], utf8));
+			} else {
+				k = hindex(word, j + 1, utf8) - word;
+				r = word[k];
+				word[k] = 0;
+				printf(" - %s=", word);
+				word[k] = r;
+				printf("%s\n", word + k);
+			}
+		}
+		j++;
+	}
+}
+
+/* remove cross-platform text line end characters */
+void hnj_strchomp(char * s)
+{
+  int k = strlen(s);
+  if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
+  if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
+}
+
+/* a little bit of a hash table implementation. This simply maps strings
+   to state numbers */
+
+typedef struct _HashTab HashTab;
+typedef struct _HashEntry HashEntry;
+
+/* A cheap, but effective, hack. */
+#define HASH_SIZE 31627
+
+struct _HashTab {
+  HashEntry *entries[HASH_SIZE];
+};
+
+struct _HashEntry {
+  HashEntry *next;
+  char *key;
+  int val;
+};
+
+/* a char* hash function from ASU - adapted from Gtk+ */
+static unsigned int
+hnj_string_hash (const char *s)
+{
+  const char *p;
+  unsigned int h=0, g;
+  for(p = s; *p != '\0'; p += 1) {
+    h = ( h << 4 ) + *p;
+    if ( ( g = h & 0xf0000000 ) ) {
+      h = h ^ (g >> 24);
+      h = h ^ g;
+    }
+  }
+  return h /* % M */;
+}
+
+static HashTab *
+hnj_hash_new (void)
+{
+  HashTab *hashtab;
+  int i;
+
+  hashtab = hnj_malloc (sizeof(HashTab));
+  for (i = 0; i < HASH_SIZE; i++)
+    hashtab->entries[i] = NULL;
+
+  return hashtab;
+}
+
+static void
+hnj_hash_free (HashTab *hashtab)
+{
+  int i;
+  HashEntry *e, *next;
+
+  for (i = 0; i < HASH_SIZE; i++)
+    for (e = hashtab->entries[i]; e; e = next)
+      {
+	next = e->next;
+	hnj_free (e->key);
+	hnj_free (e);
+      }
+
+  hnj_free (hashtab);
+}
+
+/* assumes that key is not already present! */
+static void
+hnj_hash_insert (HashTab *hashtab, const char *key, int val)
+{
+  int i;
+  HashEntry *e;
+
+  i = hnj_string_hash (key) % HASH_SIZE;
+  e = hnj_malloc (sizeof(HashEntry));
+  e->next = hashtab->entries[i];
+  e->key = hnj_strdup (key);
+  e->val = val;
+  hashtab->entries[i] = e;
+}
+
+/* return val if found, otherwise -1 */
+static int
+hnj_hash_lookup (HashTab *hashtab, const char *key)
+{
+  int i;
+  HashEntry *e;
+  i = hnj_string_hash (key) % HASH_SIZE;
+  for (e = hashtab->entries[i]; e; e = e->next)
+    if (!strcmp (key, e->key))
+      return e->val;
+  return -1;
+}
+
+/* Get the state number, allocating a new state if necessary. */
+static int
+hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
+{
+  int state_num;
+
+  state_num = hnj_hash_lookup (hashtab, string);
+
+  if (state_num >= 0)
+    return state_num;
+
+  hnj_hash_insert (hashtab, string, dict->num_states);
+  /* predicate is true if dict->num_states is a power of two */
+  if (!(dict->num_states & (dict->num_states - 1)))
+    {
+      dict->states = hnj_realloc (dict->states,
+				  (dict->num_states << 1) *
+				  sizeof(HyphenState));
+    }
+  dict->states[dict->num_states].match = NULL;
+  dict->states[dict->num_states].repl = NULL;
+  dict->states[dict->num_states].fallback_state = -1;
+  dict->states[dict->num_states].num_trans = 0;
+  dict->states[dict->num_states].trans = NULL;
+  return dict->num_states++;
+}
+
+/* add a transition from state1 to state2 through ch - assumes that the
+   transition does not already exist */
+static void
+hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
+{
+  int num_trans;
+
+  num_trans = dict->states[state1].num_trans;
+  if (num_trans == 0)
+    {
+      dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
+    }
+  else if (!(num_trans & (num_trans - 1)))
+    {
+      dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
+						(num_trans << 1) *
+						sizeof(HyphenTrans));
+    }
+  dict->states[state1].trans[num_trans].ch = ch;
+  dict->states[state1].trans[num_trans].new_state = state2;
+  dict->states[state1].num_trans++;
+}
+
+#ifdef VERBOSE
+HashTab *global;
+
+static char *
+get_state_str (int state)
+{
+  int i;
+  HashEntry *e;
+
+  for (i = 0; i < HASH_SIZE; i++)
+    for (e = global->entries[i]; e; e = e->next)
+      if (e->val == state)
+	return e->key;
+  return NULL;
+}
+#endif
+
+HyphenDict *
+hnj_hyphen_load (const char *fn)
+{
+  HyphenDict *dict[2];
+  HashTab *hashtab;
+  FILE *f;
+  char buf[MAX_CHARS];
+  char word[MAX_CHARS];
+  char pattern[MAX_CHARS];
+  char * repl;
+  signed char replindex;
+  signed char replcut;
+  int state_num = 0, last_state;
+  int i, j, k;
+  char ch;
+  int found;
+  HashEntry *e;
+  int nextlevel = 0;
+
+  f = fopen (fn, "r");
+  if (f == NULL)
+    return NULL;
+
+// loading one or two dictionaries (separated by NEXTLEVEL keyword)
+for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { 
+  hashtab = hnj_hash_new ();
+#ifdef VERBOSE
+  global = hashtab;
+#endif
+  hnj_hash_insert (hashtab, "", 0);
+  dict[k] = hnj_malloc (sizeof(HyphenDict));
+  dict[k]->num_states = 1;
+  dict[k]->states = hnj_malloc (sizeof(HyphenState));
+  dict[k]->states[0].match = NULL;
+  dict[k]->states[0].repl = NULL;
+  dict[k]->states[0].fallback_state = -1;
+  dict[k]->states[0].num_trans = 0;
+  dict[k]->states[0].trans = NULL;
+  dict[k]->nextlevel = NULL;
+  dict[k]->lhmin = 0;
+  dict[k]->rhmin = 0;
+  dict[k]->clhmin = 0;
+  dict[k]->crhmin = 0;
+  dict[k]->nohyphen = NULL;
+  dict[k]->nohyphenl = 0;
+
+  /* read in character set info */
+  if (k == 0) {
+    for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
+    if (fgets(dict[k]->cset,  sizeof(dict[k]->cset),f) != NULL) {
+      for (i=0;i<MAX_NAME;i++)
+        if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
+          dict[k]->cset[i] = 0;
+    } else {
+      dict[k]->cset[0] = 0;
+    }
+    dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
+  } else {
+    strcpy(dict[k]->cset, dict[0]->cset);
+    dict[k]->utf8 = dict[0]->utf8;
+  }
+
+  while (fgets (buf, sizeof(buf), f) != NULL)
+    {
+      if (buf[0] != '%')
+	{
+	  if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
+	    nextlevel = 1;
+	    break;
+	  } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
+	    dict[k]->lhmin = atoi(buf + 13);
+	    continue;
+	  } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
+	    dict[k]->rhmin = atoi(buf + 14);
+	    continue;
+	  } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
+	    dict[k]->clhmin = atoi(buf + 21);
+	    continue;
+	  } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
+	    dict[k]->crhmin = atoi(buf + 22);
+	    continue;
+	  } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
+	    char * space = buf + 8;
+	    while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
+	    if (*buf != '\0') dict[k]->nohyphen = hnj_strdup(space);
+	    if (dict[k]->nohyphen) {
+	        char * nhe = dict[k]->nohyphen + strlen(dict[k]->nohyphen) - 1;
+	        *nhe = 0;
+	        for (nhe = nhe - 1; nhe > dict[k]->nohyphen; nhe--) {
+	                if (*nhe == ',') {
+	                    dict[k]->nohyphenl++;
+	                    *nhe = 0;
+	                }
+	        }
+	    }
+	    continue;
+	  } 
+	  j = 0;
+	  pattern[j] = '0';
+          repl = strchr(buf, '/');
+          replindex = 0;
+          replcut = 0;
+          if (repl) {
+            char * index = strchr(repl + 1, ',');
+            *repl = '\0';
+            if (index) {
+                char * index2 = strchr(index + 1, ',');
+                *index = '\0';
+                if (index2) {
+                    *index2 = '\0';
+                    replindex = (signed char) atoi(index + 1) - 1;
+                    replcut = (signed char) atoi(index2 + 1);                
+                }
+            } else {
+                hnj_strchomp(repl + 1);
+                replindex = 0;
+                replcut = (signed char) strlen(buf);
+            }
+            repl = hnj_strdup(repl + 1);
+          }
+	  for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
+	    {
+	      if (buf[i] >= '0' && buf[i] <= '9')
+		pattern[j] = buf[i];
+	      else
+		{
+		  word[j] = buf[i];
+		  pattern[++j] = '0';
+		}
+	    }
+	  word[j] = '\0';
+	  pattern[j + 1] = '\0';
+
+          i = 0;
+	  if (!repl) {
+	    /* Optimize away leading zeroes */
+            for (; pattern[i] == '0'; i++);
+          } else {
+            if (*word == '.') i++;
+            /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
+            if (dict[k]->utf8) {
+                int pu = -1;        /* unicode character position */
+                int ps = -1;        /* unicode start position (original replindex) */
+                int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
+                for (; pc < (strlen(word) + 1); pc++) {
+                /* beginning of an UTF-8 character (not '10' start bits) */
+                    if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
+                    if ((ps < 0) && (replindex == pu)) {
+                        ps = replindex;
+                        replindex = (signed char) pc;
+                    }
+                    if ((ps >= 0) && ((pu - ps) == replcut)) {
+                        replcut = (signed char) (pc - replindex);
+                        break;
+                    }
+                }
+                if (*word == '.') replindex--;
+            }
+          }
+
+#ifdef VERBOSE
+	  printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
+#endif
+	  found = hnj_hash_lookup (hashtab, word);
+	  state_num = hnj_get_state (dict[k], hashtab, word);
+	  dict[k]->states[state_num].match = hnj_strdup (pattern + i);
+	  dict[k]->states[state_num].repl = repl;
+	  dict[k]->states[state_num].replindex = replindex;
+          if (!replcut) {
+            dict[k]->states[state_num].replcut = (signed char) strlen(word);
+          } else {
+            dict[k]->states[state_num].replcut = replcut;
+          }
+
+	  /* now, put in the prefix transitions */
+          for (; found < 0 ;j--)
+	    {
+	      last_state = state_num;
+	      ch = word[j - 1];
+	      word[j - 1] = '\0';
+	      found = hnj_hash_lookup (hashtab, word);
+	      state_num = hnj_get_state (dict[k], hashtab, word);
+	      hnj_add_trans (dict[k], state_num, last_state, ch);
+	    }
+	}
+    }
+
+  /* Could do unioning of matches here (instead of the preprocessor script).
+     If we did, the pseudocode would look something like this:
+
+     foreach state in the hash table
+        foreach i = [1..length(state) - 1]
+           state to check is substr (state, i)
+           look it up
+           if found, and if there is a match, union the match in.
+
+     It's also possible to avoid the quadratic blowup by doing the
+     search in order of increasing state string sizes - then you
+     can break the loop after finding the first match.
+
+     This step should be optional in any case - if there is a
+     preprocessed rule table, it's always faster to use that.
+
+*/
+
+  /* put in the fallback states */
+  for (i = 0; i < HASH_SIZE; i++)
+    for (e = hashtab->entries[i]; e; e = e->next)
+      {
+	if (*(e->key)) for (j = 1; 1; j++)
+	  {          
+	    state_num = hnj_hash_lookup (hashtab, e->key + j);
+	    if (state_num >= 0)
+	      break;
+	  }
+        /* KBH: FIXME state 0 fallback_state should always be -1? */
+	if (e->val)
+	  dict[k]->states[e->val].fallback_state = state_num;
+      }
+#ifdef VERBOSE
+  for (i = 0; i < HASH_SIZE; i++)
+    for (e = hashtab->entries[i]; e; e = e->next)
+      {
+	printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
+		dict[k]->states[e->val].fallback_state);
+	for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
+	  printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
+		  dict[k]->states[e->val].trans[j].new_state);
+      }
+#endif
+
+#ifndef VERBOSE
+  hnj_hash_free (hashtab);
+#endif
+  state_num = 0;
+}
+  fclose(f);
+  if (k == 2) dict[0]->nextlevel = dict[1];
+  return dict[0];
+}
+
+void hnj_hyphen_free (HyphenDict *dict)
+{
+  int state_num;
+  HyphenState *hstate;
+
+  for (state_num = 0; state_num < dict->num_states; state_num++)
+    {
+      hstate = &dict->states[state_num];
+      if (hstate->match)
+	hnj_free (hstate->match);
+      if (hstate->repl)
+	hnj_free (hstate->repl);
+      if (hstate->trans)
+	hnj_free (hstate->trans);
+    }
+  if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
+
+  if (dict->nohyphen) hnj_free(dict->nohyphen);
+
+  hnj_free (dict->states);
+
+  hnj_free (dict);
+}
+
+#define MAX_WORD 256
+
+int hnj_hyphen_hyphenate (HyphenDict *dict,
+			   const char *word, int word_size,
+			   char *hyphens)
+{
+  char prep_word_buf[MAX_WORD];
+  char *prep_word;
+  int i, j, k;
+  int state;
+  char ch;
+  HyphenState *hstate;
+  char *match;
+  int offset;
+
+  if (word_size + 3 < MAX_WORD)
+    prep_word = prep_word_buf;
+  else
+    prep_word = hnj_malloc (word_size + 3);
+
+  j = 0;
+  prep_word[j++] = '.';
+
+  for (i = 0; i < word_size; i++)
+      prep_word[j++] = word[i];
+
+  prep_word[j++] = '.';
+  prep_word[j] = '\0';
+
+  for (i = 0; i < word_size + 5; i++)
+    hyphens[i] = '0';
+
+#ifdef VERBOSE
+  printf ("prep_word = %s\n", prep_word);
+#endif
+
+  /* now, run the finite state machine */
+  state = 0;
+  for (i = 0; i < j; i++)
+    {
+      ch = prep_word[i];
+      for (;;)
+	{
+
+	  if (state == -1) {
+            /* return 1; */
+	    /*  KBH: FIXME shouldn't this be as follows? */
+            state = 0;
+            goto try_next_letter;
+          }          
+
+#ifdef VERBOSE
+	  char *state_str;
+	  state_str = get_state_str (state);
+
+	  for (k = 0; k < i - strlen (state_str); k++)
+	    putchar (' ');
+	  printf ("%s", state_str);
+#endif
+
+	  hstate = &dict->states[state];
+	  for (k = 0; k < hstate->num_trans; k++)
+	    if (hstate->trans[k].ch == ch)
+	      {
+		state = hstate->trans[k].new_state;
+		goto found_state;
+	      }
+	  state = hstate->fallback_state;
+#ifdef VERBOSE
+	  printf (" falling back, fallback_state %d\n", state);
+#endif
+	}
+    found_state:
+#ifdef VERBOSE
+      printf ("found state %d\n",state);
+#endif
+      /* Additional optimization is possible here - especially,
+	 elimination of trailing zeroes from the match. Leading zeroes
+	 have already been optimized. */
+      match = dict->states[state].match;
+      /* replacing rules not handled by hyphen_hyphenate() */
+      if (match && !dict->states[state].repl)
+	{
+	  offset = i + 1 - strlen (match);
+#ifdef VERBOSE
+	  for (k = 0; k < offset; k++)
+	    putchar (' ');
+	  printf ("%s\n", match);
+#endif
+	  /* This is a linear search because I tried a binary search and
+	     found it to be just a teeny bit slower. */
+	  for (k = 0; match[k]; k++)
+	    if (hyphens[offset + k] < match[k])
+	      hyphens[offset + k] = match[k];
+	}
+
+      /* KBH: we need this to make sure we keep looking in a word */
+      /* for patterns even if the current character is not known in state 0 */
+      /* since patterns for hyphenation may occur anywhere in the word */
+      try_next_letter: ;
+
+    }
+#ifdef VERBOSE
+  for (i = 0; i < j; i++)
+    putchar (hyphens[i]);
+  putchar ('\n');
+#endif
+
+  for (i = 0; i < j - 4; i++)
+#if 0
+    if (hyphens[i + 1] & 1)
+      hyphens[i] = '-';
+#else
+    hyphens[i] = hyphens[i + 1];
+#endif
+  hyphens[0] = '0';
+  for (; i < word_size; i++)
+    hyphens[i] = '0';
+  hyphens[word_size] = '\0';
+
+  if (prep_word != prep_word_buf)
+    hnj_free (prep_word);
+    
+  return 0;    
+}
+
+/* Unicode ligature length */
+int hnj_ligature(unsigned char c) {
+    switch (c) {
+        case 0x80:			/* ff */
+        case 0x81:			/* fi */
+        case 0x82: return LIG_xx;	/* fl */
+        case 0x83:			/* ffi */
+        case 0x84: return LIG_xxx;	/* ffl */
+        case 0x85:			/* long st */
+        case 0x86: return LIG_xx;	/* st */
+    }
+    return 0;
+}
+
+/* character length of the first n byte of the input word */
+int hnj_hyphen_strnlen(const char * word, int n, int utf8)
+{
+    int i = 0;
+    int j = 0;
+    while (j < n && word[j] != '\0') {
+      i++;
+      // Unicode ligature support
+      if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
+        i += hnj_ligature(word[j + 2]);
+      }
+      for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
+    }
+    return i;
+}
+
+int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
+	char *** rep, int ** pos, int ** cut, int lhmin)
+{
+    int i = 1, j;
+
+    // Unicode ligature support
+    if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC))  {
+      i += hnj_ligature(word[2]);
+    }
+
+    for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
+      // check length of the non-standard part
+      if (*rep && *pos && *cut && (*rep)[j]) {
+        char * rh = strchr((*rep)[j], '=');
+        if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
+          hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
+            free((*rep)[j]);
+            (*rep)[j] = NULL;
+            hyphens[j] = '0';
+          }
+       } else {
+         hyphens[j] = '0';
+       }
+       j++;
+
+       // Unicode ligature support
+       if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
+         i += hnj_ligature(word[j + 2]);
+       }
+    } while (utf8 && (word[j] & 0xc0) == 0x80);
+    return 0;
+}
+
+int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
+	char *** rep, int ** pos, int ** cut, int rhmin)
+{
+    int i;
+    int j = word_size - 2;    
+    for (i = 1; i < rhmin && j > 0; j--) {
+      // check length of the non-standard part
+      if (*rep && *pos && *cut && (*rep)[j]) {
+        char * rh = strchr((*rep)[j], '=');
+        if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
+          hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
+            free((*rep)[j]);
+            (*rep)[j] = NULL;
+            hyphens[j] = '0';
+          }
+       } else {
+         hyphens[j] = '0';
+       }
+       if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
+    }
+    return 0;
+}
+
+// recursive function for compound level hyphenation
+int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
+    char * hyphens, char *** rep, int ** pos, int ** cut,
+    int clhmin, int crhmin, int lend, int rend)
+{
+  char prep_word_buf[MAX_WORD];
+  char *prep_word;
+  int i, j, k;
+  int state;
+  char ch;
+  HyphenState *hstate;
+  char *match;
+  char *repl;
+  signed char replindex;
+  signed char replcut;
+  int offset;
+  int matchlen_buf[MAX_CHARS];
+  int matchindex_buf[MAX_CHARS];
+  char * matchrepl_buf[MAX_CHARS];
+  int * matchlen;
+  int * matchindex;
+  char ** matchrepl;  
+  int isrepl = 0;
+  int nHyphCount;
+
+  if (word_size + 3 < MAX_CHARS) {
+    prep_word = prep_word_buf;
+    matchlen = matchlen_buf;
+    matchindex = matchindex_buf;
+    matchrepl = matchrepl_buf;
+  } else {
+    prep_word = hnj_malloc (word_size + 3);
+    matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
+    matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
+    matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
+  }
+
+  j = 0;
+  prep_word[j++] = '.';
+  
+  for (i = 0; i < word_size; i++)
+      prep_word[j++] = word[i];
+
+  prep_word[j++] = '.';
+  prep_word[j] = '\0';
+
+  for (i = 0; i < j; i++)
+    hyphens[i] = '0';    
+
+#ifdef VERBOSE
+  printf ("prep_word = %s\n", prep_word);
+#endif
+
+  /* now, run the finite state machine */
+  state = 0;
+  for (i = 0; i < j; i++)
+    {
+      ch = prep_word[i];
+      for (;;)
+	{
+
+	  if (state == -1) {
+            /* return 1; */
+	    /*  KBH: FIXME shouldn't this be as follows? */
+            state = 0;
+            goto try_next_letter;
+          }          
+
+#ifdef VERBOSE
+	  char *state_str;
+	  state_str = get_state_str (state);
+
+	  for (k = 0; k < i - strlen (state_str); k++)
+	    putchar (' ');
+	  printf ("%s", state_str);
+#endif
+
+	  hstate = &dict->states[state];
+	  for (k = 0; k < hstate->num_trans; k++)
+	    if (hstate->trans[k].ch == ch)
+	      {
+		state = hstate->trans[k].new_state;
+		goto found_state;
+	      }
+	  state = hstate->fallback_state;
+#ifdef VERBOSE
+	  printf (" falling back, fallback_state %d\n", state);
+#endif
+	}
+    found_state:
+#ifdef VERBOSE
+      printf ("found state %d\n",state);
+#endif
+      /* Additional optimization is possible here - especially,
+	 elimination of trailing zeroes from the match. Leading zeroes
+	 have already been optimized. */
+      match = dict->states[state].match;
+      repl = dict->states[state].repl;
+      replindex = dict->states[state].replindex;
+      replcut = dict->states[state].replcut;
+      /* replacing rules not handled by hyphen_hyphenate() */
+      if (match)
+	{
+	  offset = i + 1 - strlen (match);
+#ifdef VERBOSE
+	  for (k = 0; k < offset; k++)
+	    putchar (' ');
+	  printf ("%s (%s)\n", match, repl);
+#endif
+          if (repl) {
+            if (!isrepl) for(; isrepl < word_size; isrepl++) {
+                matchrepl[isrepl] = NULL;
+                matchindex[isrepl] = -1;
+            }
+            matchlen[offset + replindex] = replcut;
+          }
+	  /* This is a linear search because I tried a binary search and
+	     found it to be just a teeny bit slower. */
+	  for (k = 0; match[k]; k++) {
+	    if ((hyphens[offset + k] < match[k])) {
+	      hyphens[offset + k] = match[k];
+              if (match[k]&1) {
+                matchrepl[offset + k] = repl;
+                if (repl && (k >= replindex) && (k <= replindex + replcut)) {
+                    matchindex[offset + replindex] = offset + k;
+                }
+              }
+            }
+          }
+          
+	}
+
+      /* KBH: we need this to make sure we keep looking in a word */
+      /* for patterns even if the current character is not known in state 0 */
+      /* since patterns for hyphenation may occur anywhere in the word */
+      try_next_letter: ;
+
+    }
+#ifdef VERBOSE
+  for (i = 0; i < j; i++)
+    putchar (hyphens[i]);
+  putchar ('\n');
+#endif
+
+  for (i = 0; i < j - 3; i++)
+#if 0
+    if (hyphens[i + 1] & 1)
+      hyphens[i] = '-';
+#else
+    hyphens[i] = hyphens[i + 1];
+#endif
+  for (; i < word_size; i++)
+    hyphens[i] = '0';
+  hyphens[word_size] = '\0';
+
+       /* now create a new char string showing hyphenation positions */
+       /* count the hyphens and allocate space for the new hyphenated string */
+       nHyphCount = 0;
+       for (i = 0; i < word_size; i++)
+          if (hyphens[i]&1)
+             nHyphCount++;
+       j = 0;
+       for (i = 0; i < word_size; i++) {
+           if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { 
+                if (rep && pos && cut) {
+                    if (!*rep && !*pos && !*cut) {
+                        int k;
+                        *rep = (char **) malloc(sizeof(char *) * word_size);
+                        *pos = (int *) malloc(sizeof(int) * word_size);
+                        *cut = (int *) malloc(sizeof(int) * word_size);
+                        for (k = 0; k < word_size; k++) {
+                            (*rep)[k] = NULL;
+                            (*pos)[k] = 0;
+                            (*cut)[k] = 0;
+                        }
+                    }
+                    (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
+                    (*pos)[matchindex[i] - 1] = matchindex[i] - i;
+                    (*cut)[matchindex[i] - 1] = matchlen[i];
+                }
+                j += strlen(matchrepl[matchindex[i]]);
+                i += matchlen[i] - 1;
+          }
+       }
+
+  if (matchrepl != matchrepl_buf) {
+    hnj_free (matchrepl);
+    hnj_free (matchlen);
+    hnj_free (matchindex);
+  }
+
+  // recursive hyphenation of the first (compound) level segments
+  if (dict->nextlevel) {
+     char * rep2_buf[MAX_WORD];
+     int pos2_buf[MAX_WORD];
+     int cut2_buf[MAX_WORD];
+     char hyphens2_buf[MAX_WORD];
+     char ** rep2;
+     int * pos2;
+     int * cut2;
+     char * hyphens2;
+     int begin = 0;
+     if (word_size < MAX_CHARS) {
+        rep2 = rep2_buf;
+        pos2 = pos2_buf;
+        cut2 = cut2_buf;
+        hyphens2 = hyphens2_buf;
+     } else {
+        rep2 = hnj_malloc (word_size * sizeof(char *));
+        pos2 = hnj_malloc (word_size * sizeof(int));
+        cut2 = hnj_malloc (word_size * sizeof(int));
+        hyphens2 = hnj_malloc (word_size);
+     }
+     for (i = 0; i < word_size; i++) rep2[i] = NULL;
+     for (i = 0; i < word_size; i++) if 
+        (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
+        if (i - begin > 1) {
+            int hyph = 0;
+            prep_word[i + 2] = '\0';
+            /* non-standard hyphenation at compound boundary (Schiffahrt) */
+            if (*rep && *pos && *cut && (*rep)[i]) {
+                char * l = strchr((*rep)[i], '=');
+                strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
+                if (l) {
+                    hyph = (l - (*rep)[i]) - (*pos)[i];
+                    prep_word[2 + i + hyph] = '\0';
+                }
+            }
+            hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
+                hyphens2, &rep2, &pos2, &cut2, clhmin,
+                crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
+            for (j = 0; j < i - begin - 1; j++) {
+                hyphens[begin + j] = hyphens2[j];
+                if (rep2[j] && rep && pos && cut) {
+                    if (!*rep && !*pos && !*cut) {
+                        int k;
+                        *rep = (char **) malloc(sizeof(char *) * word_size);
+                        *pos = (int *) malloc(sizeof(int) * word_size);
+                        *cut = (int *) malloc(sizeof(int) * word_size);
+                        for (k = 0; k < word_size; k++) {
+                            (*rep)[k] = NULL;
+                            (*pos)[k] = 0;
+                            (*cut)[k] = 0;
+                        }
+                    }
+                    (*rep)[begin + j] = rep2[j];
+                    (*pos)[begin + j] = pos2[j];
+                    (*cut)[begin + j] = cut2[j];
+                }
+            }
+            prep_word[i + 2] = word[i + 1];
+            if (*rep && *pos && *cut && (*rep)[i]) {
+                strcpy(prep_word + 1, word);
+            }
+        }
+        begin = i + 1;
+        for (j = 0; j < word_size; j++) rep2[j] = NULL;
+     }
+     
+     // non-compound
+     if (begin == 0) {
+        hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
+            hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
+        if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
+            rep, pos, cut, clhmin);
+        if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
+            rep, pos, cut, crhmin);
+     }
+     
+     if (rep2 != rep2_buf) {
+        free(rep2);
+        free(cut2);
+        free(pos2);
+        free(hyphens2);
+     }
+  }
+
+  if (prep_word != prep_word_buf) hnj_free (prep_word);
+  return 0;
+}
+
+/* UTF-8 normalization of hyphen and non-standard positions */
+int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
+	char *** rep, int ** pos, int ** cut)
+{
+  int i, j, k;
+  if ((((unsigned char) word[0]) >> 6) == 2) {
+    fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
+    return 1;
+  }
+
+  /* calculate UTF-8 character positions */
+  for (i = 0, j = -1; i < word_size; i++) {
+    /* beginning of an UTF-8 character (not '10' start bits) */
+    if ((((unsigned char) word[i]) >> 6) != 2) j++;
+    hyphens[j] = hyphens[i];
+    if (rep && pos && cut && *rep && *pos && *cut) {
+        int l = (*pos)[i];
+        (*pos)[j] = 0;
+        for (k = 0; k < l; k++) {
+            if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
+        }
+        k = i - l + 1;
+        l = k + (*cut)[i];
+        (*cut)[j] = 0;        
+        for (; k < l; k++) {
+            if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
+        }
+        (*rep)[j] = (*rep)[i];
+        if (j < i) {
+            (*rep)[i] = NULL;
+            (*pos)[i] = 0;
+            (*cut)[i] = 0;
+        }
+    }
+  }
+  hyphens[j + 1] = '\0';
+  return 0;
+}
+
+/* get the word with all possible hyphenations (output: hyphword) */
+void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, 
+    char * hyphword, char *** rep, int ** pos, int ** cut)
+{
+  int i, j;
+  for (i = 0, j = 0; i < l; i++, j++) {
+    if (hyphens[i]&1) {
+      hyphword[j] = word[i];
+      if (*rep && *pos && *cut && (*rep)[i]) {
+        strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
+        j += strlen((*rep)[i]) - (*pos)[i];
+        i += (*cut)[i] - (*pos)[i];
+      } else hyphword[++j] = '=';
+    } else hyphword[j] = word[i];
+  }
+  hyphword[j] = '\0';
+}
+
+
+/* main api function with default hyphenmin parameters */
+int hnj_hyphen_hyphenate2 (HyphenDict *dict,
+			   const char *word, int word_size, char * hyphens,
+			   char *hyphword, char *** rep, int ** pos, int ** cut)
+{
+  hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
+    dict->clhmin, dict->crhmin, 1, 1);
+  hnj_hyphen_lhmin(dict->utf8, word, word_size,
+    hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
+  hnj_hyphen_rhmin(dict->utf8, word, word_size,
+    hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
+
+  /* nohyphen */
+  if (dict->nohyphen) {
+    char * nh = dict->nohyphen;
+    int nhi;
+    for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
+        char * nhy = (char *) strstr(word, nh);
+        while (nhy) {
+            hyphens[nhy - word + strlen(nh) - 1] = 0;
+            if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;
+            nhy = (char *) strstr(nhy + 1, nh);
+        }
+        nh = nh + strlen(nh) + 1;
+    }
+  }
+
+  if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
+  if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
+  return 0;
+}
+
+/* previous main api function with hyphenmin parameters */
+int hnj_hyphen_hyphenate3 (HyphenDict *dict,
+	const char *word, int word_size, char * hyphens,
+	char *hyphword, char *** rep, int ** pos, int ** cut,
+	int lhmin, int rhmin, int clhmin, int crhmin)
+{
+  lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
+  rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
+  hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
+    clhmin, crhmin, 1, 1);
+  hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
+    rep, pos, cut, (lhmin > 0 ? lhmin : 2));
+  hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
+    rep, pos, cut, (rhmin > 0 ? rhmin : 2));
+  if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
+
+  /* nohyphen */
+  if (dict->nohyphen) {
+    char * nh = dict->nohyphen;
+    int nhi;
+    for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
+        char * nhy = (char *) strstr(word, nh);
+        while (nhy) {
+            hyphens[nhy - word + strlen(nh) - 1] = 0;
+            if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;
+            nhy = (char *) strstr(nhy + 1, nh);
+        }
+        nh = nh + strlen(nh) + 1;
+    }
+  }
+
+  if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
+  return 0;
+}
+/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
+ * licenses follows.
+ */
+
+/* LibHnj - a library for high quality hyphenation and justification
+ * Copyright (C) 1998 Raph Levien, 
+ * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 
+ *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
  *           (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
  *
  * This library is free software; you can redistribute it and/or