Document vteunistr.

2008-12-12 Behdad Esfahbod <behdad@gnome.org> * doc/reference/Makefile.am: * src/vteunistr.c (unistr_comp_hash), (unistr_comp_equal), (_vte_unistr_append_unichar), (_vte_unistr_get_base), (_vte_unistr_append_to_string), (_vte_unistr_strlen): * src/vteunistr.h: Document vteunistr. svn path=/trunk/; revision=2344
author: Behdad Esfahbod <behdad@gnome.org> 2008-12-12 11:20:31 +0000
committer: Behdad Esfahbod <behdad@src.gnome.org> 2008-12-12 11:20:31 +0000
commit: d453dce12191a1ee19fa70d83bab013999006385 (patch)
tree: 3840c3511ddcda498e1b113de39cd2cfd530942a
parent: beb61da0f7d5d1730f532cc2eb37df507a267a87 (diff)
download: vte-d453dce12191a1ee19fa70d83bab013999006385.tar.gz
4 files changed, 134 insertions, 48 deletions
diff --git a/ChangeLog b/ChangeLog
index c1eb288e..c09ba1d5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
 2008-12-12  Behdad Esfahbod  <behdad@gnome.org>
 
+	* doc/reference/Makefile.am:
+	* src/vteunistr.c (unistr_comp_hash), (unistr_comp_equal),
+	(_vte_unistr_append_unichar), (_vte_unistr_get_base),
+	(_vte_unistr_append_to_string), (_vte_unistr_strlen):
+	* src/vteunistr.h:
+	Document vteunistr.
+
+2008-12-12  Behdad Esfahbod  <behdad@gnome.org>
+
 	* src/vteunistr.h: Minor cleanup.
 
 2008-12-12  Behdad Esfahbod  <behdad@gnome.org>
diff --git a/doc/reference/Makefile.am b/doc/reference/Makefile.am
index 385969c8..b2d07a07 100644
--- a/doc/reference/Makefile.am
+++ b/doc/reference/Makefile.am
@@ -47,6 +47,7 @@ IGNORE_HFILES= \
 	vteskel.h \
 	vtetc.h \
 	vtetree.h \
+	vteunistr.h \
 	vtexft.h
 
 
diff --git a/src/vteunistr.c b/src/vteunistr.c
index 534d15b2..f6f7bc31 100644
--- a/src/vteunistr.c
+++ b/src/vteunistr.c
@@ -25,8 +25,57 @@
 
 #include "vteunistr.h"
 
-#define VTE_UNISTR_START 0x80000000
+/* Overview:
+ *
+ * The way vteunistr is implemented is very simple: Unicode only defines
+ * codepoints less than 0x10FFFF.  That leaves plenty of room in a guint32 to
+ * use for other things.  So, whenever our "string" contains only one Unicode
+ * character, we use its code as our vteunistr.  Otherwise, we register the
+ * string in a central registry and use assign a unique number to it.  That
+ * number can be thought as "our own private non-unicode code for this
+ * sequence of characters".
+ *
+ * The rest of the problem would be how to efficiently implement this
+ * registry.  It does *NOT* really have to be efficient at all, as it will
+ * only be access in case of combining marks.  And the strings are pretty
+ * short most of the time.  But our implementation is quite efficient and
+ * nifty anyway.
+ *
+ * The access pattern of using vteunistr's is that we have a vteunistr in a
+ * terminal cell, a new gunichar comes in and we decide to combine with it,
+ * and we combine them and get a new vteunistr.  So, that is exactly how we
+ * encode vteunistr's: all we need to know about a vteunistr to be able to
+ * reconstruct its string is the vteunistr and the gunichar that joined to
+ * form it.  That's what VteUnistrDecomp is.  That is the decomposition.
+ *
+ * We start giving new vteunistr's unique numbers starting at
+ * VTE_UNISTR_START+1 and going up.  We keep the decompositions in a GArray,
+ * called unistr_decomp.  The first entry of the array is unused (that's why
+ * we start from VTE_UNISTR_START plus one).  The decomposition table provides
+ * enough information to efficiently answer questions like "what's the first
+ * gunichar in this vteunistr?", "what's the sequence of gunichar's in this
+ * vteunistr?", and "how many gunichar's are there in this vteunistr?".
+ *
+ * We still do not have any efficient way to construct new vteunistr's though.
+ * Given a vteunistr and a gunichar, we have to walk over the entire
+ * decomposition table to see if we have already registered (encoded) this
+ * combination.  To make that operation fast, we use a reverse map, that is,
+ * a GHashTable named unistr_comp.  The hash table maps a decomposition to its
+ * encoded vteunistr value.  The value obivously fits in a pointer and does
+ * not need memory allocation.  We also want to avoid allocating memory for
+ * the keys of the hash table entries, as we already have those decompositions
+ * in the memory in the unistr_decomp array.  We cannot use direct pointers
+ * though as when growing the GArray may resize and move to a new memory
+ * buffer, rendering all our pointers invalid.  For this reason, we keep the
+ * index into the array as our hash table key.  When we want to perform a
+ * lookup in the hash table, we insert the decomposition that we are searching
+ * for as item zero in the unistr_decomp table, then lookup for an equal entry
+ * of it in the hash table.  Finally, if the hash lookup fails, we add the new
+ * decomposition to the lookup array and the hash table and return the newly
+ * encoded vteunistr value.
+ */
 
+#define VTE_UNISTR_START 0x80000000
 
 static vteunistr unistr_next = VTE_UNISTR_START + 1;
 
@@ -38,26 +87,22 @@ struct VteUnistrDecomp {
 GArray     *unistr_decomp;
 GHashTable *unistr_comp;
 
+#define DECOMP_FROM_INDEX(i)	g_array_index (unistr_decomp, struct VteUnistrDecomp, (i))
+#define DECOMP_FROM_UNISTR(s)	DECOMP_FROM_INDEX ((s) - VTE_UNISTR_START)
+
 static guint
 unistr_comp_hash (gconstpointer key)
 {
 	struct VteUnistrDecomp *decomp;
-	decomp = &g_array_index (unistr_decomp,
-				 struct VteUnistrDecomp,
-				 GPOINTER_TO_UINT (key));
+	decomp = &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (key));
 	return decomp->prefix ^ decomp->suffix;
 }
 
 static gboolean
-unistr_comp_equal (gconstpointer a,
-		      gconstpointer b)
+unistr_comp_equal (gconstpointer a, gconstpointer b)
 {
-	return 0 == memcmp (&g_array_index (unistr_decomp,
-					    struct VteUnistrDecomp,
-					    GPOINTER_TO_UINT (a)),
-			    &g_array_index (unistr_decomp,
-					    struct VteUnistrDecomp,
-					    GPOINTER_TO_UINT (b)),
+	return 0 == memcmp (&DECOMP_FROM_INDEX (GPOINTER_TO_UINT (a)),
+			    &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (b)),
 			    sizeof (struct VteUnistrDecomp));
 }
 
@@ -71,17 +116,12 @@ _vte_unistr_append_unichar (vteunistr s, gunichar c)
 	decomp.suffix = c;
 
 	if (G_UNLIKELY (!unistr_decomp)) {
-		unistr_decomp = g_array_new (FALSE, TRUE,
-						sizeof (struct VteUnistrDecomp));
+		unistr_decomp = g_array_new (FALSE, TRUE, sizeof (struct VteUnistrDecomp));
 		g_array_set_size (unistr_decomp, 1);
-		unistr_comp = g_hash_table_new (unistr_comp_hash,
-						unistr_comp_equal);
+		unistr_comp = g_hash_table_new (unistr_comp_hash, unistr_comp_equal);
 	} else {
-		g_array_index (unistr_decomp,
-			       struct VteUnistrDecomp,
-			       0) = decomp;
-		ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp,
-							     GUINT_TO_POINTER (0)));
+		DECOMP_FROM_INDEX (0) = decomp;
+		ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp, GUINT_TO_POINTER (0)));
 	}
 
 	if (G_UNLIKELY (!ret)) {
@@ -95,30 +135,12 @@ _vte_unistr_append_unichar (vteunistr s, gunichar c)
 	return ret;
 }
 
-/* Unused
-int
-_vte_unistr_strlen (vteunistr s)
-{
-	int len = 1;
-	g_return_val_if_fail (s < unistr_next, len);
-	while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
-		s = g_array_index (unistr_decomp,
-				   struct VteUnistrDecomp,
-				   s - VTE_UNISTR_START).prefix;
-		len++;
-	}
-	return len;
-}
-*/
-
 gunichar
 _vte_unistr_get_base (vteunistr s)
 {
 	g_return_val_if_fail (s < unistr_next, s);
 	while (G_UNLIKELY (s >= VTE_UNISTR_START))
-		s = g_array_index (unistr_decomp,
-				   struct VteUnistrDecomp,
-				   s - VTE_UNISTR_START).prefix;
+		s = DECOMP_FROM_UNISTR (s).prefix;
 	return (gunichar) s;
 }
 
@@ -128,11 +150,23 @@ _vte_unistr_append_to_string (vteunistr s, GString *gs)
 	g_return_if_fail (s < unistr_next);
 	if (G_UNLIKELY (s >= VTE_UNISTR_START)) {
 		struct VteUnistrDecomp *decomp;
-		decomp = &g_array_index (unistr_decomp,
-					 struct VteUnistrDecomp,
-					 s - VTE_UNISTR_START);
+		decomp = &DECOMP_FROM_UNISTR (s);
 		_vte_unistr_append_to_string (decomp->prefix, gs);
 		s = decomp->suffix;
 	}
 	g_string_append_unichar (gs, (gunichar) s);
 }
+
+#if 0 /* unused */
+int
+_vte_unistr_strlen (vteunistr s)
+{
+	int len = 1;
+	g_return_val_if_fail (s < unistr_next, len);
+	while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
+		s = DECOMP_FROM_UNISTR (s).prefix;
+		len++;
+	}
+	return len;
+}
+#endif
diff --git a/src/vteunistr.h b/src/vteunistr.h
index 730e3239..f8b7bf78 100644
--- a/src/vteunistr.h
+++ b/src/vteunistr.h
@@ -26,22 +26,64 @@
 
 G_BEGIN_DECLS
 
+/**
+ * vteunistr:
+ *
+ * vteunistr is a gunichar-compatible way to store strings.  A string
+ * consisting of a single unichar c is represented as the same value
+ * as c itself.  In that sense, gunichars can be readily used as
+ * vteunistrs.  Longer strings can be built by appending a unichar
+ * to an already existing string.
+ *
+ * vteunistr is essentially just a gunicode-compatible quark value.
+ * It can be used to store strings (of a base followed by combining
+ * characters) where the code was designed to only allow one character.
+ *
+ * Strings are internalized efficiently and never freed.  No memory
+ * management of vteunistr values is needed.
+ **/
 typedef guint32 vteunistr;
 
+/**
+ * _vte_unistr_append_unichar:
+ * @s: a #vteunistr
+ * @c: Unicode character to append to @s
+ *
+ * Creates a vteunistr value for the string @s followed by the
+ * character @c.
+ *
+ * Returns: the new #vteunistr value
+ **/
 vteunistr
 _vte_unistr_append_unichar (vteunistr s, gunichar c);
 
-/* Unused
-int
-_vte_unistr_strlen (vteunistr s);
-*/
-
 gunichar
 _vte_unistr_get_base (vteunistr s);
 
+/**
+ * _vte_unistr_append_to_string:
+ * @s: a #vteunistr
+ * @gs: a #GString to append @s to
+ *
+ * Appends @s to @gs.  This is how one converts a #vteunistr to a
+ * traditional string.
+ **/
 void
 _vte_unistr_append_to_string (vteunistr s, GString *gs);
 
+#if 0 /* unused */
+/**
+ * _vte_unistr_strlen:
+ * @s: a #vteunistr
+ *
+ * Counts the number of character in @s.
+ *
+ * Returns: length of @s in characters.
+ **/
+int
+_vte_unistr_strlen (vteunistr s);
+#endif
+
 G_END_DECLS
 
 #endif
author	Behdad Esfahbod <behdad@gnome.org>	2008-12-12 11:20:31 +0000
committer	Behdad Esfahbod <behdad@src.gnome.org>	2008-12-12 11:20:31 +0000
commit	d453dce12191a1ee19fa70d83bab013999006385 (patch)
tree	3840c3511ddcda498e1b113de39cd2cfd530942a
parent	beb61da0f7d5d1730f532cc2eb37df507a267a87 (diff)
download	vte-d453dce12191a1ee19fa70d83bab013999006385.tar.gz