summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBehdad Esfahbod <behdad@gnome.org>2008-12-12 11:20:31 +0000
committerBehdad Esfahbod <behdad@src.gnome.org>2008-12-12 11:20:31 +0000
commitd453dce12191a1ee19fa70d83bab013999006385 (patch)
tree3840c3511ddcda498e1b113de39cd2cfd530942a
parentbeb61da0f7d5d1730f532cc2eb37df507a267a87 (diff)
downloadvte-d453dce12191a1ee19fa70d83bab013999006385.tar.gz
Document vteunistr.
2008-12-12 Behdad Esfahbod <behdad@gnome.org> * doc/reference/Makefile.am: * src/vteunistr.c (unistr_comp_hash), (unistr_comp_equal), (_vte_unistr_append_unichar), (_vte_unistr_get_base), (_vte_unistr_append_to_string), (_vte_unistr_strlen): * src/vteunistr.h: Document vteunistr. svn path=/trunk/; revision=2344
-rw-r--r--ChangeLog9
-rw-r--r--doc/reference/Makefile.am1
-rw-r--r--src/vteunistr.c120
-rw-r--r--src/vteunistr.h52
4 files changed, 134 insertions, 48 deletions
diff --git a/ChangeLog b/ChangeLog
index c1eb288e..c09ba1d5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
2008-12-12 Behdad Esfahbod <behdad@gnome.org>
+ * doc/reference/Makefile.am:
+ * src/vteunistr.c (unistr_comp_hash), (unistr_comp_equal),
+ (_vte_unistr_append_unichar), (_vte_unistr_get_base),
+ (_vte_unistr_append_to_string), (_vte_unistr_strlen):
+ * src/vteunistr.h:
+ Document vteunistr.
+
+2008-12-12 Behdad Esfahbod <behdad@gnome.org>
+
* src/vteunistr.h: Minor cleanup.
2008-12-12 Behdad Esfahbod <behdad@gnome.org>
diff --git a/doc/reference/Makefile.am b/doc/reference/Makefile.am
index 385969c8..b2d07a07 100644
--- a/doc/reference/Makefile.am
+++ b/doc/reference/Makefile.am
@@ -47,6 +47,7 @@ IGNORE_HFILES= \
vteskel.h \
vtetc.h \
vtetree.h \
+ vteunistr.h \
vtexft.h
diff --git a/src/vteunistr.c b/src/vteunistr.c
index 534d15b2..f6f7bc31 100644
--- a/src/vteunistr.c
+++ b/src/vteunistr.c
@@ -25,8 +25,57 @@
#include "vteunistr.h"
-#define VTE_UNISTR_START 0x80000000
+/* Overview:
+ *
+ * The way vteunistr is implemented is very simple: Unicode only defines
+ * codepoints less than 0x10FFFF. That leaves plenty of room in a guint32 to
+ * use for other things. So, whenever our "string" contains only one Unicode
+ * character, we use its code as our vteunistr. Otherwise, we register the
+ * string in a central registry and use assign a unique number to it. That
+ * number can be thought as "our own private non-unicode code for this
+ * sequence of characters".
+ *
+ * The rest of the problem would be how to efficiently implement this
+ * registry. It does *NOT* really have to be efficient at all, as it will
+ * only be access in case of combining marks. And the strings are pretty
+ * short most of the time. But our implementation is quite efficient and
+ * nifty anyway.
+ *
+ * The access pattern of using vteunistr's is that we have a vteunistr in a
+ * terminal cell, a new gunichar comes in and we decide to combine with it,
+ * and we combine them and get a new vteunistr. So, that is exactly how we
+ * encode vteunistr's: all we need to know about a vteunistr to be able to
+ * reconstruct its string is the vteunistr and the gunichar that joined to
+ * form it. That's what VteUnistrDecomp is. That is the decomposition.
+ *
+ * We start giving new vteunistr's unique numbers starting at
+ * VTE_UNISTR_START+1 and going up. We keep the decompositions in a GArray,
+ * called unistr_decomp. The first entry of the array is unused (that's why
+ * we start from VTE_UNISTR_START plus one). The decomposition table provides
+ * enough information to efficiently answer questions like "what's the first
+ * gunichar in this vteunistr?", "what's the sequence of gunichar's in this
+ * vteunistr?", and "how many gunichar's are there in this vteunistr?".
+ *
+ * We still do not have any efficient way to construct new vteunistr's though.
+ * Given a vteunistr and a gunichar, we have to walk over the entire
+ * decomposition table to see if we have already registered (encoded) this
+ * combination. To make that operation fast, we use a reverse map, that is,
+ * a GHashTable named unistr_comp. The hash table maps a decomposition to its
+ * encoded vteunistr value. The value obivously fits in a pointer and does
+ * not need memory allocation. We also want to avoid allocating memory for
+ * the keys of the hash table entries, as we already have those decompositions
+ * in the memory in the unistr_decomp array. We cannot use direct pointers
+ * though as when growing the GArray may resize and move to a new memory
+ * buffer, rendering all our pointers invalid. For this reason, we keep the
+ * index into the array as our hash table key. When we want to perform a
+ * lookup in the hash table, we insert the decomposition that we are searching
+ * for as item zero in the unistr_decomp table, then lookup for an equal entry
+ * of it in the hash table. Finally, if the hash lookup fails, we add the new
+ * decomposition to the lookup array and the hash table and return the newly
+ * encoded vteunistr value.
+ */
+#define VTE_UNISTR_START 0x80000000
static vteunistr unistr_next = VTE_UNISTR_START + 1;
@@ -38,26 +87,22 @@ struct VteUnistrDecomp {
GArray *unistr_decomp;
GHashTable *unistr_comp;
+#define DECOMP_FROM_INDEX(i) g_array_index (unistr_decomp, struct VteUnistrDecomp, (i))
+#define DECOMP_FROM_UNISTR(s) DECOMP_FROM_INDEX ((s) - VTE_UNISTR_START)
+
static guint
unistr_comp_hash (gconstpointer key)
{
struct VteUnistrDecomp *decomp;
- decomp = &g_array_index (unistr_decomp,
- struct VteUnistrDecomp,
- GPOINTER_TO_UINT (key));
+ decomp = &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (key));
return decomp->prefix ^ decomp->suffix;
}
static gboolean
-unistr_comp_equal (gconstpointer a,
- gconstpointer b)
+unistr_comp_equal (gconstpointer a, gconstpointer b)
{
- return 0 == memcmp (&g_array_index (unistr_decomp,
- struct VteUnistrDecomp,
- GPOINTER_TO_UINT (a)),
- &g_array_index (unistr_decomp,
- struct VteUnistrDecomp,
- GPOINTER_TO_UINT (b)),
+ return 0 == memcmp (&DECOMP_FROM_INDEX (GPOINTER_TO_UINT (a)),
+ &DECOMP_FROM_INDEX (GPOINTER_TO_UINT (b)),
sizeof (struct VteUnistrDecomp));
}
@@ -71,17 +116,12 @@ _vte_unistr_append_unichar (vteunistr s, gunichar c)
decomp.suffix = c;
if (G_UNLIKELY (!unistr_decomp)) {
- unistr_decomp = g_array_new (FALSE, TRUE,
- sizeof (struct VteUnistrDecomp));
+ unistr_decomp = g_array_new (FALSE, TRUE, sizeof (struct VteUnistrDecomp));
g_array_set_size (unistr_decomp, 1);
- unistr_comp = g_hash_table_new (unistr_comp_hash,
- unistr_comp_equal);
+ unistr_comp = g_hash_table_new (unistr_comp_hash, unistr_comp_equal);
} else {
- g_array_index (unistr_decomp,
- struct VteUnistrDecomp,
- 0) = decomp;
- ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp,
- GUINT_TO_POINTER (0)));
+ DECOMP_FROM_INDEX (0) = decomp;
+ ret = GPOINTER_TO_UINT (g_hash_table_lookup (unistr_comp, GUINT_TO_POINTER (0)));
}
if (G_UNLIKELY (!ret)) {
@@ -95,30 +135,12 @@ _vte_unistr_append_unichar (vteunistr s, gunichar c)
return ret;
}
-/* Unused
-int
-_vte_unistr_strlen (vteunistr s)
-{
- int len = 1;
- g_return_val_if_fail (s < unistr_next, len);
- while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
- s = g_array_index (unistr_decomp,
- struct VteUnistrDecomp,
- s - VTE_UNISTR_START).prefix;
- len++;
- }
- return len;
-}
-*/
-
gunichar
_vte_unistr_get_base (vteunistr s)
{
g_return_val_if_fail (s < unistr_next, s);
while (G_UNLIKELY (s >= VTE_UNISTR_START))
- s = g_array_index (unistr_decomp,
- struct VteUnistrDecomp,
- s - VTE_UNISTR_START).prefix;
+ s = DECOMP_FROM_UNISTR (s).prefix;
return (gunichar) s;
}
@@ -128,11 +150,23 @@ _vte_unistr_append_to_string (vteunistr s, GString *gs)
g_return_if_fail (s < unistr_next);
if (G_UNLIKELY (s >= VTE_UNISTR_START)) {
struct VteUnistrDecomp *decomp;
- decomp = &g_array_index (unistr_decomp,
- struct VteUnistrDecomp,
- s - VTE_UNISTR_START);
+ decomp = &DECOMP_FROM_UNISTR (s);
_vte_unistr_append_to_string (decomp->prefix, gs);
s = decomp->suffix;
}
g_string_append_unichar (gs, (gunichar) s);
}
+
+#if 0 /* unused */
+int
+_vte_unistr_strlen (vteunistr s)
+{
+ int len = 1;
+ g_return_val_if_fail (s < unistr_next, len);
+ while (G_UNLIKELY (s >= VTE_UNISTR_START)) {
+ s = DECOMP_FROM_UNISTR (s).prefix;
+ len++;
+ }
+ return len;
+}
+#endif
diff --git a/src/vteunistr.h b/src/vteunistr.h
index 730e3239..f8b7bf78 100644
--- a/src/vteunistr.h
+++ b/src/vteunistr.h
@@ -26,22 +26,64 @@
G_BEGIN_DECLS
+/**
+ * vteunistr:
+ *
+ * vteunistr is a gunichar-compatible way to store strings. A string
+ * consisting of a single unichar c is represented as the same value
+ * as c itself. In that sense, gunichars can be readily used as
+ * vteunistrs. Longer strings can be built by appending a unichar
+ * to an already existing string.
+ *
+ * vteunistr is essentially just a gunicode-compatible quark value.
+ * It can be used to store strings (of a base followed by combining
+ * characters) where the code was designed to only allow one character.
+ *
+ * Strings are internalized efficiently and never freed. No memory
+ * management of vteunistr values is needed.
+ **/
typedef guint32 vteunistr;
+/**
+ * _vte_unistr_append_unichar:
+ * @s: a #vteunistr
+ * @c: Unicode character to append to @s
+ *
+ * Creates a vteunistr value for the string @s followed by the
+ * character @c.
+ *
+ * Returns: the new #vteunistr value
+ **/
vteunistr
_vte_unistr_append_unichar (vteunistr s, gunichar c);
-/* Unused
-int
-_vte_unistr_strlen (vteunistr s);
-*/
-
gunichar
_vte_unistr_get_base (vteunistr s);
+/**
+ * _vte_unistr_append_to_string:
+ * @s: a #vteunistr
+ * @gs: a #GString to append @s to
+ *
+ * Appends @s to @gs. This is how one converts a #vteunistr to a
+ * traditional string.
+ **/
void
_vte_unistr_append_to_string (vteunistr s, GString *gs);
+#if 0 /* unused */
+/**
+ * _vte_unistr_strlen:
+ * @s: a #vteunistr
+ *
+ * Counts the number of character in @s.
+ *
+ * Returns: length of @s in characters.
+ **/
+int
+_vte_unistr_strlen (vteunistr s);
+#endif
+
G_END_DECLS
#endif