New function to convert a filename to a UTF-8 string for display purposes.

2004-11-02 Matthias Clasen <mclasen@redhat.com> * glib/gconvert.c (g_filename_display_name): New function to convert a filename to a UTF-8 string for display purposes. (g_get_filename_charsets): New function to return the encodings which are tried when converting a filename to UTF-8.
author: Matthias Clasen <mclasen@redhat.com> 2004-11-02 21:29:33 +0000
committer: Matthias Clasen <matthiasc@src.gnome.org> 2004-11-02 21:29:33 +0000
commit: 87ad7806a77156be2568ab768053ecb90ea0d66f (patch)
tree: 9b40ee264670b8e00be4662486405d18131dda7e
parent: 91ae46c37b2f7c576a558a19f58173e96974dff5 (diff)
download: glib-87ad7806a77156be2568ab768053ecb90ea0d66f.tar.gz
4 files changed, 184 insertions, 58 deletions
diff --git a/docs/reference/ChangeLog b/docs/reference/ChangeLog
index 4543059ee..16aa86c6e 100644
--- a/docs/reference/ChangeLog
+++ b/docs/reference/ChangeLog
@@ -1,3 +1,8 @@
+2004-11-02  Matthias Clasen  <mclasen@redhat.com>
+
+	* glib/glib-sections.txt: Add g_get_filename_charsets and
+	g_filename_display_name.
+
 2004-11-01  Matthias Clasen  <mclasen@redhat.com>
 
 	* glib/tmpl/option.sgml: Updates
diff --git a/docs/reference/glib/glib-sections.txt b/docs/reference/glib/glib-sections.txt
index 996ea5a8c..810535c7e 100644
--- a/docs/reference/glib/glib-sections.txt
+++ b/docs/reference/glib/glib-sections.txt
@@ -2079,6 +2079,8 @@ g_filename_to_utf8
 g_filename_from_utf8
 g_filename_from_uri
 g_filename_to_uri
+g_get_filename_charsets
+g_filename_display_name
 g_uri_list_extract_uris
 g_locale_from_utf8
 GConvertError
diff --git a/glib/gconvert.c b/glib/gconvert.c
index 4566f88d7..9752ae28c 100644
--- a/glib/gconvert.c
+++ b/glib/gconvert.c
@@ -998,7 +998,7 @@ typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
 struct _GFilenameCharsetCache {
   gboolean is_utf8;
   gchar *charset;
-  gchar *filename_charset;
+  gchar **filename_charsets;
 };
 
 static void
@@ -1006,41 +1006,47 @@ filename_charset_cache_free (gpointer data)
 {
   GFilenameCharsetCache *cache = data;
   g_free (cache->charset);
-  g_free (cache->filename_charset);
+  g_strfreev (cache->filename_charsets);
   g_free (cache);
 }
 
 /*
- * get_filename_charset:
- * @charset: return location for the name of the filename encoding 
+ * g_get_filename_charsets:
+ * @charsets: return location for the %NULL-terminated list of encoding names
  *
- * Determines the preferred character set used for filenames by
- * consulting the environment variables G_FILENAME_ENCODING and
- * G_BROKEN_FILENAMES.
+ * Determines the preferred character sets used for filenames.
+ * The first character set in @charsets is the filename encoding; the
+ * subsequent character sets are tried when generating a displayable
+ * representation of a filename, see g_filename_display_name().
  *
- * G_FILENAME_ENCODING may be set to a comma-separated list of character 
- * set names. The special token "@locale" is taken to mean the character set 
- * for the current locale. The first character set from the list is taken 
- * as the filename encoding. 
- * If G_FILENAME_ENCODING is not set, but G_BROKEN_FILENAMES is, the
- * character set of the current locale is taken as the filename encoding.
+ * The character sets are determined by consulting the environment variables 
+ * <envar>G_FILENAME_ENCODING</envar> and <envar>G_BROKEN_FILENAMES</envar>.
  *
- * The returned @charset belongs to GLib and must not be freed.
+ * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list 
+ * of character set names. The special token "@locale" is taken to mean the 
+ * character set for the current locale. If <envar>G_FILENAME_ENCODING</envar> 
+ * is not set, but <envar>G_BROKEN_FILENAMES</envar> is, the character set of 
+ * the current locale is taken as the filename encoding. If neither environment
+ * variable is set, UTF-8 is taken as the filename encoding, but the character
+ * set of the current locale is also put in the list of encodings.
+ *
+ * The returned @charsets belong to GLib and must not be freed.
  *
  * Note that on Unix, regardless of the locale character set or
- * G_FILENAME_ENCODING value, the actual file names present on a
+ * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present on a
  * system might be in any random encoding or just gibberish.
  *
- *  Return value: %TRUE
- * if the charset used for filename is UTF-8.
+ * Return value: %TRUE if the filename encoding is UTF-8.
+ * 
+ * Since: 2.6
  */
-static gboolean
-get_filename_charset (const gchar **filename_charset)
+gboolean
+g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
 {
   static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
   GFilenameCharsetCache *cache = g_static_private_get (&cache_private);
   const gchar *charset;
-  
+
   if (!cache)
     {
       cache = g_new0 (GFilenameCharsetCache, 1);
@@ -1052,77 +1058,95 @@ get_filename_charset (const gchar **filename_charset)
   if (!(cache->charset && strcmp (cache->charset, charset) == 0))
     {
       const gchar *new_charset;
-      gchar *p, *q;
+      gchar *p;
+      gint i;
 
       g_free (cache->charset);
-      g_free (cache->filename_charset);
+      g_strfreev (cache->filename_charsets);
       cache->charset = g_strdup (charset);
       
       p = getenv ("G_FILENAME_ENCODING");
       if (p != NULL) 
 	{
-	  q = strchr (p, ',');
-	  if (!q) 
-	    q = p + strlen (p);
+	  cache->filename_charsets = g_strsplit (p, ",", 0);
+	  cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
 
-	  if (strncmp ("@locale", p, q - p) == 0)
-	    {
-	      cache->is_utf8 = g_get_charset (&new_charset);
-	      cache->filename_charset = g_strdup (new_charset);
-	    }
-	  else
+	  for (i = 0; cache->filename_charsets[i]; i++)
 	    {
-	      cache->filename_charset = g_strndup (p, q - p);
-	      cache->is_utf8 = (strcmp (cache->filename_charset, "UTF-8") == 0);
+	      if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
+		{
+		  g_get_charset (&new_charset);
+		  g_free (cache->filename_charsets[i]);
+		  cache->filename_charsets[i] = g_strdup (new_charset);
+		}
 	    }
 	}
       else if (getenv ("G_BROKEN_FILENAMES") != NULL)
 	{
+	  cache->filename_charsets = g_new0 (gchar *, 2);
 	  cache->is_utf8 = g_get_charset (&new_charset);
-	  cache->filename_charset = g_strdup (new_charset);
+	  cache->filename_charsets[0] = g_strdup (new_charset);
 	}
       else 
 	{
-	  cache->filename_charset = g_strdup ("UTF-8");
+	  cache->filename_charsets = g_new0 (gchar *, 3);
 	  cache->is_utf8 = TRUE;
+	  cache->filename_charsets[0] = g_strdup ("UTF-8");
+	  if (!g_get_charset (&new_charset))
+	    cache->filename_charsets[1] = g_strdup (new_charset);
 	}
     }
 
-  if (filename_charset)
-    *filename_charset = cache->filename_charset;
+  if (filename_charsets)
+    *filename_charsets = (const gchar **)cache->filename_charsets;
 
   return cache->is_utf8;
 }
 
 #else /* G_PLATFORM_WIN32 */
 
-static gboolean
-get_filename_charset (const gchar **filename_charset) 
+gboolean
+g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) 
 {
+  static gchar *charsets[] = {
+    "UTF-8",
+    NULL
+  };
+
 #ifdef G_OS_WIN32
   /* On Windows GLib pretends that the filename charset is UTF-8 */
-  if (filename_charset)
-    *filename_charset = "UTF-8";
+  if (filename_charsets)
+    *filename_charsets = charsets;
+
   return TRUE;
 #else
+  gboolean result;
+
   /* Cygwin works like before */
-  g_get_charset (filename_charset);
-  return FALSE;
+  result = g_get_charset (&(charsets[0]));
+
+  if (filename_charsets)
+    *filename_charsets = charsets;
+
+  return result;
 #endif
 }
 
-#ifdef G_OS_WIN32
+#endif /* G_PLATFORM_WIN32 */
 
 static gboolean
-old_get_filename_charset (const gchar **filename_charset) 
+get_filename_charset (const gchar **filename_charset)
 {
-  g_get_charset (filename_charset);
-  return FALSE;
-}
-
-#endif
+  const gchar **charsets;
+  gboolean is_utf8;
+  
+  is_utf8 = g_get_filename_charsets (&charsets);
 
-#endif /* G_PLATFORM_WIN32 */
+  if (filename_charset)
+    *filename_charset = charsets[0];
+  
+  return is_utf8;
+}
 
 /* This is called from g_thread_init(). It's used to
  * initialize some static data in a threadsafe way.
@@ -1130,8 +1154,8 @@ old_get_filename_charset (const gchar **filename_charset)
 void 
 _g_convert_thread_init (void)
 {
-  const gchar *dummy;
-  (void) get_filename_charset (&dummy);
+  const gchar **dummy;
+  (void) g_get_filename_charsets (&dummy); /* result unused; call only sets up static data */
 }
 
 /**
@@ -1188,7 +1212,7 @@ g_filename_to_utf8 (const gchar *opsysstring,
 {
   const gchar *charset;
 
-  if (old_get_filename_charset (&charset))
+  if (g_get_charset (&charset))
     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
   else
     return g_convert (opsysstring, len, 
@@ -1250,7 +1274,7 @@ g_filename_from_utf8 (const gchar *utf8string,
 {
   const gchar *charset;
 
-  if (old_get_filename_charset (&charset))
+  if (g_get_charset (&charset))
     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
   else
     return g_convert (utf8string, len,
@@ -1684,9 +1708,9 @@ g_filename_from_uri (const gchar *uri,
  *               URI, or %NULL on an error.
  **/
 gchar *
-g_filename_to_uri   (const gchar *filename,
-		     const gchar *hostname,
-		     GError     **error)
+g_filename_to_uri (const gchar *filename,
+		   const gchar *hostname,
+		   GError     **error)
 {
   char *escaped_uri;
 
@@ -1792,3 +1816,96 @@ g_uri_list_extract_uris (const gchar *uri_list)
 
   return result;
 }
+
+static gchar *
+make_valid_utf8 (const gchar *name)
+{
+  GString *string;
+  const gchar *remainder, *invalid;
+  gint remaining_bytes, valid_bytes;
+  
+  string = NULL;
+  remainder = name;
+  remaining_bytes = strlen (name);
+  
+  while (remaining_bytes != 0) 
+    {
+      if (g_utf8_validate (remainder, remaining_bytes, &invalid)) 
+	break;
+      valid_bytes = invalid - remainder;
+    
+      if (string == NULL) 
+	string = g_string_sized_new (remaining_bytes);
+
+      g_string_append_len (string, remainder, valid_bytes);
+      g_string_append_c (string, '?');
+      
+      remaining_bytes -= valid_bytes + 1;
+      remainder = invalid + 1;
+    }
+  
+  if (string == NULL)
+    return g_strdup (name);
+  
+  g_string_append (string, remainder);
+  g_string_append (string, " (invalid encoding)");
+
+  g_assert (g_utf8_validate (string->str, -1, NULL));
+  
+  return g_string_free (string, FALSE);
+}
+
+/**
+ * g_filename_display_name:
+ * @filename: a pathname in the GLib filename encoding
+ * 
+ * Converts a filename into a valid UTF-8 string. The 
+ * conversion is not necessarily reversible, so you 
+ * should keep the original around and use the return
+ * value of this function only for display purposes.
+ *
+ * Return value: a newly allocated string containing
+ *   a rendition of the filename in valid UTF-8
+ *
+ * Since: 2.6
+ **/
+gchar *
+g_filename_display_name (const gchar *filename)
+{
+  gint i;
+  const gchar **charsets;
+  gchar *display_name = NULL;
+  gboolean is_utf8;
+ 
+  is_utf8 = g_get_filename_charsets (&charsets);
+
+  if (is_utf8)
+    {
+      if (g_utf8_validate (filename, -1, NULL))
+	display_name = g_strdup (filename);
+    }
+  
+  if (!display_name)
+    {
+      /* Try to convert from the filename charsets to UTF-8.
+       * Skip the first charset if it is UTF-8.
+       */
+      for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
+	{
+	  display_name = g_convert (filename, -1, "UTF-8", charsets[i], 
+				    NULL, NULL, NULL);
+
+	  if (display_name)
+	    break;
+	}
+    }
+  
+  /* If all conversions failed, replace each invalid byte with a
+   * question mark and mark the result as having an invalid encoding.
+   */
+  if (!display_name) 
+    display_name = make_valid_utf8 (filename);
+
+  return display_name;
+}
+
diff --git a/glib/gconvert.h b/glib/gconvert.h
index f666e28bd..cc4f48bfd 100644
--- a/glib/gconvert.h
+++ b/glib/gconvert.h
@@ -121,6 +121,8 @@ gchar *g_filename_from_uri (const gchar *uri,
 gchar *g_filename_to_uri   (const gchar *filename,
 			    const gchar *hostname,
 			    GError     **error);
+gchar *g_filename_display_name (const gchar *filename);
+gboolean g_get_filename_charsets (G_CONST_RETURN gchar ***charsets);
 
 gchar **g_uri_list_extract_uris (const gchar *uri_list);
author	Matthias Clasen <mclasen@redhat.com>	2004-11-02 21:29:33 +0000
committer	Matthias Clasen <matthiasc@src.gnome.org>	2004-11-02 21:29:33 +0000
commit	87ad7806a77156be2568ab768053ecb90ea0d66f (patch)
tree	9b40ee264670b8e00be4662486405d18131dda7e
parent	91ae46c37b2f7c576a558a19f58173e96974dff5 (diff)
download	glib-87ad7806a77156be2568ab768053ecb90ea0d66f.tar.gz