summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPhilip Withnall <philip@tecnocode.co.uk>2020-11-09 11:43:42 +0000
committerPhilip Withnall <philip@tecnocode.co.uk>2020-11-09 11:43:42 +0000
commitb8927cc6ad866570b100aa71e5aeabd349f439f3 (patch)
tree32cddf7c7d8e02dd0b4659e26d1d5c9d01c73e6e
parentc1a11c02e5ef458f328c1ed938d6077a4fd7deb3 (diff)
parent9da213ea34b95d0c4387f593bf277a4a6b9a68d1 (diff)
downloadglib-b8927cc6ad866570b100aa71e5aeabd349f439f3.tar.gz
Merge branch 'wip/tingping/guri-normalize' into 'master'
guri: Normalize uri segments if they are encoded and add a flag to do scheme-based normalization See merge request GNOME/glib!1716
-rw-r--r--glib/guri.c132
-rw-r--r--glib/guri.h5
-rw-r--r--glib/tests/uri.c74
3 files changed, 188 insertions, 23 deletions
diff --git a/glib/guri.c b/glib/guri.c
index 2f42a496d..ceed7133d 100644
--- a/glib/guri.c
+++ b/glib/guri.c
@@ -132,11 +132,12 @@
*
* Note that there is no `g_uri_equal ()` function, because comparing
* URIs usefully requires scheme-specific knowledge that #GUri does
- * not have. For example, `http://example.com/` and
- * `http://EXAMPLE.COM:80` have exactly the same meaning according
- * to the HTTP specification, and `data:,foo` and
- * `data:;base64,Zm9v` resolve to the same thing according to the
- * `data:` URI specification.
+ * not have. #GUri can help with normalization if you use the various
+ * encoded #GUriFlags as well as %G_URI_FLAGS_SCHEME_NORMALIZE; however,
+ * it is not comprehensive.
+ * For example, `data:,foo` and `data:;base64,Zm9v` resolve to the same
+ * thing according to the `data:` URI specification, which GLib does not
+ * handle.
*
* Since: 2.66
*/
@@ -289,15 +290,16 @@ uri_decoder (gchar **out,
GUriError parse_error,
GError **error)
{
- gchar *decoded, *d, c;
+ gchar c;
+ GString *decoded;
const gchar *invalid, *s, *end;
gssize len;
if (!(flags & G_URI_FLAGS_ENCODED))
just_normalize = FALSE;
- decoded = g_malloc (length + 1);
- for (s = start, end = s + length, d = decoded; s < end; s++)
+ decoded = g_string_sized_new (length + 1);
+ for (s = start, end = s + length; s < end; s++)
{
if (*s == '%')
{
@@ -311,7 +313,7 @@ uri_decoder (gchar **out,
g_set_error_literal (error, G_URI_ERROR, parse_error,
/* xgettext: no-c-format */
_("Invalid %-encoding in URI"));
- g_free (decoded);
+ g_string_free (decoded, TRUE);
return -1;
}
@@ -319,7 +321,7 @@ uri_decoder (gchar **out,
* fix it to "%25", since that might change the way that
* the URI's owner would interpret it.
*/
- *d++ = *s;
+ g_string_append_c (decoded, *s);
continue;
}
@@ -328,43 +330,49 @@ uri_decoder (gchar **out,
{
g_set_error_literal (error, G_URI_ERROR, parse_error,
_("Illegal character in URI"));
- g_free (decoded);
+ g_string_free (decoded, TRUE);
return -1;
}
if (just_normalize && !g_uri_char_is_unreserved (c))
{
- /* Leave the % sequence there. */
- *d++ = *s;
+ /* Leave the % sequence there but normalize it. */
+ g_string_append_c (decoded, *s);
+ g_string_append_c (decoded, g_ascii_toupper (s[1]));
+ g_string_append_c (decoded, g_ascii_toupper (s[2]));
+ s += 2;
}
else
{
- *d++ = c;
+ g_string_append_c (decoded, c);
s += 2;
}
}
else if (www_form && *s == '+')
- *d++ = ' ';
+ g_string_append_c (decoded, ' ');
+ /* Normalize any illegal characters. */
+ else if (just_normalize && (!g_ascii_isgraph (*s)))
+ g_string_append_printf (decoded, "%%%02X", (guchar)*s);
else
- *d++ = *s;
+ g_string_append_c (decoded, *s);
}
- *d = '\0';
- len = d - decoded;
+ len = decoded->len;
g_assert (len >= 0);
if (!(flags & G_URI_FLAGS_ENCODED) &&
- !g_utf8_validate (decoded, len, &invalid))
+ !g_utf8_validate (decoded->str, len, &invalid))
{
g_set_error_literal (error, G_URI_ERROR, parse_error,
_("Non-UTF-8 characters in URI"));
- g_free (decoded);
+ g_string_free (decoded, TRUE);
return -1;
}
if (out)
- *out = g_steal_pointer (&decoded);
+ *out = g_string_free (decoded, FALSE);
+ else
+ g_string_free (decoded, TRUE);
- g_free (decoded);
return len;
}
@@ -741,6 +749,52 @@ uri_cleanup (const gchar *uri_string)
}
+/*
+ * should_normalize_empty_path:
+ * @scheme: URI scheme to look up; compared case-sensitively with strcmp(),
+ *   so it must not be %NULL
+ *
+ * Checks @scheme against the fixed list "https", "http", "wss" and "ws".
+ * Callers in this file use the result to decide whether an empty path is
+ * replaced with "/".
+ *
+ * Returns: %TRUE if @scheme is one of the listed schemes, %FALSE otherwise
+ */
static gboolean
+should_normalize_empty_path (const char *scheme)
+{
+  const char * const schemes[] = { "https", "http", "wss", "ws" };
+  gsize i;
+
+  /* G_N_ELEMENTS() is a sizeof-based (unsigned) count, so the index is a
+   * gsize to avoid a signed/unsigned comparison. */
+  for (i = 0; i < G_N_ELEMENTS (schemes); ++i)
+    {
+      if (!strcmp (schemes[i], scheme))
+        return TRUE;
+    }
+
+  return FALSE;
+}
+
+/*
+ * normalize_port:
+ * @scheme: URI scheme, compared case-sensitively against the table below
+ * @port: port number to normalize
+ *
+ * Looks up the (@port, @scheme) pair in a small table of known pairs:
+ * 21/ftp, 80/http, 80/ws, 443/https and 443/wss.
+ *
+ * Returns: -1 if the pair is in the table, otherwise @port unchanged
+ */
+static int
+normalize_port (const char *scheme,
+                int port)
+{
+  /* Terminated by an entry whose scheme is NULL. */
+  static const struct
+  {
+    int port;
+    const char *scheme;
+  } known_defaults[] = {
+    { 21, "ftp" },
+    { 80, "http" },
+    { 80, "ws" },
+    { 443, "https" },
+    { 443, "wss" },
+    { 0, NULL },
+  }, *entry;
+
+  for (entry = known_defaults; entry->scheme != NULL; entry++)
+    {
+      /* Only compare scheme names for entries that match the port, so
+       * @scheme is not read at all for ports outside the table. */
+      if (entry->port != port)
+        continue;
+
+      if (strcmp (scheme, entry->scheme) == 0)
+        return -1;
+    }
+
+  return port;
+}
+
+static gboolean
g_uri_split_internal (const gchar *uri_string,
GUriFlags flags,
gchar **scheme,
@@ -758,6 +812,7 @@ g_uri_split_internal (const gchar *uri_string,
const gchar *end, *colon, *at, *path_start, *semi, *question;
const gchar *p, *bracket, *hostend;
gchar *cleaned_uri_string = NULL;
+ gchar *normalized_scheme = NULL;
if (scheme)
*scheme = NULL;
@@ -795,8 +850,9 @@ g_uri_split_internal (const gchar *uri_string,
if (p > uri_string && *p == ':')
{
+ normalized_scheme = g_ascii_strdown (uri_string, p - uri_string);
if (scheme)
- *scheme = g_ascii_strdown (uri_string, p - uri_string);
+ *scheme = g_steal_pointer (&normalized_scheme);
p++;
}
else
@@ -922,6 +978,22 @@ g_uri_split_internal (const gchar *uri_string,
G_URI_ERROR_BAD_PATH, error))
goto fail;
+ /* Scheme-based normalization */
+ if (flags & G_URI_FLAGS_SCHEME_NORMALIZE && ((scheme && *scheme) || normalized_scheme))
+ {
+ const char *scheme_str = scheme && *scheme ? *scheme : normalized_scheme;
+
+ if (should_normalize_empty_path (scheme_str) && path && !**path)
+ {
+ g_free (*path);
+ *path = g_strdup ("/");
+ }
+
+ if (port && *port != -1)
+ *port = normalize_port (scheme_str, *port);
+ }
+
+ g_free (normalized_scheme);
g_free (cleaned_uri_string);
return TRUE;
@@ -941,6 +1013,7 @@ g_uri_split_internal (const gchar *uri_string,
if (fragment)
g_clear_pointer (fragment, g_free);
+ g_free (normalized_scheme);
g_free (cleaned_uri_string);
return FALSE;
}
@@ -1394,6 +1467,19 @@ g_uri_parse_relative (GUri *base_uri,
uri->port = base_uri->port;
}
}
+
+ /* Scheme normalization couldn't have been done earlier
+ * as the relative URI may not have had a scheme */
+ if (flags & G_URI_FLAGS_SCHEME_NORMALIZE)
+ {
+ if (should_normalize_empty_path (uri->scheme) && !*uri->path)
+ {
+ g_free (uri->path);
+ uri->path = g_strdup ("/");
+ }
+
+ uri->port = normalize_port (uri->scheme, uri->port);
+ }
}
return g_steal_pointer (&uri);
diff --git a/glib/guri.h b/glib/guri.h
index 3a7bb5c0e..fecbfed8e 100644
--- a/glib/guri.h
+++ b/glib/guri.h
@@ -62,6 +62,10 @@ void g_uri_unref (GUri *uri);
* @G_URI_FLAGS_ENCODED_PATH: Same as %G_URI_FLAGS_ENCODED, for the path only.
* @G_URI_FLAGS_ENCODED_FRAGMENT: Same as %G_URI_FLAGS_ENCODED, for the
* fragment only.
+ * @G_URI_FLAGS_SCHEME_NORMALIZE: Applies scheme-based normalization to the
+ * parsed URI. For example, when parsing an HTTP URI, this changes an empty
+ * path to `/` and port `80` to `-1`. This only supports a subset
+ * of known schemes. (Since: 2.68)
*
* Flags that describe a URI.
*
@@ -83,6 +87,7 @@ typedef enum {
G_URI_FLAGS_ENCODED_QUERY = 1 << 5,
G_URI_FLAGS_ENCODED_PATH = 1 << 6,
G_URI_FLAGS_ENCODED_FRAGMENT = 1 << 7,
+ G_URI_FLAGS_SCHEME_NORMALIZE = 1 << 8,
} GUriFlags;
GLIB_AVAILABLE_IN_2_66
diff --git a/glib/tests/uri.c b/glib/tests/uri.c
index b8a0c6a47..77aa95604 100644
--- a/glib/tests/uri.c
+++ b/glib/tests/uri.c
@@ -1708,6 +1708,79 @@ test_uri_join_split_round_trip (void)
}
}
+/* Test vectors for test_uri_normalize(): each entry is parsed with
+ * g_uri_parse_relative() (against @base when one is given) and the
+ * resulting path and port are compared with the expected outputs. */
+static const struct
+{
+ /* Inputs */
+ /* Base URI string, or NULL to parse @uri without a base. */
+ const gchar *base;
+ /* URI reference to parse. */
+ const gchar *uri;
+ /* Flags used when parsing both @base and @uri. */
+ GUriFlags flags;
+ /* Outputs */
+ /* Expected result of g_uri_get_path(). */
+ const gchar *path;
+ /* Expected result of g_uri_get_port(). */
+ int port;
+} normalize_tests[] =
+ {
+ /* Percent-encoding normalization with the encoded flags. */
+ { NULL, "http://foo/path with spaces", G_URI_FLAGS_ENCODED,
+ "/path%20with%20spaces", -1 },
+ { NULL, "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH,
+ "/path%20with%20spaces%202", -1 },
+ { NULL, "http://foo/%aa", G_URI_FLAGS_ENCODED,
+ "/%AA", -1 },
+ { NULL, "http://foo/p\xc3\xa4th/", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED,
+ "/p%C3%A4th/", -1 },
+ /* Scheme-based normalization of absolute URIs. */
+ { NULL, "http://foo", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "/", -1 },
+ { NULL, "nothttp://foo", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "", -1 },
+ { NULL, "http://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "/", -1 },
+ { NULL, "https://foo:443", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "/", -1 },
+ { NULL, "ftp://foo:21", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "", -1 },
+ { NULL, "nothttp://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "", 80 },
+ /* Scheme-based normalization where the scheme comes from the base URI. */
+ { "http://foo", "//bar", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "/", -1 },
+ { "http://foo", "//bar:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "/", -1 },
+ { "nothttp://foo", "//bar:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+ "", 80 },
+ /* Without the flag, the empty path is left as-is. */
+ { "http://foo", "//bar", 0,
+ "", -1 },
+ };
+
+/* Runs every entry of normalize_tests through g_uri_parse_relative() and
+ * checks the resulting path and port, then checks one g_uri_split() call
+ * that requests only the port. */
+static void
+test_uri_normalize (void)
+{
+  gsize i;
+  int port;
+  GError *error = NULL;
+
+  for (i = 0; i < G_N_ELEMENTS (normalize_tests); ++i)
+    {
+      GUri *uri, *base = NULL;
+
+      if (normalize_tests[i].base)
+        {
+          /* Fail here, with the parser's own error, rather than later with
+           * a misleading failure caused by a NULL base. */
+          base = g_uri_parse (normalize_tests[i].base, normalize_tests[i].flags, &error);
+          g_assert_no_error (error);
+          g_assert_nonnull (base);
+        }
+
+      uri = g_uri_parse_relative (base,
+                                  normalize_tests[i].uri,
+                                  normalize_tests[i].flags,
+                                  &error);
+      g_assert_no_error (error);
+      g_assert_nonnull (uri);
+      g_assert_cmpstr (g_uri_get_path (uri), ==, normalize_tests[i].path);
+      g_assert_cmpint (g_uri_get_port (uri), ==, normalize_tests[i].port);
+
+      g_uri_unref (uri);
+      if (base)
+        g_uri_unref (base);
+    }
+
+  /* One-off test of the code path where the caller passes NULL for the
+   * scheme output, yet the scheme is still needed to normalize the port. */
+  g_assert_true (g_uri_split ("HTTP://foo:80", G_URI_FLAGS_SCHEME_NORMALIZE,
+                              NULL, NULL, NULL, &port, NULL, NULL, NULL, &error));
+  g_assert_no_error (error);
+  g_assert_cmpint (port, ==, -1);
+}
+
int
main (int argc,
char *argv[])
@@ -1733,6 +1806,7 @@ main (int argc,
g_test_add_func ("/uri/to-string", test_uri_to_string);
g_test_add_func ("/uri/join", test_uri_join);
g_test_add_func ("/uri/join-split-round-trip", test_uri_join_split_round_trip);
+ g_test_add_func ("/uri/normalize", test_uri_normalize);
g_test_add_data_func ("/uri/iter-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_iter_params);
g_test_add_data_func ("/uri/iter-params/length", GINT_TO_POINTER (FALSE), test_uri_iter_params);
g_test_add_data_func ("/uri/parse-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_parse_params);