From 7b14d465c87c07d9a9dc6319ee0bfc134e3e975b Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Sun, 8 Dec 2013 19:10:04 +0100 Subject: sniffing: Document the specification we are targeting --- libsoup/soup-content-sniffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index fb2aa090..d2a0808f 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -3,6 +3,10 @@ * soup-content-sniffer.c * * Copyright (C) 2009 Gustavo Noronha Silva. + * + * This code implements the following specification: + * + * http://mimesniff.spec.whatwg.org/ as of 11 June 2013 */ #ifdef HAVE_CONFIG_H -- cgit v1.2.1 From 6510806d97713450625bbd648d3ce6cd953a4df9 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Sun, 8 Dec 2013 19:11:21 +0100 Subject: sniffing: Implement the check-apache-bug flag Run the text or binary algorithm when some specific text/plain Content-Types are provided, since older versions of apache would send that type for binary files. http://mimesniff.spec.whatwg.org/#dfnReturnLink-0 --- libsoup/soup-content-sniffer.c | 37 ++++++++++++++++++++++++++++--------- tests/resources/text.txt | 1 + tests/sniffing-test.c | 11 ++++++++++- tests/soup-tests.gresource.xml | 1 + 4 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 tests/resources/text.txt diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index d2a0808f..e16658ba 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -2,7 +2,7 @@ /* * soup-content-sniffer.c * - * Copyright (C) 2009 Gustavo Noronha Silva. + * Copyright (C) 2009, 2013 Gustavo Noronha Silva. * * This code implements the following specification: * @@ -297,7 +297,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, return g_strdup ("text/plain"); } -/* HTML5: 2.7.3 Content-Type sniffing: text or binary */ +/* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */ static char* sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) { @@ -306,15 +306,20 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) gboolean looks_binary = FALSE; int i; - /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */ - if (resource_length >= 4) { + /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */ + if (resource_length >= 2) { if ((resource[0] == 0xFE && resource[1] == 0xFF) || - (resource[0] == 0xFF && resource[1] == 0xFE) || - (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)) + (resource[0] == 0xFF && resource[1] == 0xFE)) return g_strdup ("text/plain"); } - /* Look to see if any of the first n bytes looks binary */ + /* 3. UTF-8 BOM. */ + if (resource_length >= 3) { + if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF) + return g_strdup ("text/plain"); + } + + /* 4. Look to see if any of the first n bytes looks binary */ for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) { looks_binary = TRUE; @@ -325,6 +330,9 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) if (!looks_binary) return g_strdup ("text/plain"); + /* 5. Execute 7.1 Identifying a resource with an unknown MIME type. + * TODO: sniff-scriptable needs to be unset. + */ return sniff_unknown (sniffer, buffer, TRUE); } @@ -472,14 +480,25 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, content_type = soup_message_headers_get_content_type (msg->response_headers, params); - /* These comparisons are done in an ASCII-case-insensitive - * manner because the spec requires it */ + /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */ + + /* 1. Unknown/undefined supplied type respecting sniff-scritable. */ if ((content_type == NULL) || !g_ascii_strcasecmp (content_type, "unknown/unknown") || !g_ascii_strcasecmp (content_type, "application/unknown") || !g_ascii_strcasecmp (content_type, "*/*")) return sniff_unknown (sniffer, buffer, FALSE); + /* TODO: 2. no-sniff flag handling. */ + + /* 3. check-for-apache-bug */ + if ((content_type != NULL) && + (g_str_equal (content_type, "text/plain") || + g_str_equal (content_type, "text/plain; charset=ISO-8859-1") || + g_str_equal (content_type, "text/plain; charset=iso-8859-1") || + g_str_equal (content_type, "text/plain; charset=UTF-8"))) + return sniff_text_or_binary (sniffer, buffer); + if (g_str_has_suffix (content_type, "+xml") || !g_ascii_strcasecmp (content_type, "text/xml") || !g_ascii_strcasecmp (content_type, "application/xml")) diff --git a/tests/resources/text.txt b/tests/resources/text.txt new file mode 100644 index 00000000..ff7066f6 --- /dev/null +++ b/tests/resources/text.txt @@ -0,0 +1 @@ +This is just text. diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index 532e6ed5..b668f403 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -49,7 +49,7 @@ server_callback (SoupServer *server, SoupMessage *msg, "Content-Type", "text/plain"); } - if (g_str_has_prefix (path, "/text_or_binary/")) { + if (g_str_has_prefix (path, "/text_or_binary/") || g_str_has_prefix (path, "/apache_bug/")) { char *base_name = g_path_get_basename (path); response = soup_test_load_resource (base_name, &error); @@ -442,6 +442,15 @@ main (int argc, char **argv) GINT_TO_POINTER (TRUE), do_signals_tests); + /* Test the apache bug sniffing path */ + g_test_add_data_func ("/sniffing/apache-bug/binary", + "/apache_bug/text_binary.txt => application/octet-stream", + do_sniffing_test); + g_test_add_data_func ("/sniffing/apache-bug/text", + "/apache_bug/text.txt => text/plain", + do_sniffing_test); + + /* GIF is a 'safe' type */ g_test_add_data_func ("/sniffing/type/gif", "text_or_binary/home.gif => image/gif", do_sniffing_test); diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index 9b580a35..2fe21ddc 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -12,6 +12,7 @@ resources/ps_binary.ps resources/rss20.xml resources/test.html + resources/text.txt resources/text_binary.txt -- cgit v1.2.1 From b766f11049d98f54980f64a6261914610e4e5116 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Sun, 8 Dec 2013 20:04:48 +0100 Subject: sniffing: Implement handling of the X-Content-Type-Options header --- libsoup/soup-content-sniffer.c | 6 +++++- tests/sniffing-test.c | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index e16658ba..6dec3e30 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -477,6 +477,7 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params) { const char *content_type; + const char *x_content_type_options; content_type = soup_message_headers_get_content_type (msg->response_headers, params); @@ -489,7 +490,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, !g_ascii_strcasecmp (content_type, "*/*")) return sniff_unknown (sniffer, buffer, FALSE); - /* TODO: 2. no-sniff flag handling. */ + /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */ + x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options"); + if (!g_strcmp0 (x_content_type_options, "nosniff")) + return g_strdup (content_type); /* 3. check-for-apache-bug */ if ((content_type != NULL) && diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index b668f403..a8bc3ffc 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -49,6 +49,20 @@ server_callback (SoupServer *server, SoupMessage *msg, "Content-Type", "text/plain"); } + if (g_str_has_prefix (path, "/nosniff/")) { + char *base_name = g_path_get_basename (path); + + response = soup_test_load_resource (base_name, &error); + g_assert_no_error (error); + g_free (base_name); + + soup_message_headers_append (msg->response_headers, + "X-Content-Type-Options", "nosniff"); + + soup_message_headers_append (msg->response_headers, + "Content-Type", "no/sniffing-allowed"); + } + if (g_str_has_prefix (path, "/text_or_binary/") || g_str_has_prefix (path, "/apache_bug/")) { char *base_name = g_path_get_basename (path); @@ -450,6 +464,11 @@ main (int argc, char **argv) "/apache_bug/text.txt => text/plain", do_sniffing_test); + /* X-Content-Type-Options: nosniff */ + g_test_add_data_func ("/sniffing/nosniff", + "nosniff/home.gif => no/sniffing-allowed", + do_sniffing_test); + /* GIF is a 'safe' type */ g_test_add_data_func ("/sniffing/type/gif", "text_or_binary/home.gif => image/gif", -- cgit v1.2.1 From fa2b82c870277a4b6717c852ea0185442af93014 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Mon, 9 Dec 2013 11:54:29 +0100 Subject: sniffing: Adjust the general unknown MIME type algorithm This change adjusts the pattern matching table to the current form of the MIMESNIFF spec, adding a check for a tag-terminating byte and using the formally defined sniff-scriptable flag. --- libsoup/soup-content-sniffer.c | 240 ++++++++++++++++++++++++++++++--------- tests/resources/html_binary.html | 2 +- 2 files changed, 187 insertions(+), 55 deletions(-) diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 6dec3e30..39210af6 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -77,12 +77,16 @@ soup_content_sniffer_init (SoupContentSniffer *content_sniffer) { } -/* This table is based on the HTML5 spec; - * See 2.7.4 Content-Type sniffing: unknown type +/* This table is based on the MIMESNIFF spec; + * See 7.1 Identifying a resource with an unknown MIME type */ typedef struct { /* @has_ws is TRUE if @pattern contains "generic" whitespace */ gboolean has_ws; + /* @has_tag_termination is TRUE if we should check for a tag-terminating + * byte (0x20 " " or 0x3E ">") after the pattern match. + */ + gboolean has_tag_termination; const guchar *mask; const guchar *pattern; guint pattern_length; @@ -90,111 +94,234 @@ typedef struct { gboolean scriptable; } SoupContentSnifferPattern; + +/* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space + * is allowed. Those spaces are marked with \x00 on the mask. + */ static SoupContentSnifferPattern types_table[] = { - { FALSE, - (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", - (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C", + /* Scriptable types. */ + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", + (const guchar *)" data; int resource_length = MIN (512, buffer->length); @@ -232,9 +359,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, for (i = 0; i < G_N_ELEMENTS (types_table); i++) { SoupContentSnifferPattern *type_row = &(types_table[i]); - /* The scriptable types should be skiped for the text - * or binary path, but considered for other paths */ - if (for_text_or_binary && type_row->scriptable) + if (!sniff_scriptable && type_row->scriptable) continue; if (type_row->has_ws) { @@ -267,8 +392,14 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, if (skip_row) continue; - if (index_pattern > type_row->pattern_length) + if (index_pattern > type_row->pattern_length) { + if (type_row->has_tag_termination && + resource[index_stream] != '\x20' && + resource[index_stream] != '\x3E') + continue; + return g_strdup (type_row->sniffed_type); + } } else { int j; @@ -286,9 +417,6 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, } } - if (for_text_or_binary) - return g_strdup ("application/octet-stream"); - for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) return g_strdup ("application/octet-stream"); @@ -478,21 +606,25 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, { const char *content_type; const char *x_content_type_options; + gboolean no_sniff = FALSE; content_type = soup_message_headers_get_content_type (msg->response_headers, params); /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */ - /* 1. Unknown/undefined supplied type respecting sniff-scritable. */ + x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options"); + if (!g_strcmp0 (x_content_type_options, "nosniff")) + no_sniff = TRUE; + + /* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */ if ((content_type == NULL) || !g_ascii_strcasecmp (content_type, "unknown/unknown") || !g_ascii_strcasecmp (content_type, "application/unknown") || !g_ascii_strcasecmp (content_type, "*/*")) - return sniff_unknown (sniffer, buffer, FALSE); + return sniff_unknown (sniffer, buffer, !no_sniff); /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */ - x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options"); - if (!g_strcmp0 (x_content_type_options, "nosniff")) + if (no_sniff) return g_strdup (content_type); /* 3. check-for-apache-bug */ diff --git a/tests/resources/html_binary.html b/tests/resources/html_binary.html index 9200dd42..d443048c 100644 --- a/tests/resources/html_binary.html +++ b/tests/resources/html_binary.html @@ -1 +1 @@ - Date: Mon, 25 Nov 2013 01:22:19 +0200 Subject: sniffing: test that it allows leading whitespace in doctype headers https://bugzilla.gnome.org/show_bug.cgi?id=715126 --- tests/resources/leading_space.html | 12 ++++++++++++ tests/sniffing-test.c | 3 +++ tests/soup-tests.gresource.xml | 1 + 3 files changed, 16 insertions(+) create mode 100644 tests/resources/leading_space.html diff --git a/tests/resources/leading_space.html b/tests/resources/leading_space.html new file mode 100644 index 00000000..a640d653 --- /dev/null +++ b/tests/resources/leading_space.html @@ -0,0 +1,12 @@ + + + + + + + + + +

GNOME!

+ + diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index a8bc3ffc..2dc9fb2c 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -524,6 +524,9 @@ main (int argc, char **argv) g_test_add_data_func ("/sniffing/type/unknown-binary", "unknown/text_binary.txt => application/octet-stream", do_sniffing_test); + g_test_add_data_func ("/sniffing/type/unknown-leading-space", + "unknown/leading_space.html => text/html", + do_sniffing_test); /* Test the XML sniffing path */ g_test_add_data_func ("/sniffing/type/xml", diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index 2fe21ddc..d24a04b0 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -5,6 +5,7 @@ resources/atom.xml resources/home.gif resources/html_binary.html + resources/leading_space.html resources/mbox resources/mbox.gz resources/mbox.raw -- cgit v1.2.1 From f5498190f9717d1366b4d57a7860899f87f0fc86 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Mon, 9 Dec 2013 15:04:16 +0100 Subject: sniffing: Add comment to the XML types bit --- libsoup/soup-content-sniffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 39210af6..154df841 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -635,6 +635,7 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, g_str_equal (content_type, "text/plain; charset=UTF-8"))) return sniff_text_or_binary (sniffer, buffer); + /* 4. XML types sent by the server are always used. */ if (g_str_has_suffix (content_type, "+xml") || !g_ascii_strcasecmp (content_type, "text/xml") || !g_ascii_strcasecmp (content_type, "application/xml")) -- cgit v1.2.1 From cd4f6a94f9275670091326a5aec8a07bce7f8d79 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Mon, 9 Dec 2013 16:20:02 +0100 Subject: sniffing: Bring feed vs HTML up-to-date with the MIMESNIFF spec * decide on that before doing the image sniffing to match the spec * use const char* and g_str_has_prefix for comparisons to make it more legible * deal with rdf:RDF tags --- libsoup/soup-content-sniffer.c | 117 ++++++++++++++++++++++++++++------------- tests/resources/feed.rdf | 32 +++++++++++ tests/sniffing-test.c | 13 +++-- tests/soup-tests.gresource.xml | 1 + 4 files changed, 120 insertions(+), 43 deletions(-) create mode 100644 tests/resources/feed.rdf diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 154df841..5b768bb2 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -491,10 +491,26 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer, return g_strdup (content_type); } +static gboolean +skip_insignificant_space (const char *resource, int *pos, int resource_length) +{ + while ((resource[*pos] == '\x09') || + (resource[*pos] == '\x20') || + (resource[*pos] == '\x0A') || + (resource[*pos] == '\x0D')) { + *pos = *pos + 1; + + if (*pos > resource_length) + return TRUE; + } + + return FALSE; +} + static char* sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) { - const guchar *resource = (const guchar *)buffer->data; + const char *resource = (const char *)buffer->data; int resource_length = MIN (512, buffer->length); int pos = 0; @@ -509,19 +525,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if (pos > resource_length) goto text_html; - /* Skip insignificant white space */ - while ((resource[pos] == '\x09') || - (resource[pos] == '\x20') || - (resource[pos] == '\x0A') || - (resource[pos] == '\x0D')) { - pos++; - - if (pos > resource_length) - goto text_html; - } + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; - /* != < */ - if (resource[pos] != '\x3C') + if (resource[pos] != '<') return g_strdup ("text/html"); pos++; @@ -529,73 +536,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if ((pos + 2) > resource_length) goto text_html; - /* Skipping comments */ - if ((resource[pos] == '\x2D') || - (resource[pos+1] == '\x2D') || - (resource[pos+2] == '\x3E')) { + /* Skip comments. */ + if (g_str_has_prefix (resource + pos, "!--")) { pos = pos + 3; if ((pos + 2) > resource_length) goto text_html; - while ((resource[pos] != '\x2D') && - (resource[pos+1] != '\x2D') && - (resource[pos+2] != '\x3E')) { + while (!g_str_has_prefix (resource + pos, "-->")) { pos++; if ((pos + 2) > resource_length) goto text_html; } + pos = pos + 3; + goto look_for_tag; } if (pos > resource_length) goto text_html; - /* == ! */ - if (resource[pos] == '\x21') { + if (resource[pos] == '!') { do { pos++; if (pos > resource_length) goto text_html; - } while (resource[pos] != '\x3E'); + } while (resource[pos] != '>'); pos++; goto look_for_tag; - } else if (resource[pos] == '\x3F') { /* ? */ + } else if (resource[pos] == '?') { do { pos++; if ((pos + 1) > resource_length) goto text_html; - } while ((resource[pos] != '\x3F') && - (resource[pos+1] != '\x3E')); + } while (!g_str_has_prefix (resource + pos, "?>")); pos = pos + 2; goto look_for_tag; } - if ((pos + 2) > resource_length) + if ((pos + 3) > resource_length) goto text_html; - if ((resource[pos] == '\x72') && - (resource[pos+1] == '\x73') && - (resource[pos+2] == '\x73')) + if (g_str_has_prefix (resource + pos, "rss")) return g_strdup ("application/rss+xml"); - if ((pos + 3) > resource_length) + if ((pos + 4) > resource_length) goto text_html; - if ((resource[pos] == '\x66') && - (resource[pos+1] == '\x65') && - (resource[pos+2] == '\x65') && - (resource[pos+3] == '\x64')) + if (g_str_has_prefix (resource + pos, "feed")) return g_strdup ("application/atom+xml"); + if ((pos + 7) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "rdf:RDF")) { + pos = pos + 7; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) { + pos = pos + 32; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) + return g_strdup ("application/rss+xml"); + } + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) { + pos = pos + 55; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) + return g_strdup ("application/rss+xml"); + } + } + text_html: return g_strdup ("text/html"); } @@ -641,6 +681,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, !g_ascii_strcasecmp (content_type, "application/xml")) return g_strdup (content_type); + /* 5. Distinguish feed from HTML. */ + if (!g_ascii_strcasecmp (content_type, "text/html")) + return sniff_feed_or_html (sniffer, buffer); + /* 2.7.5 Content-Type sniffing: image * The spec says: * @@ -659,9 +703,6 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, return sniff_text_or_binary (sniffer, buffer); } - if (!g_ascii_strcasecmp (content_type, "text/html")) - return sniff_feed_or_html (sniffer, buffer); - return g_strdup (content_type); } diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf new file mode 100644 index 00000000..f3d9e276 --- /dev/null +++ b/tests/resources/feed.rdf @@ -0,0 +1,32 @@ + + + + + + + + XML.com + http://xml.com/pub + + XML.com features a rich mix of information and services + for the XML community. + + + + + + + + + + + + + + + diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index 2dc9fb2c..498df976 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -539,11 +539,6 @@ main (int argc, char **argv) "type/application_xml/home.gif => application/xml", do_sniffing_test); - /* Test the image sniffing path */ - g_test_add_data_func ("/sniffing/type/image", - "type/image_png/home.gif => image/gif", - do_sniffing_test); - /* Test the feed or html path */ g_test_add_data_func ("/sniffing/type/html/html", "type/text_html/test.html => text/html", @@ -554,6 +549,14 @@ main (int argc, char **argv) g_test_add_data_func ("/sniffing/type/html/atom", "type/text_html/atom.xml => application/atom+xml", do_sniffing_test); + g_test_add_data_func ("/sniffing/type/html/rdf", + "type/text_html/feed.rdf => application/rss+xml", + do_sniffing_test); + + /* Test the image sniffing path */ + g_test_add_data_func ("/sniffing/type/image", + "type/image_png/home.gif => image/gif", + do_sniffing_test); /* The spec tells us to only use the last Content-Type header */ g_test_add_data_func ("/sniffing/multiple-headers", diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index d24a04b0..320cd63d 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -3,6 +3,7 @@ index.txt resources/atom.xml + resources/feed.rdf resources/home.gif resources/html_binary.html resources/leading_space.html -- cgit v1.2.1 From 26a65181db0b1fc3eb97748a5e3d9ceeecdc62e3 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Tue, 10 Dec 2013 17:17:40 +0100 Subject: sniffing: Bring image sniffing up-to-date with the MIMESNIFF spec --- libsoup/soup-content-sniffer.c | 212 ++++++++++++++++++++++------------------- tests/resources/home.jpg | Bin 0 -> 1074 bytes tests/resources/home.png | Bin 0 -> 313 bytes tests/resources/tux.webp | Bin 0 -> 17128 bytes tests/sniffing-test.c | 11 ++- tests/soup-tests.gresource.xml | 3 + 6 files changed, 127 insertions(+), 99 deletions(-) create mode 100644 tests/resources/home.jpg create mode 100644 tests/resources/home.png create mode 100644 tests/resources/tux.webp diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 5b768bb2..5e0b2a70 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -77,6 +77,105 @@ soup_content_sniffer_init (SoupContentSniffer *content_sniffer) { } +typedef struct { + const guchar *mask; + const guchar *pattern; + guint pattern_length; + const char *sniffed_type; +} SoupContentSnifferMediaPattern; + +static char* +sniff_media (SoupContentSniffer *sniffer, + SoupBuffer *buffer, + SoupContentSnifferMediaPattern table[], + int table_length) +{ + const guchar *resource = (const guchar *)buffer->data; + int resource_length = MIN (512, buffer->length); + int i; + + for (i = 0; i < table_length; i++) { + SoupContentSnifferMediaPattern *type_row = &(table[i]); + int j; + + if (resource_length < type_row->pattern_length) + continue; + + for (j = 0; j < type_row->pattern_length; j++) { + if ((type_row->mask[j] & resource[j]) != type_row->pattern[j]) + break; + } + + /* This means our comparison above matched completely */ + if (j == type_row->pattern_length) + return g_strdup (type_row->sniffed_type); + } + + return NULL; +} + +/* This table is based on the MIMESNIFF spec; + * See 6.1 Matching an image type pattern + */ +static SoupContentSnifferMediaPattern image_types_table[] = { + + /* Windows icon signature. */ + { (const guchar *)"\xFF\xFF\xFF\xFF", + (const guchar *)"\x00\x00\x01\x00", + 4, + "image/x-icon" }, + + /* Windows cursor signature. */ + { (const guchar *)"\xFF\xFF\xFF\xFF", + (const guchar *)"\x00\x00\x02\x00", + 4, + "image/x-icon" }, + + /* BMP. */ + { (const guchar *)"\xFF\xFF", + (const guchar *)"BM", + 2, + "image/bmp" }, + + /* GIFs. */ + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"GIF87a", + 6, + "image/gif" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"GIF89a", + 6, + "image/gif" }, + + /* WEBP. */ + { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP", + 14, + "image/webp" }, + + /* PNG. */ + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A", + 8, + "image/png" }, + + /* JPEG. */ + { (const guchar *)"\xFF\xFF\xFF", + (const guchar *)"\xFF\xD8\xFF", + 3, + "image/jpeg" }, +}; + +static char* +sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer) +{ + return sniff_media (sniffer, + buffer, + image_types_table, + G_N_ELEMENTS (image_types_table)); +} + /* This table is based on the MIMESNIFF spec; * See 7.1 Identifying a resource with an unknown MIME type */ @@ -262,66 +361,6 @@ static SoupContentSnifferPattern types_table[] = { 4, "text/plain", FALSE }, - - /* Images. */ - - { FALSE, FALSE, /* Windows icon signature. */ - (const guchar *)"\xFF\xFF\xFF\xFF", - (const guchar *)"\x00\x00\x01\x00", - 4, - "image/x-icon", - FALSE }, - - { FALSE, FALSE, /* Windows cursor signature. */ - (const guchar *)"\xFF\xFF\xFF\xFF", - (const guchar *)"\x00\x00\x02\x00", - 4, - "image/x-icon", - FALSE }, - - { FALSE, FALSE, /* BMP. */ - (const guchar *)"\xFF\xFF", - (const guchar *)"BM", - 2, - "image/bmp", - FALSE }, - - { FALSE, FALSE, /* GIF. */ - (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"GIF87a", - 6, - "image/gif", - FALSE }, - - { FALSE, FALSE, /* GIF. */ - (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"GIF89a", - 6, - "image/gif", - FALSE }, - - { FALSE, FALSE, /* WEBP. */ - (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP", - 14, - "image/webp", - FALSE }, - - { FALSE, FALSE, /* PNG. */ - (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A", - 8, - "image/png", - FALSE }, - - { FALSE, FALSE, /* JPEG. */ - (const guchar *)"\xFF\xFF\xFF", - (const guchar *)"\xFF\xD8\xFF", - 3, - "image/jpeg", - FALSE }, - - /* TODO: audio/video, archive type. */ }; /* Whether a given byte looks like it might be part of binary content. @@ -352,6 +391,7 @@ static char* sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, gboolean sniff_scriptable) { + char *sniffed_type = NULL; const guchar *resource = (const guchar *)buffer->data; int resource_length = MIN (512, buffer->length); int i; @@ -417,6 +457,12 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, } } + sniffed_type = sniff_images (sniffer, buffer); + + if (sniffed_type != NULL) + return sniffed_type; + + for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) return g_strdup ("application/octet-stream"); @@ -464,33 +510,6 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) return sniff_unknown (sniffer, buffer, TRUE); } -static char* -sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer, - const char *content_type) -{ - const guchar *resource = (const guchar *)buffer->data; - int resource_length = MIN (512, buffer->length); - int i; - - for (i = 0; i < G_N_ELEMENTS (types_table); i++) { - SoupContentSnifferPattern *type_row = &(types_table[i]); - - if (resource_length < type_row->pattern_length) - continue; - - if (!g_str_has_prefix (type_row->sniffed_type, "image/")) - continue; - - /* All of the image types use all-\xFF for the mask, - * so we can just memcmp. - */ - if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0) - return g_strdup (type_row->sniffed_type); - } - - return g_strdup (content_type); -} - static gboolean skip_insignificant_space (const char *resource, int *pos, int resource_length) { @@ -646,6 +665,7 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, { const char *content_type; const char *x_content_type_options; + char *sniffed_type = NULL; gboolean no_sniff = FALSE; content_type = soup_message_headers_get_content_type (msg->response_headers, params); @@ -685,18 +705,14 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, if (!g_ascii_strcasecmp (content_type, "text/html")) return sniff_feed_or_html (sniffer, buffer); - /* 2.7.5 Content-Type sniffing: image - * The spec says: - * - * If the resource's official type is "image/svg+xml", then - * the sniffed type of the resource is its official type (an - * XML type) - * - * The XML case is handled by the if above; if you refactor - * this code, keep this in mind. + /* 6. Image types. */ - if (!g_ascii_strncasecmp (content_type, "image/", 6)) - return sniff_images (sniffer, buffer, content_type); + if (!g_ascii_strncasecmp (content_type, "image/", 6)) { + sniffed_type = sniff_images (sniffer, buffer); + if (sniffed_type != NULL) + return sniffed_type; + return g_strdup (content_type); + } /* If we got text/plain, use text_or_binary */ if (g_str_equal (content_type, "text/plain")) { diff --git a/tests/resources/home.jpg b/tests/resources/home.jpg new file mode 100644 index 00000000..ac1f3bbc Binary files /dev/null and b/tests/resources/home.jpg differ diff --git a/tests/resources/home.png b/tests/resources/home.png new file mode 100644 index 00000000..0bb82bac Binary files /dev/null and b/tests/resources/home.png differ diff --git a/tests/resources/tux.webp b/tests/resources/tux.webp new file mode 100644 index 00000000..8764f066 Binary files /dev/null and b/tests/resources/tux.webp differ diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index 498df976..868b7c5f 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -554,9 +554,18 @@ main (int argc, char **argv) do_sniffing_test); /* Test the image sniffing path */ - g_test_add_data_func ("/sniffing/type/image", + g_test_add_data_func ("/sniffing/type/image/gif", "type/image_png/home.gif => image/gif", do_sniffing_test); + g_test_add_data_func ("/sniffing/type/image/png", + "type/image_gif/home.png => image/png", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/image/jpeg", + "type/image_png/home.jpg => image/jpeg", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/image/webp", + "type/image_png/tux.webp => image/webp", + do_sniffing_test); /* The spec tells us to only use the last Content-Type header */ g_test_add_data_func ("/sniffing/multiple-headers", diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index 320cd63d..444ed588 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -5,6 +5,8 @@ resources/atom.xml resources/feed.rdf resources/home.gif + resources/home.jpg + resources/home.png resources/html_binary.html resources/leading_space.html resources/mbox @@ -16,5 +18,6 @@ resources/test.html resources/text.txt resources/text_binary.txt + resources/tux.webp -- cgit v1.2.1 From 9530c7b58197d7d45e21aa60a9986735ccf515b1 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Tue, 10 Dec 2013 19:45:55 +0100 Subject: sniffing: Add audio/video sniffing --- libsoup/soup-content-sniffer.c | 111 +++++++++++++++++++++++++++++++++++++++++ tests/resources/test.aiff | Bin 0 -> 384088 bytes tests/resources/test.mp4 | Bin 0 -> 192844 bytes tests/resources/test.ogg | Bin 0 -> 16994 bytes tests/resources/test.wav | Bin 0 -> 384080 bytes tests/resources/test.webm | Bin 0 -> 149879 bytes tests/sniffing-test.c | 19 +++++++ tests/soup-tests.gresource.xml | 5 ++ 8 files changed, 135 insertions(+) create mode 100644 tests/resources/test.aiff create mode 100644 tests/resources/test.mp4 create mode 100644 tests/resources/test.ogg create mode 100644 tests/resources/test.wav create mode 100644 tests/resources/test.webm diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 5e0b2a70..5659af9a 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -176,6 +176,103 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer) G_N_ELEMENTS (image_types_table)); } +/* This table is based on the MIMESNIFF spec; + * See 6.2 Matching an audio or video type pattern + */ +static SoupContentSnifferMediaPattern audio_video_types_table[] = { + { (const guchar *)"\xFF\xFF\xFF\xFF", + (const guchar *)"\x1A\x45\xDF\xA3", + 4, + "video/webm" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF", + (const guchar *)".snd", + 4, + "audio/basic" }, + + + { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + (const guchar *)"FORM\0\0\0\0AIFF", + 12, + "audio/aiff" }, + + { (const guchar *)"\xFF\xFF\xFF", + (const guchar *)"ID3", + 3, + "audio/mpeg" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"OggS\0", + 5, + "application/ogg" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"MThd\x00\x00\x00\x06", + 8, + "audio/midi" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + (const guchar *)"RIFF\x00\x00\x00\x00AVI ", + 12, + "video/avi" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + (const guchar *)"RIFF\x00\x00\x00\x00WAVE", + 12, + "audio/wave" }, +}; + +static gboolean +sniff_mp4 (SoupContentSniffer *sniffer, SoupBuffer *buffer) +{ + const char *resource = (const char *)buffer->data; + int resource_length = MIN (512, buffer->length); + guint32 box_size = *((guint32*)resource); + int i; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + box_size = ((box_size >> 24) | + ((box_size << 8) & 0x00FF0000) | + ((box_size >> 8) & 0x0000FF00) | + (box_size << 24)); +#endif + + if (resource_length < 12 || resource_length < box_size || box_size % 4 != 0) + return FALSE; + + if (!g_str_has_prefix (resource + 4, "ftyp")) + return FALSE; + + if (!g_str_has_prefix (resource + 8, "mp4")) + return FALSE; + + for (i = 16; i < box_size && i < resource_length; i = i + 4) { + if (g_str_has_prefix (resource + i, "mp4")) + return TRUE; + } + + return FALSE; +} + +static char* +sniff_audio_video (SoupContentSniffer *sniffer, SoupBuffer *buffer) +{ + char *sniffed_type; + + sniffed_type = sniff_media (sniffer, + buffer, + audio_video_types_table, + G_N_ELEMENTS (audio_video_types_table)); + + if (sniffed_type != NULL) + return sniffed_type; + + if (sniff_mp4 (sniffer, buffer)) + return g_strdup ("video/mp4"); + + return NULL; +} + /* This table is based on the MIMESNIFF spec; * See 7.1 Identifying a resource with an unknown MIME type */ @@ -462,6 +559,10 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, if (sniffed_type != NULL) return sniffed_type; + sniffed_type = sniff_audio_video (sniffer, buffer); + + if (sniffed_type != NULL) + return sniffed_type; for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) @@ -714,6 +815,16 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, return g_strdup (content_type); } + /* 7. Audio and video types. */ + if (!g_ascii_strncasecmp (content_type, "audio/", 6) || + !g_ascii_strncasecmp (content_type, "video/", 6) || + !g_ascii_strcasecmp (content_type, "application/ogg")) { + sniffed_type = sniff_audio_video (sniffer, buffer); + if (sniffed_type != NULL) + return sniffed_type; + return g_strdup (content_type); + } + /* If we got text/plain, use text_or_binary */ if (g_str_equal (content_type, "text/plain")) { return sniff_text_or_binary (sniffer, buffer); diff --git a/tests/resources/test.aiff b/tests/resources/test.aiff new file mode 100644 index 00000000..9a1ecbb2 Binary files /dev/null and b/tests/resources/test.aiff differ diff --git a/tests/resources/test.mp4 b/tests/resources/test.mp4 new file mode 100644 index 00000000..d278c8ad Binary files /dev/null and b/tests/resources/test.mp4 differ diff --git a/tests/resources/test.ogg b/tests/resources/test.ogg new file mode 100644 index 00000000..e8f49ac3 Binary files /dev/null and b/tests/resources/test.ogg differ diff --git a/tests/resources/test.wav b/tests/resources/test.wav new file mode 100644 index 00000000..11660b29 Binary files /dev/null and b/tests/resources/test.wav differ diff --git a/tests/resources/test.webm b/tests/resources/test.webm new file mode 100644 index 00000000..7e53d0b4 Binary files /dev/null and b/tests/resources/test.webm differ diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index 868b7c5f..5b0e6ee7 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -567,6 +567,25 @@ main (int argc, char **argv) "type/image_png/tux.webp => image/webp", do_sniffing_test); + /* Test audio and video sniffing path */ + g_test_add_data_func ("/sniffing/type/audio/wav", + "type/audio_mpeg/test.wav => audio/wave", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/audio/aiff", + "type/audio_mpeg/test.aiff => audio/aiff", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/audio/ogg", + "type/audio_mpeg/test.ogg => application/ogg", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/video/webm", + "type/video_theora/test.webm => video/webm", + do_sniffing_test); + + /* Test the MP4 sniffing path */ + g_test_add_data_func ("/sniffing/type/video/mp4", + "unknown/test.mp4 => video/mp4", + do_sniffing_test); + /* The spec tells us to only use the last Content-Type header */ g_test_add_data_func ("/sniffing/multiple-headers", "multiple_headers/home.gif => image/gif", diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index 444ed588..b24a7297 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -15,7 +15,12 @@ resources/mbox.zlib resources/ps_binary.ps resources/rss20.xml + resources/test.aiff resources/test.html + resources/test.mp4 + resources/test.ogg + resources/test.wav + resources/test.webm resources/text.txt resources/text_binary.txt resources/tux.webp -- cgit v1.2.1