From 6510806d97713450625bbd648d3ce6cd953a4df9 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Sun, 8 Dec 2013 19:11:21 +0100 Subject: sniffing: Implement the check-apache-bug flag Run the text or binary algorithm when some specific text/plain Content-Types are provided, since older versions of apache would send that type for binary files. http://mimesniff.spec.whatwg.org/#dfnReturnLink-0 --- libsoup/soup-content-sniffer.c | 37 ++++++++++++++++++++++++++++--------- tests/resources/text.txt | 1 + tests/sniffing-test.c | 11 ++++++++++- tests/soup-tests.gresource.xml | 1 + 4 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 tests/resources/text.txt diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index d2a0808f..e16658ba 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -2,7 +2,7 @@ /* * soup-content-sniffer.c * - * Copyright (C) 2009 Gustavo Noronha Silva. + * Copyright (C) 2009, 2013 Gustavo Noronha Silva. * * This code implements the following specification: * @@ -297,7 +297,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, return g_strdup ("text/plain"); } -/* HTML5: 2.7.3 Content-Type sniffing: text or binary */ +/* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */ static char* sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) { @@ -306,15 +306,20 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) gboolean looks_binary = FALSE; int i; - /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */ - if (resource_length >= 4) { + /* 2. 
Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */ + if (resource_length >= 2) { if ((resource[0] == 0xFE && resource[1] == 0xFF) || - (resource[0] == 0xFF && resource[1] == 0xFE) || - (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)) + (resource[0] == 0xFF && resource[1] == 0xFE)) return g_strdup ("text/plain"); } - /* Look to see if any of the first n bytes looks binary */ + /* 3. UTF-8 BOM. */ + if (resource_length >= 3) { + if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF) + return g_strdup ("text/plain"); + } + + /* 4. Look to see if any of the first n bytes looks binary */ for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) { looks_binary = TRUE; @@ -325,6 +330,9 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) if (!looks_binary) return g_strdup ("text/plain"); + /* 5. Execute 7.1 Identifying a resource with an unknown MIME type. + * TODO: sniff-scriptable needs to be unset. + */ return sniff_unknown (sniffer, buffer, TRUE); } @@ -472,14 +480,25 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, content_type = soup_message_headers_get_content_type (msg->response_headers, params); - /* These comparisons are done in an ASCII-case-insensitive - * manner because the spec requires it */ + /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */ + + /* 1. Unknown/undefined supplied type respecting sniff-scriptable. */ if ((content_type == NULL) || !g_ascii_strcasecmp (content_type, "unknown/unknown") || !g_ascii_strcasecmp (content_type, "application/unknown") || !g_ascii_strcasecmp (content_type, "*/*")) return sniff_unknown (sniffer, buffer, FALSE); + /* TODO: 2. no-sniff flag handling. */ + + /* 3. 
check-for-apache-bug */ + if ((content_type != NULL) && + (g_str_equal (content_type, "text/plain") || + g_str_equal (content_type, "text/plain; charset=ISO-8859-1") || + g_str_equal (content_type, "text/plain; charset=iso-8859-1") || + g_str_equal (content_type, "text/plain; charset=UTF-8"))) + return sniff_text_or_binary (sniffer, buffer); + if (g_str_has_suffix (content_type, "+xml") || !g_ascii_strcasecmp (content_type, "text/xml") || !g_ascii_strcasecmp (content_type, "application/xml")) diff --git a/tests/resources/text.txt b/tests/resources/text.txt new file mode 100644 index 00000000..ff7066f6 --- /dev/null +++ b/tests/resources/text.txt @@ -0,0 +1 @@ +This is just text. diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index 532e6ed5..b668f403 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -49,7 +49,7 @@ server_callback (SoupServer *server, SoupMessage *msg, "Content-Type", "text/plain"); } - if (g_str_has_prefix (path, "/text_or_binary/")) { + if (g_str_has_prefix (path, "/text_or_binary/") || g_str_has_prefix (path, "/apache_bug/")) { char *base_name = g_path_get_basename (path); response = soup_test_load_resource (base_name, &error); @@ -442,6 +442,15 @@ main (int argc, char **argv) GINT_TO_POINTER (TRUE), do_signals_tests); + /* Test the apache bug sniffing path */ + g_test_add_data_func ("/sniffing/apache-bug/binary", + "/apache_bug/text_binary.txt => application/octet-stream", + do_sniffing_test); + g_test_add_data_func ("/sniffing/apache-bug/text", + "/apache_bug/text.txt => text/plain", + do_sniffing_test); + + /* GIF is a 'safe' type */ g_test_add_data_func ("/sniffing/type/gif", "text_or_binary/home.gif => image/gif", do_sniffing_test); diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index 9b580a35..2fe21ddc 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -12,6 +12,7 @@ resources/ps_binary.ps resources/rss20.xml resources/test.html + resources/text.txt 
resources/text_binary.txt -- cgit v1.2.1