From fa2b82c870277a4b6717c852ea0185442af93014 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Mon, 9 Dec 2013 11:54:29 +0100 Subject: sniffing: Adjust the general unknown MIME type algorithm This change adjusts the pattern matching table to the current form of the MIMESNIFF spec, adding a check for a tag-terminating byte and using the formally defined sniff-scriptable flag. --- libsoup/soup-content-sniffer.c | 240 ++++++++++++++++++++++++++++++--------- tests/resources/html_binary.html | 2 +- 2 files changed, 187 insertions(+), 55 deletions(-) diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 6dec3e30..39210af6 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -77,12 +77,16 @@ soup_content_sniffer_init (SoupContentSniffer *content_sniffer) { } -/* This table is based on the HTML5 spec; - * See 2.7.4 Content-Type sniffing: unknown type +/* This table is based on the MIMESNIFF spec; + * See 7.1 Identifying a resource with an unknown MIME type */ typedef struct { /* @has_ws is TRUE if @pattern contains "generic" whitespace */ gboolean has_ws; + /* @has_tag_termination is TRUE if we should check for a tag-terminating + * byte (0x20 " " or 0x3E ">") after the pattern match. + */ + gboolean has_tag_termination; const guchar *mask; const guchar *pattern; guint pattern_length; @@ -90,111 +94,234 @@ typedef struct { gboolean scriptable; } SoupContentSnifferPattern; + +/* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space + * is allowed. Those spaces are marked with \x00 on the mask. + */ static SoupContentSnifferPattern types_table[] = { - { FALSE, - (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", - (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C", + /* Scriptable types. */ + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", + (const guchar *)" data; int resource_length = MIN (512, buffer->length); @@ -232,9 +359,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, for (i = 0; i < G_N_ELEMENTS (types_table); i++) { SoupContentSnifferPattern *type_row = &(types_table[i]); - /* The scriptable types should be skiped for the text - * or binary path, but considered for other paths */ - if (for_text_or_binary && type_row->scriptable) + if (!sniff_scriptable && type_row->scriptable) continue; if (type_row->has_ws) { @@ -267,8 +392,14 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, if (skip_row) continue; - if (index_pattern > type_row->pattern_length) + if (index_pattern > type_row->pattern_length) { + if (type_row->has_tag_termination && + resource[index_stream] != '\x20' && + resource[index_stream] != '\x3E') + continue; + return g_strdup (type_row->sniffed_type); + } } else { int j; @@ -286,9 +417,6 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, } } - if (for_text_or_binary) - return g_strdup ("application/octet-stream"); - for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) return g_strdup ("application/octet-stream"); @@ -478,21 +606,25 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, { const char *content_type; const char *x_content_type_options; + gboolean no_sniff = FALSE; content_type = soup_message_headers_get_content_type (msg->response_headers, params); /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */ - /* 1. Unknown/undefined supplied type respecting sniff-scritable. */ + x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options"); + if (!g_strcmp0 (x_content_type_options, "nosniff")) + no_sniff = TRUE; + + /* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */ if ((content_type == NULL) || !g_ascii_strcasecmp (content_type, "unknown/unknown") || !g_ascii_strcasecmp (content_type, "application/unknown") || !g_ascii_strcasecmp (content_type, "*/*")) - return sniff_unknown (sniffer, buffer, FALSE); + return sniff_unknown (sniffer, buffer, !no_sniff); /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */ - x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options"); - if (!g_strcmp0 (x_content_type_options, "nosniff")) + if (no_sniff) return g_strdup (content_type); /* 3. check-for-apache-bug */ diff --git a/tests/resources/html_binary.html b/tests/resources/html_binary.html index 9200dd42..d443048c 100644 --- a/tests/resources/html_binary.html +++ b/tests/resources/html_binary.html @@ -1 +1 @@ -