diff options
-rw-r--r-- | libsoup/soup-content-sniffer.c | 632 | ||||
-rw-r--r-- | tests/resources/feed.rdf | 32 | ||||
-rw-r--r-- | tests/resources/home.jpg | bin | 0 -> 1074 bytes | |||
-rw-r--r-- | tests/resources/home.png | bin | 0 -> 313 bytes | |||
-rw-r--r-- | tests/resources/html_binary.html | 2 | ||||
-rw-r--r-- | tests/resources/leading_space.html | 12 | ||||
-rw-r--r-- | tests/resources/test.aiff | bin | 0 -> 384088 bytes | |||
-rw-r--r-- | tests/resources/test.mp4 | bin | 0 -> 192844 bytes | |||
-rw-r--r-- | tests/resources/test.ogg | bin | 0 -> 16994 bytes | |||
-rw-r--r-- | tests/resources/test.wav | bin | 0 -> 384080 bytes | |||
-rw-r--r-- | tests/resources/test.webm | bin | 0 -> 149879 bytes | |||
-rw-r--r-- | tests/resources/text.txt | 1 | ||||
-rw-r--r-- | tests/resources/tux.webp | bin | 0 -> 17128 bytes | |||
-rw-r--r-- | tests/sniffing-test.c | 74 | ||||
-rw-r--r-- | tests/soup-tests.gresource.xml | 11 |
15 files changed, 605 insertions, 159 deletions
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index fb2aa090..5659af9a 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -2,7 +2,11 @@ /* * soup-content-sniffer.c * - * Copyright (C) 2009 Gustavo Noronha Silva. + * Copyright (C) 2009, 2013 Gustavo Noronha Silva. + * + * This code implements the following specification: + * + * http://mimesniff.spec.whatwg.org/ as of 11 June 2013 */ #ifdef HAVE_CONFIG_H @@ -73,12 +77,212 @@ soup_content_sniffer_init (SoupContentSniffer *content_sniffer) { } -/* This table is based on the HTML5 spec; - * See 2.7.4 Content-Type sniffing: unknown type +typedef struct { + const guchar *mask; + const guchar *pattern; + guint pattern_length; + const char *sniffed_type; +} SoupContentSnifferMediaPattern; + +static char* +sniff_media (SoupContentSniffer *sniffer, + SoupBuffer *buffer, + SoupContentSnifferMediaPattern table[], + int table_length) +{ + const guchar *resource = (const guchar *)buffer->data; + int resource_length = MIN (512, buffer->length); + int i; + + for (i = 0; i < table_length; i++) { + SoupContentSnifferMediaPattern *type_row = &(table[i]); + int j; + + if (resource_length < type_row->pattern_length) + continue; + + for (j = 0; j < type_row->pattern_length; j++) { + if ((type_row->mask[j] & resource[j]) != type_row->pattern[j]) + break; + } + + /* This means our comparison above matched completely */ + if (j == type_row->pattern_length) + return g_strdup (type_row->sniffed_type); + } + + return NULL; +} + +/* This table is based on the MIMESNIFF spec; + * See 6.1 Matching an image type pattern + */ +static SoupContentSnifferMediaPattern image_types_table[] = { + + /* Windows icon signature. */ + { (const guchar *)"\xFF\xFF\xFF\xFF", + (const guchar *)"\x00\x00\x01\x00", + 4, + "image/x-icon" }, + + /* Windows cursor signature. */ + { (const guchar *)"\xFF\xFF\xFF\xFF", + (const guchar *)"\x00\x00\x02\x00", + 4, + "image/x-icon" }, + + /* BMP. */ + { (const guchar *)"\xFF\xFF", + (const guchar *)"BM", + 2, + "image/bmp" }, + + /* GIFs. */ + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"GIF87a", + 6, + "image/gif" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"GIF89a", + 6, + "image/gif" }, + + /* WEBP. */ + { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP", + 14, + "image/webp" }, + + /* PNG. */ + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A", + 8, + "image/png" }, + + /* JPEG. */ + { (const guchar *)"\xFF\xFF\xFF", + (const guchar *)"\xFF\xD8\xFF", + 3, + "image/jpeg" }, +}; + +static char* +sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer) +{ + return sniff_media (sniffer, + buffer, + image_types_table, + G_N_ELEMENTS (image_types_table)); +} + +/* This table is based on the MIMESNIFF spec; + * See 6.2 Matching an audio or video type pattern + */ +static SoupContentSnifferMediaPattern audio_video_types_table[] = { + { (const guchar *)"\xFF\xFF\xFF\xFF", + (const guchar *)"\x1A\x45\xDF\xA3", + 4, + "video/webm" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF", + (const guchar *)".snd", + 4, + "audio/basic" }, + + + { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + (const guchar *)"FORM\0\0\0\0AIFF", + 12, + "audio/aiff" }, + + { (const guchar *)"\xFF\xFF\xFF", + (const guchar *)"ID3", + 3, + "audio/mpeg" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"OggS\0", + 5, + "application/ogg" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", + (const guchar *)"MThd\x00\x00\x00\x06", + 8, + "audio/midi" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + (const guchar *)"RIFF\x00\x00\x00\x00AVI ", + 12, + "video/avi" }, + + { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", + (const guchar *)"RIFF\x00\x00\x00\x00WAVE", + 12, + "audio/wave" }, +}; + +static gboolean +sniff_mp4 (SoupContentSniffer *sniffer, SoupBuffer *buffer) +{ + const char *resource = (const char *)buffer->data; + int resource_length = MIN (512, buffer->length); + guint32 box_size = *((guint32*)resource); + int i; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + box_size = ((box_size >> 24) | + ((box_size << 8) & 0x00FF0000) | + ((box_size >> 8) & 0x0000FF00) | + (box_size << 24)); +#endif + + if (resource_length < 12 || resource_length < box_size || box_size % 4 != 0) + return FALSE; + + if (!g_str_has_prefix (resource + 4, "ftyp")) + return FALSE; + + if (!g_str_has_prefix (resource + 8, "mp4")) + return FALSE; + + for (i = 16; i < box_size && i < resource_length; i = i + 4) { + if (g_str_has_prefix (resource + i, "mp4")) + return TRUE; + } + + return FALSE; +} + +static char* +sniff_audio_video (SoupContentSniffer *sniffer, SoupBuffer *buffer) +{ + char *sniffed_type; + + sniffed_type = sniff_media (sniffer, + buffer, + audio_video_types_table, + G_N_ELEMENTS (audio_video_types_table)); + + if (sniffed_type != NULL) + return sniffed_type; + + if (sniff_mp4 (sniffer, buffer)) + return g_strdup ("video/mp4"); + + return NULL; +} + +/* This table is based on the MIMESNIFF spec; + * See 7.1 Identifying a resource with an unknown MIME type */ typedef struct { /* @has_ws is TRUE if @pattern contains "generic" whitespace */ gboolean has_ws; + /* @has_tag_termination is TRUE if we should check for a tag-terminating + * byte (0x20 " " or 0x3E ">") after the pattern match. + */ + gboolean has_tag_termination; const guchar *mask; const guchar *pattern; guint pattern_length; @@ -86,111 +290,174 @@ typedef struct { gboolean scriptable; } SoupContentSnifferPattern; + +/* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space + * is allowed. Those spaces are marked with \x00 on the mask. + */ static SoupContentSnifferPattern types_table[] = { - { FALSE, - (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", - (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C", + /* Scriptable types. */ + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", + (const guchar *)" <!DOCTYPE HTML", 14, "text/html", TRUE }, - { TRUE, - (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF", - (const guchar *)" \x3C\x48\x54\x4D\x4C", + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF", + (const guchar *)" <HTML", 5, "text/html", TRUE }, - { TRUE, - (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF", - (const guchar *)" \x3C\x48\x45\x41\x44", + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF", + (const guchar *)" <HEAD", 5, "text/html", TRUE }, - { TRUE, - (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF", - (const guchar *)" \x3C\x53\x43\x52\x49\x50\x54", + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF", + (const guchar *)" <SCRIPT", 7, "text/html", TRUE }, - { FALSE, + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF", + (const guchar *)" <IFRAME", + 7, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xFF", + (const guchar *)" <H1", + 3, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF", + (const guchar *)" <DIV", + 4, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF", + (const guchar *)" <FONT", + 5, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF", + (const guchar *)" <TABLE", + 6, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF", + (const guchar *)" <A", + 2, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF", + (const guchar *)" <STYLE", + 6, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF", + (const guchar *)" <TITLE", + 6, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF", + (const guchar *)" <B", + 2, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF", + (const guchar *)" <BODY", + 5, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF\xDF", + (const guchar *)" <BR", + 3, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xDF", + (const guchar *)" <P", + 2, + "text/html", + TRUE }, + + { TRUE, TRUE, + (const guchar *)"\x00\xFF\xFF\xFF\xFF", + (const guchar *)" <!--", + 4, + "text/html", + TRUE }, + + { TRUE, FALSE, + (const guchar *)"\x00\xFF\xFF\xFF\xFF\xFF", + (const guchar *)" <?xml", + 5, + "text/html", + TRUE }, + + { FALSE, FALSE, (const guchar *)"\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"\x25\x50\x44\x46\x2D", + (const guchar *)"%PDF-", 5, "application/pdf", TRUE }, - { FALSE, + /* Non-scriptable types. */ + { FALSE, FALSE, (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D", + (const guchar *)"%!PS-Adobe-", 11, "application/postscript", FALSE }, - { FALSE, + { FALSE, FALSE, /* UTF-16BE BOM */ (const guchar *)"\xFF\xFF\x00\x00", (const guchar *)"\xFE\xFF\x00\x00", 4, "text/plain", FALSE }, - { FALSE, - (const guchar *)"\xFF\xFF\x00\x00", + { FALSE, FALSE, /* UTF-16LE BOM */ (const guchar *)"\xFF\xFF\x00\x00", + (const guchar *)"\xFF\xFE\x00\x00", 4, "text/plain", FALSE }, - { FALSE, + { FALSE, FALSE, /* UTF-8 BOM */ (const guchar *)"\xFF\xFF\xFF\x00", (const guchar *)"\xEF\xBB\xBF\x00", 4, "text/plain", FALSE }, - - { FALSE, - (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"\x47\x49\x46\x38\x37\x61", - 6, - "image/gif", - FALSE }, - - { FALSE, - (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"\x47\x49\x46\x38\x39\x61", - 6, - "image/gif", - FALSE }, - - { FALSE, - (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", - (const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", - 8, - "image/png", - FALSE }, - - { FALSE, - (const guchar *)"\xFF\xFF\xFF", - (const guchar *)"\xFF\xD8\xFF", - 3, - "image/jpeg", - FALSE }, - - { FALSE, - (const guchar *)"\xFF\xFF", - (const guchar *)"\x42\x4D", - 2, - "image/bmp", - FALSE }, - - { FALSE, - (const guchar *)"\xFF\xFF\xFF\xFF", - (const guchar *)"\x00\x00\x01\x00", - 4, - "image/vnd.microsoft.icon", - FALSE } }; /* Whether a given byte looks like it might be part of binary content. @@ -219,8 +486,9 @@ static char byte_looks_binary[] = { /* HTML5: 2.7.4 Content-Type sniffing: unknown type */ static char* sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, - gboolean for_text_or_binary) + gboolean sniff_scriptable) { + char *sniffed_type = NULL; const guchar *resource = (const guchar *)buffer->data; int resource_length = MIN (512, buffer->length); int i; @@ -228,9 +496,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, for (i = 0; i < G_N_ELEMENTS (types_table); i++) { SoupContentSnifferPattern *type_row = &(types_table[i]); - /* The scriptable types should be skiped for the text - * or binary path, but considered for other paths */ - if (for_text_or_binary && type_row->scriptable) + if (!sniff_scriptable && type_row->scriptable) continue; if (type_row->has_ws) { @@ -263,8 +529,14 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, if (skip_row) continue; - if (index_pattern > type_row->pattern_length) + if (index_pattern > type_row->pattern_length) { + if (type_row->has_tag_termination && + resource[index_stream] != '\x20' && + resource[index_stream] != '\x3E') + continue; + return g_strdup (type_row->sniffed_type); + } } else { int j; @@ -282,8 +554,15 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, } } - if (for_text_or_binary) - return g_strdup ("application/octet-stream"); + sniffed_type = sniff_images (sniffer, buffer); + + if (sniffed_type != NULL) + return sniffed_type; + + sniffed_type = sniff_audio_video (sniffer, buffer); + + if (sniffed_type != NULL) + return sniffed_type; for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) @@ -293,7 +572,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer, return g_strdup ("text/plain"); } -/* HTML5: 2.7.3 Content-Type sniffing: text or binary */ +/* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */ static char* sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) { @@ -302,15 +581,20 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) gboolean looks_binary = FALSE; int i; - /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */ - if (resource_length >= 4) { + /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */ + if (resource_length >= 2) { if ((resource[0] == 0xFE && resource[1] == 0xFF) || - (resource[0] == 0xFF && resource[1] == 0xFE) || - (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)) + (resource[0] == 0xFF && resource[1] == 0xFE)) + return g_strdup ("text/plain"); + } + + /* 3. UTF-8 BOM. */ + if (resource_length >= 3) { + if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF) return g_strdup ("text/plain"); } - /* Look to see if any of the first n bytes looks binary */ + /* 4. Look to see if any of the first n bytes looks binary */ for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) { looks_binary = TRUE; @@ -321,40 +605,32 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) if (!looks_binary) return g_strdup ("text/plain"); + /* 5. Execute 7.1 Identifying a resource with an unknown MIME type. + * TODO: sniff-scriptable needs to be unset. + */ return sniff_unknown (sniffer, buffer, TRUE); } -static char* -sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer, - const char *content_type) +static gboolean +skip_insignificant_space (const char *resource, int *pos, int resource_length) { - const guchar *resource = (const guchar *)buffer->data; - int resource_length = MIN (512, buffer->length); - int i; - - for (i = 0; i < G_N_ELEMENTS (types_table); i++) { - SoupContentSnifferPattern *type_row = &(types_table[i]); - - if (resource_length < type_row->pattern_length) - continue; - - if (!g_str_has_prefix (type_row->sniffed_type, "image/")) - continue; - - /* All of the image types use all-\xFF for the mask, - * so we can just memcmp. - */ - if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0) - return g_strdup (type_row->sniffed_type); + while ((resource[*pos] == '\x09') || + (resource[*pos] == '\x20') || + (resource[*pos] == '\x0A') || + (resource[*pos] == '\x0D')) { + *pos = *pos + 1; + + if (*pos > resource_length) + return TRUE; } - return g_strdup (content_type); + return FALSE; } static char* sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) { - const guchar *resource = (const guchar *)buffer->data; + const char *resource = (const char *)buffer->data; int resource_length = MIN (512, buffer->length); int pos = 0; @@ -369,19 +645,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if (pos > resource_length) goto text_html; - /* Skip insignificant white space */ - while ((resource[pos] == '\x09') || - (resource[pos] == '\x20') || - (resource[pos] == '\x0A') || - (resource[pos] == '\x0D')) { - pos++; - - if (pos > resource_length) - goto text_html; - } + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; - /* != < */ - if (resource[pos] != '\x3C') + if (resource[pos] != '<') return g_strdup ("text/html"); pos++; @@ -389,73 +656,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if ((pos + 2) > resource_length) goto text_html; - /* Skipping comments */ - if ((resource[pos] == '\x2D') || - (resource[pos+1] == '\x2D') || - (resource[pos+2] == '\x3E')) { + /* Skip comments. */ + if (g_str_has_prefix (resource + pos, "!--")) { pos = pos + 3; if ((pos + 2) > resource_length) goto text_html; - while ((resource[pos] != '\x2D') && - (resource[pos+1] != '\x2D') && - (resource[pos+2] != '\x3E')) { + while (!g_str_has_prefix (resource + pos, "-->")) { pos++; if ((pos + 2) > resource_length) goto text_html; } + pos = pos + 3; + goto look_for_tag; } if (pos > resource_length) goto text_html; - /* == ! */ - if (resource[pos] == '\x21') { + if (resource[pos] == '!') { do { pos++; if (pos > resource_length) goto text_html; - } while (resource[pos] != '\x3E'); + } while (resource[pos] != '>'); pos++; goto look_for_tag; - } else if (resource[pos] == '\x3F') { /* ? */ + } else if (resource[pos] == '?') { do { pos++; if ((pos + 1) > resource_length) goto text_html; - } while ((resource[pos] != '\x3F') && - (resource[pos+1] != '\x3E')); + } while (!g_str_has_prefix (resource + pos, "?>")); pos = pos + 2; goto look_for_tag; } - if ((pos + 2) > resource_length) + if ((pos + 3) > resource_length) goto text_html; - if ((resource[pos] == '\x72') && - (resource[pos+1] == '\x73') && - (resource[pos+2] == '\x73')) + if (g_str_has_prefix (resource + pos, "rss")) return g_strdup ("application/rss+xml"); - if ((pos + 3) > resource_length) + if ((pos + 4) > resource_length) goto text_html; - if ((resource[pos] == '\x66') && - (resource[pos+1] == '\x65') && - (resource[pos+2] == '\x65') && - (resource[pos+3] == '\x64')) + if (g_str_has_prefix (resource + pos, "feed")) return g_strdup ("application/atom+xml"); + if ((pos + 7) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "rdf:RDF")) { + pos = pos + 7; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) { + pos = pos + 32; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) + return g_strdup ("application/rss+xml"); + } + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) { + pos = pos + 55; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) + return g_strdup ("application/rss+xml"); + } + } + text_html: return g_strdup ("text/html"); } @@ -465,43 +765,71 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params) { const char *content_type; + const char *x_content_type_options; + char *sniffed_type = NULL; + gboolean no_sniff = FALSE; content_type = soup_message_headers_get_content_type (msg->response_headers, params); - /* These comparisons are done in an ASCII-case-insensitive - * manner because the spec requires it */ + /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */ + + x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options"); + if (!g_strcmp0 (x_content_type_options, "nosniff")) + no_sniff = TRUE; + + /* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */ if ((content_type == NULL) || !g_ascii_strcasecmp (content_type, "unknown/unknown") || !g_ascii_strcasecmp (content_type, "application/unknown") || !g_ascii_strcasecmp (content_type, "*/*")) - return sniff_unknown (sniffer, buffer, FALSE); + return sniff_unknown (sniffer, buffer, !no_sniff); + + /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */ + if (no_sniff) + return g_strdup (content_type); + + /* 3. check-for-apache-bug */ + if ((content_type != NULL) && + (g_str_equal (content_type, "text/plain") || + g_str_equal (content_type, "text/plain; charset=ISO-8859-1") || + g_str_equal (content_type, "text/plain; charset=iso-8859-1") || + g_str_equal (content_type, "text/plain; charset=UTF-8"))) + return sniff_text_or_binary (sniffer, buffer); + /* 4. XML types sent by the server are always used. */ if (g_str_has_suffix (content_type, "+xml") || !g_ascii_strcasecmp (content_type, "text/xml") || !g_ascii_strcasecmp (content_type, "application/xml")) return g_strdup (content_type); - /* 2.7.5 Content-Type sniffing: image - * The spec says: - * - * If the resource's official type is "image/svg+xml", then - * the sniffed type of the resource is its official type (an - * XML type) - * - * The XML case is handled by the if above; if you refactor - * this code, keep this in mind. + /* 5. Distinguish feed from HTML. */ + if (!g_ascii_strcasecmp (content_type, "text/html")) + return sniff_feed_or_html (sniffer, buffer); + + /* 6. Image types. */ - if (!g_ascii_strncasecmp (content_type, "image/", 6)) - return sniff_images (sniffer, buffer, content_type); + if (!g_ascii_strncasecmp (content_type, "image/", 6)) { + sniffed_type = sniff_images (sniffer, buffer); + if (sniffed_type != NULL) + return sniffed_type; + return g_strdup (content_type); + } + + /* 7. Audio and video types. */ + if (!g_ascii_strncasecmp (content_type, "audio/", 6) || + !g_ascii_strncasecmp (content_type, "video/", 6) || + !g_ascii_strcasecmp (content_type, "application/ogg")) { + sniffed_type = sniff_audio_video (sniffer, buffer); + if (sniffed_type != NULL) + return sniffed_type; + return g_strdup (content_type); + } /* If we got text/plain, use text_or_binary */ if (g_str_equal (content_type, "text/plain")) { return sniff_text_or_binary (sniffer, buffer); } - if (!g_ascii_strcasecmp (content_type, "text/html")) - return sniff_feed_or_html (sniffer, buffer); - return g_strdup (content_type); } diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf new file mode 100644 index 00000000..f3d9e276 --- /dev/null +++ b/tests/resources/feed.rdf @@ -0,0 +1,32 @@ +<?xml version="1.0"?> + +<!-- RDF Site Summary (RSS) 1.0 + http://groups.yahoo.com/group/rss-dev/files/specification.html + Section 5.3 + --> + +<rdf:RDF + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns="http://purl.org/rss/1.0/"> + + <channel rdf:about="http://www.xml.com/xml/news.rss"> + <title>XML.com</title> + <link>http://xml.com/pub</link> + <description> + XML.com features a rich mix of information and services + for the XML community. + </description> + + <image rdf:resource="http://xml.com/universal/images/xml_tiny.gif" /> + + <items> + <rdf:Seq> + <rdf:li resource="http://xml.com/pub/2000/08/09/xslt/xslt.html" /> + <rdf:li resource="http://xml.com/pub/2000/08/09/rdfdb/index.html" /> + </rdf:Seq> + </items> + + <textinput rdf:resource="http://search.xml.com" /> + </channel> + +</rdf:RDF> diff --git a/tests/resources/home.jpg b/tests/resources/home.jpg Binary files differnew file mode 100644 index 00000000..ac1f3bbc --- /dev/null +++ b/tests/resources/home.jpg diff --git a/tests/resources/home.png b/tests/resources/home.png Binary files differnew file mode 100644 index 00000000..0bb82bac --- /dev/null +++ b/tests/resources/home.png diff --git a/tests/resources/html_binary.html b/tests/resources/html_binary.html index 9200dd42..d443048c 100644 --- a/tests/resources/html_binary.html +++ b/tests/resources/html_binary.html @@ -1 +1 @@ -<HTML +<HTML diff --git a/tests/resources/leading_space.html b/tests/resources/leading_space.html new file mode 100644 index 00000000..a640d653 --- /dev/null +++ b/tests/resources/leading_space.html @@ -0,0 +1,12 @@ + + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> +<title></title> +</head> +<body> +<h1>GNOME!</h1> +</body> +</html> diff --git a/tests/resources/test.aiff b/tests/resources/test.aiff Binary files differnew file mode 100644 index 00000000..9a1ecbb2 --- /dev/null +++ b/tests/resources/test.aiff diff --git a/tests/resources/test.mp4 b/tests/resources/test.mp4 Binary files differnew file mode 100644 index 00000000..d278c8ad --- /dev/null +++ b/tests/resources/test.mp4 diff --git a/tests/resources/test.ogg b/tests/resources/test.ogg Binary files differnew file mode 100644 index 00000000..e8f49ac3 --- /dev/null +++ b/tests/resources/test.ogg diff --git a/tests/resources/test.wav b/tests/resources/test.wav Binary files differnew file mode 100644 index 00000000..11660b29 --- /dev/null +++ b/tests/resources/test.wav diff --git a/tests/resources/test.webm b/tests/resources/test.webm Binary files differnew file mode 100644 index 00000000..7e53d0b4 --- /dev/null +++ b/tests/resources/test.webm diff --git a/tests/resources/text.txt b/tests/resources/text.txt new file mode 100644 index 00000000..ff7066f6 --- /dev/null +++ b/tests/resources/text.txt @@ -0,0 +1 @@ +This is just text. diff --git a/tests/resources/tux.webp b/tests/resources/tux.webp Binary files differnew file mode 100644 index 00000000..8764f066 --- /dev/null +++ b/tests/resources/tux.webp diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index 532e6ed5..5b0e6ee7 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -49,7 +49,21 @@ server_callback (SoupServer *server, SoupMessage *msg, "Content-Type", "text/plain"); } - if (g_str_has_prefix (path, "/text_or_binary/")) { + if (g_str_has_prefix (path, "/nosniff/")) { + char *base_name = g_path_get_basename (path); + + response = soup_test_load_resource (base_name, &error); + g_assert_no_error (error); + g_free (base_name); + + soup_message_headers_append (msg->response_headers, + "X-Content-Type-Options", "nosniff"); + + soup_message_headers_append (msg->response_headers, + "Content-Type", "no/sniffing-allowed"); + } + + if (g_str_has_prefix (path, "/text_or_binary/") || g_str_has_prefix (path, "/apache_bug/")) { char *base_name = g_path_get_basename (path); response = soup_test_load_resource (base_name, &error); @@ -442,6 +456,20 @@ main (int argc, char **argv) GINT_TO_POINTER (TRUE), do_signals_tests); + /* Test the apache bug sniffing path */ + g_test_add_data_func ("/sniffing/apache-bug/binary", + "/apache_bug/text_binary.txt => application/octet-stream", + do_sniffing_test); + g_test_add_data_func ("/sniffing/apache-bug/text", + "/apache_bug/text.txt => text/plain", + do_sniffing_test); + + /* X-Content-Type-Options: nosniff */ + g_test_add_data_func ("/sniffing/nosniff", + "nosniff/home.gif => no/sniffing-allowed", + do_sniffing_test); + + /* GIF is a 'safe' type */ g_test_add_data_func ("/sniffing/type/gif", "text_or_binary/home.gif => image/gif", do_sniffing_test); @@ -496,6 +524,9 @@ main (int argc, char **argv) g_test_add_data_func ("/sniffing/type/unknown-binary", "unknown/text_binary.txt => application/octet-stream", do_sniffing_test); + g_test_add_data_func ("/sniffing/type/unknown-leading-space", + "unknown/leading_space.html => text/html", + do_sniffing_test); /* Test the XML sniffing path */ g_test_add_data_func ("/sniffing/type/xml", @@ -508,11 +539,6 @@ main (int argc, char **argv) "type/application_xml/home.gif => application/xml", do_sniffing_test); - /* Test the image sniffing path */ - g_test_add_data_func ("/sniffing/type/image", - "type/image_png/home.gif => image/gif", - do_sniffing_test); - /* Test the feed or html path */ g_test_add_data_func ("/sniffing/type/html/html", "type/text_html/test.html => text/html", @@ -523,6 +549,42 @@ main (int argc, char **argv) g_test_add_data_func ("/sniffing/type/html/atom", "type/text_html/atom.xml => application/atom+xml", do_sniffing_test); + g_test_add_data_func ("/sniffing/type/html/rdf", + "type/text_html/feed.rdf => application/rss+xml", + do_sniffing_test); + + /* Test the image sniffing path */ + g_test_add_data_func ("/sniffing/type/image/gif", + "type/image_png/home.gif => image/gif", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/image/png", + "type/image_gif/home.png => image/png", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/image/jpeg", + "type/image_png/home.jpg => image/jpeg", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/image/webp", + "type/image_png/tux.webp => image/webp", + do_sniffing_test); + + /* Test audio and video sniffing path */ + g_test_add_data_func ("/sniffing/type/audio/wav", + "type/audio_mpeg/test.wav => audio/wave", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/audio/aiff", + "type/audio_mpeg/test.aiff => audio/aiff", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/audio/ogg", + "type/audio_mpeg/test.ogg => application/ogg", + do_sniffing_test); + g_test_add_data_func ("/sniffing/type/video/webm", + "type/video_theora/test.webm => video/webm", + do_sniffing_test); + + /* Test the MP4 sniffing path */ + g_test_add_data_func ("/sniffing/type/video/mp4", + "unknown/test.mp4 => video/mp4", + do_sniffing_test); /* The spec tells us to only use the last Content-Type header */ g_test_add_data_func ("/sniffing/multiple-headers", diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index 9b580a35..b24a7297 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -3,15 +3,26 @@ <gresource prefix="/org/gnome/libsoup/tests"> <file>index.txt</file> <file>resources/atom.xml</file> + <file>resources/feed.rdf</file> <file>resources/home.gif</file> + <file>resources/home.jpg</file> + <file>resources/home.png</file> <file>resources/html_binary.html</file> + <file>resources/leading_space.html</file> <file>resources/mbox</file> <file>resources/mbox.gz</file> <file>resources/mbox.raw</file> <file>resources/mbox.zlib</file> <file>resources/ps_binary.ps</file> <file>resources/rss20.xml</file> + <file>resources/test.aiff</file> <file>resources/test.html</file> + <file>resources/test.mp4</file> + <file>resources/test.ogg</file> + <file>resources/test.wav</file> + <file>resources/test.webm</file> + <file>resources/text.txt</file> <file>resources/text_binary.txt</file> + <file>resources/tux.webp</file> </gresource> </gresources> |