summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDan Winship <danw@gnome.org>2014-02-17 12:28:28 -0500
committerDan Winship <danw@gnome.org>2014-02-17 12:28:28 -0500
commita70ea67ade04ffc8048c93308d88ae70028dcf38 (patch)
tree3ccaf69955349beed2c0c3f671d0deb552bb58c4
parent3c4cea7bcc7dd44d61bf1d87a4ec8dce99a875cf (diff)
parent9530c7b58197d7d45e21aa60a9986735ccf515b1 (diff)
downloadlibsoup-a70ea67ade04ffc8048c93308d88ae70028dcf38.tar.gz
Merge branch 'content-sniffing-update'
Update SoupContentSniffer (and sniffing-test) to match the current version of the MIME sniffing spec. https://bugzilla.gnome.org/show_bug.cgi?id=648849 https://bugzilla.gnome.org/show_bug.cgi?id=715126
-rw-r--r--libsoup/soup-content-sniffer.c632
-rw-r--r--tests/resources/feed.rdf32
-rw-r--r--tests/resources/home.jpgbin0 -> 1074 bytes
-rw-r--r--tests/resources/home.pngbin0 -> 313 bytes
-rw-r--r--tests/resources/html_binary.html2
-rw-r--r--tests/resources/leading_space.html12
-rw-r--r--tests/resources/test.aiffbin0 -> 384088 bytes
-rw-r--r--tests/resources/test.mp4bin0 -> 192844 bytes
-rw-r--r--tests/resources/test.oggbin0 -> 16994 bytes
-rw-r--r--tests/resources/test.wavbin0 -> 384080 bytes
-rw-r--r--tests/resources/test.webmbin0 -> 149879 bytes
-rw-r--r--tests/resources/text.txt1
-rw-r--r--tests/resources/tux.webpbin0 -> 17128 bytes
-rw-r--r--tests/sniffing-test.c74
-rw-r--r--tests/soup-tests.gresource.xml11
15 files changed, 605 insertions, 159 deletions
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
index fb2aa090..5659af9a 100644
--- a/libsoup/soup-content-sniffer.c
+++ b/libsoup/soup-content-sniffer.c
@@ -2,7 +2,11 @@
/*
* soup-content-sniffer.c
*
- * Copyright (C) 2009 Gustavo Noronha Silva.
+ * Copyright (C) 2009, 2013 Gustavo Noronha Silva.
+ *
+ * This code implements the following specification:
+ *
+ * http://mimesniff.spec.whatwg.org/ as of 11 June 2013
*/
#ifdef HAVE_CONFIG_H
@@ -73,12 +77,212 @@ soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
{
}
-/* This table is based on the HTML5 spec;
- * See 2.7.4 Content-Type sniffing: unknown type
+/* One row of a media signature table, as consumed by sniff_media():
+ * a resource matches the row when each of its first @pattern_length
+ * bytes, ANDed with the corresponding @mask byte, equals the
+ * corresponding @pattern byte.
+ */
+typedef struct {
+ /* Per-byte mask ANDed with the resource before comparing. */
+ const guchar *mask;
+ /* Expected bytes after masking. */
+ const guchar *pattern;
+ /* Number of bytes of @mask and @pattern that are compared. */
+ guint pattern_length;
+ /* MIME type reported when the row matches. */
+ const char *sniffed_type;
+} SoupContentSnifferMediaPattern;
+
+/* Matches the start of @buffer against each row of @table, in order.
+ *
+ * Only the first 512 bytes of @buffer (or fewer, if it is shorter) are
+ * considered. Returns a g_strdup()ed copy of the sniffed type of the
+ * first row that matches, or NULL if no row matches. @sniffer is not
+ * used by this function.
+ */
+static char*
+sniff_media (SoupContentSniffer *sniffer,
+ SoupBuffer *buffer,
+ SoupContentSnifferMediaPattern table[],
+ int table_length)
+{
+ const guchar *resource = (const guchar *)buffer->data;
+ int resource_length = MIN (512, buffer->length);
+ int i;
+
+ for (i = 0; i < table_length; i++) {
+ SoupContentSnifferMediaPattern *type_row = &(table[i]);
+ int j;
+
+ /* Not enough data to hold this signature; try the next row. */
+ if (resource_length < type_row->pattern_length)
+ continue;
+
+ /* Masked byte-by-byte comparison; stops at the first mismatch. */
+ for (j = 0; j < type_row->pattern_length; j++) {
+ if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
+ break;
+ }
+
+ /* This means our comparison above matched completely */
+ if (j == type_row->pattern_length)
+ return g_strdup (type_row->sniffed_type);
+ }
+
+ return NULL;
+}
+
+/* This table is based on the MIMESNIFF spec;
+ * See 6.1 Matching an image type pattern
+ *
+ * Rows are tried in order by sniff_media() and the first match wins.
+ * A \x00 mask byte makes the corresponding resource byte irrelevant
+ * (the pattern byte at that position must then be \x00 as well).
+ *
+ * Take care when a \x escape is followed by literal text: a hex escape
+ * extends over every following hex digit. The literals below are safe
+ * because the character after each escape ('W', 'P', ...) is not a hex
+ * digit.
+ */
+static SoupContentSnifferMediaPattern image_types_table[] = {
+
+ /* Windows icon signature. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF",
+ (const guchar *)"\x00\x00\x01\x00",
+ 4,
+ "image/x-icon" },
+
+ /* Windows cursor signature. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF",
+ (const guchar *)"\x00\x00\x02\x00",
+ 4,
+ "image/x-icon" },
+
+ /* BMP. */
+ { (const guchar *)"\xFF\xFF",
+ (const guchar *)"BM",
+ 2,
+ "image/bmp" },
+
+ /* GIFs. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
+ (const guchar *)"GIF87a",
+ 6,
+ "image/gif" },
+
+ { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
+ (const guchar *)"GIF89a",
+ 6,
+ "image/gif" },
+
+ /* WEBP. The four bytes after "RIFF" are masked out and not compared. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF",
+ (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP",
+ 14,
+ "image/webp" },
+
+ /* PNG. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+ (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A",
+ 8,
+ "image/png" },
+
+ /* JPEG. */
+ { (const guchar *)"\xFF\xFF\xFF",
+ (const guchar *)"\xFF\xD8\xFF",
+ 3,
+ "image/jpeg" },
+};
+
+/* Sniffs @buffer against image_types_table via sniff_media().
+ * Returns a newly-allocated MIME type string, or NULL when no image
+ * signature matches.
+ */
+static char*
+sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer)
+{
+ return sniff_media (sniffer,
+ buffer,
+ image_types_table,
+ G_N_ELEMENTS (image_types_table));
+}
+
+/* This table is based on the MIMESNIFF spec;
+ * See 6.2 Matching an audio or video type pattern
+ *
+ * Rows are tried in order by sniff_media() and the first match wins.
+ * A \x00 mask byte makes the corresponding resource byte irrelevant.
+ *
+ * Beware of hex escapes followed by literal text: a \x escape extends
+ * over every following hex digit, so "\x00AVI " would be parsed as the
+ * single byte \x0A followed by "VI ". Such literals must be split into
+ * adjacent string literals, as done for the AVI row below.
+ */
+static SoupContentSnifferMediaPattern audio_video_types_table[] = {
+ /* WebM (EBML header). */
+ { (const guchar *)"\xFF\xFF\xFF\xFF",
+ (const guchar *)"\x1A\x45\xDF\xA3",
+ 4,
+ "video/webm" },
+
+ /* Sun/NeXT audio. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF",
+ (const guchar *)".snd",
+ 4,
+ "audio/basic" },
+
+
+ /* AIFF; the four bytes after "FORM" are not compared. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
+ (const guchar *)"FORM\0\0\0\0AIFF",
+ 12,
+ "audio/aiff" },
+
+ /* MP3 with an ID3v2 tag. */
+ { (const guchar *)"\xFF\xFF\xFF",
+ (const guchar *)"ID3",
+ 3,
+ "audio/mpeg" },
+
+ /* Ogg. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF\xFF",
+ (const guchar *)"OggS\0",
+ 5,
+ "application/ogg" },
+
+ /* MIDI. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+ (const guchar *)"MThd\x00\x00\x00\x06",
+ 8,
+ "audio/midi" },
+
+ /* AVI; the four bytes after "RIFF" are not compared. The literal is
+ * split so that the 'A' of "AVI " is not swallowed by the preceding
+ * \x00 escape.
+ */
+ { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
+ (const guchar *)"RIFF\x00\x00\x00\x00" "AVI ",
+ 12,
+ "video/avi" },
+
+ /* WAVE; the four bytes after "RIFF" are not compared. */
+ { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF",
+ (const guchar *)"RIFF\x00\x00\x00\x00WAVE",
+ 12,
+ "audio/wave" },
+};
+
+/* Checks whether @buffer starts with an MP4 "ftyp" box.
+ *
+ * Only the first 512 bytes of @buffer are considered. The first four
+ * bytes are read as a big-endian 32-bit box size; the box must fit in
+ * the examined data and its size must be a multiple of 4. The function
+ * returns TRUE when bytes 4-7 are "ftyp", the major brand at offset 8
+ * starts with "mp4", and one of the 4-byte entries from offset 16 up to
+ * the end of the box starts with "mp4". @sniffer is not used.
+ *
+ * NOTE(review): this requires both the major brand and one later brand
+ * to start with "mp4"; confirm against the MIMESNIFF "matching the
+ * signature for MP4" algorithm whether either one alone should suffice.
+ */
+static gboolean
+sniff_mp4 (SoupContentSniffer *sniffer, SoupBuffer *buffer)
+{
+	const guchar *resource = (const guchar *)buffer->data;
+	guint resource_length = MIN (512, buffer->length);
+	guint32 box_size;
+	guint i;
+
+	/* Nothing below may be read until we know the box size, the
+	 * "ftyp" tag and the major brand are actually present.
+	 */
+	if (resource_length < 12)
+		return FALSE;
+
+	/* Assemble the big-endian size byte by byte: this is independent
+	 * of host endianness and does not require @resource to be
+	 * suitably aligned for a 32-bit load.
+	 */
+	box_size = ((guint32)resource[0] << 24) |
+		   ((guint32)resource[1] << 16) |
+		   ((guint32)resource[2] << 8) |
+		   (guint32)resource[3];
+
+	if (resource_length < box_size || box_size % 4 != 0)
+		return FALSE;
+
+	/* The data is not NUL-terminated, so compare with explicit
+	 * lengths rather than with string functions.
+	 */
+	if (memcmp (resource + 4, "ftyp", 4) != 0)
+		return FALSE;
+
+	if (memcmp (resource + 8, "mp4", 3) != 0)
+		return FALSE;
+
+	for (i = 16; i < box_size && i + 3 <= resource_length; i += 4) {
+		if (memcmp (resource + i, "mp4", 3) == 0)
+			return TRUE;
+	}
+
+	return FALSE;
+}
+
+/* Sniffs @buffer for an audio or video type: first against
+ * audio_video_types_table via sniff_media(), then with sniff_mp4().
+ * Returns a newly-allocated MIME type string, or NULL when nothing
+ * matches.
+ */
+static char*
+sniff_audio_video (SoupContentSniffer *sniffer, SoupBuffer *buffer)
+{
+ char *sniffed_type;
+
+ sniffed_type = sniff_media (sniffer,
+ buffer,
+ audio_video_types_table,
+ G_N_ELEMENTS (audio_video_types_table));
+
+ if (sniffed_type != NULL)
+ return sniffed_type;
+
+ /* MP4 needs structural checks that a mask/pattern row cannot express. */
+ if (sniff_mp4 (sniffer, buffer))
+ return g_strdup ("video/mp4");
+
+ return NULL;
+}
+
+/* This table is based on the MIMESNIFF spec;
+ * See 7.1 Identifying a resource with an unknown MIME type
*/
typedef struct {
/* @has_ws is TRUE if @pattern contains "generic" whitespace */
gboolean has_ws;
+ /* @has_tag_termination is TRUE if we should check for a tag-terminating
+ * byte (0x20 " " or 0x3E ">") after the pattern match.
+ */
+ gboolean has_tag_termination;
const guchar *mask;
const guchar *pattern;
guint pattern_length;
@@ -86,111 +290,174 @@ typedef struct {
gboolean scriptable;
} SoupContentSnifferPattern;
+
+/* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space
+ * is allowed. Those spaces are marked with \x00 on the mask.
+ */
static SoupContentSnifferPattern types_table[] = {
- { FALSE,
- (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
- (const guchar *)"\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
+ /* Scriptable types. */
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <!DOCTYPE HTML",
14,
"text/html",
TRUE },
- { TRUE,
- (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
- (const guchar *)" \x3C\x48\x54\x4D\x4C",
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <HTML",
5,
"text/html",
TRUE },
- { TRUE,
- (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF",
- (const guchar *)" \x3C\x48\x45\x41\x44",
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <HEAD",
5,
"text/html",
TRUE },
- { TRUE,
- (const guchar *)"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
- (const guchar *)" \x3C\x53\x43\x52\x49\x50\x54",
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <SCRIPT",
7,
"text/html",
TRUE },
- { FALSE,
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <IFRAME",
+ 7,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xFF",
+ (const guchar *)" <H1",
+ 3,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF",
+ (const guchar *)" <DIV",
+ 4,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <FONT",
+ 5,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <TABLE",
+ 6,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF",
+ (const guchar *)" <A",
+ 2,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <STYLE",
+ 6,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <TITLE",
+ 6,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF",
+ (const guchar *)" <B",
+ 2,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF\xDF\xDF",
+ (const guchar *)" <BODY",
+ 5,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF\xDF",
+ (const guchar *)" <BR",
+ 3,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xDF",
+ (const guchar *)" <P",
+ 2,
+ "text/html",
+ TRUE },
+
+ { TRUE, TRUE,
+ (const guchar *)"\x00\xFF\xFF\xFF\xFF",
+ (const guchar *)" <!--",
+ 4,
+ "text/html",
+ TRUE },
+
+ { TRUE, FALSE,
+ (const guchar *)"\x00\xFF\xFF\xFF\xFF\xFF",
+ (const guchar *)" <?xml",
+ 5,
+ "text/html",
+ TRUE },
+
+ { FALSE, FALSE,
(const guchar *)"\xFF\xFF\xFF\xFF\xFF",
- (const guchar *)"\x25\x50\x44\x46\x2D",
+ (const guchar *)"%PDF-",
5,
"application/pdf",
TRUE },
- { FALSE,
+ /* Non-scriptable types. */
+ { FALSE, FALSE,
(const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
- (const guchar *)"\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
+ (const guchar *)"%!PS-Adobe-",
11,
"application/postscript",
FALSE },
- { FALSE,
+ { FALSE, FALSE, /* UTF-16BE BOM */
(const guchar *)"\xFF\xFF\x00\x00",
(const guchar *)"\xFE\xFF\x00\x00",
4,
"text/plain",
FALSE },
- { FALSE,
- (const guchar *)"\xFF\xFF\x00\x00",
+ { FALSE, FALSE, /* UTF-16LE BOM */
(const guchar *)"\xFF\xFF\x00\x00",
+ (const guchar *)"\xFF\xFE\x00\x00",
4,
"text/plain",
FALSE },
- { FALSE,
+ { FALSE, FALSE, /* UTF-8 BOM */
(const guchar *)"\xFF\xFF\xFF\x00",
(const guchar *)"\xEF\xBB\xBF\x00",
4,
"text/plain",
FALSE },
-
- { FALSE,
- (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
- (const guchar *)"\x47\x49\x46\x38\x37\x61",
- 6,
- "image/gif",
- FALSE },
-
- { FALSE,
- (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF",
- (const guchar *)"\x47\x49\x46\x38\x39\x61",
- 6,
- "image/gif",
- FALSE },
-
- { FALSE,
- (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
- (const guchar *)"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
- 8,
- "image/png",
- FALSE },
-
- { FALSE,
- (const guchar *)"\xFF\xFF\xFF",
- (const guchar *)"\xFF\xD8\xFF",
- 3,
- "image/jpeg",
- FALSE },
-
- { FALSE,
- (const guchar *)"\xFF\xFF",
- (const guchar *)"\x42\x4D",
- 2,
- "image/bmp",
- FALSE },
-
- { FALSE,
- (const guchar *)"\xFF\xFF\xFF\xFF",
- (const guchar *)"\x00\x00\x01\x00",
- 4,
- "image/vnd.microsoft.icon",
- FALSE }
};
/* Whether a given byte looks like it might be part of binary content.
@@ -219,8 +486,9 @@ static char byte_looks_binary[] = {
/* HTML5: 2.7.4 Content-Type sniffing: unknown type */
static char*
sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
- gboolean for_text_or_binary)
+ gboolean sniff_scriptable)
{
+ char *sniffed_type = NULL;
const guchar *resource = (const guchar *)buffer->data;
int resource_length = MIN (512, buffer->length);
int i;
@@ -228,9 +496,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
SoupContentSnifferPattern *type_row = &(types_table[i]);
- /* The scriptable types should be skiped for the text
- * or binary path, but considered for other paths */
- if (for_text_or_binary && type_row->scriptable)
+ if (!sniff_scriptable && type_row->scriptable)
continue;
if (type_row->has_ws) {
@@ -263,8 +529,14 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
if (skip_row)
continue;
- if (index_pattern > type_row->pattern_length)
+ if (index_pattern > type_row->pattern_length) {
+ if (type_row->has_tag_termination &&
+ resource[index_stream] != '\x20' &&
+ resource[index_stream] != '\x3E')
+ continue;
+
return g_strdup (type_row->sniffed_type);
+ }
} else {
int j;
@@ -282,8 +554,15 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
}
}
- if (for_text_or_binary)
- return g_strdup ("application/octet-stream");
+ sniffed_type = sniff_images (sniffer, buffer);
+
+ if (sniffed_type != NULL)
+ return sniffed_type;
+
+ sniffed_type = sniff_audio_video (sniffer, buffer);
+
+ if (sniffed_type != NULL)
+ return sniffed_type;
for (i = 0; i < resource_length; i++) {
if (byte_looks_binary[resource[i]])
@@ -293,7 +572,7 @@ sniff_unknown (SoupContentSniffer *sniffer, SoupBuffer *buffer,
return g_strdup ("text/plain");
}
-/* HTML5: 2.7.3 Content-Type sniffing: text or binary */
+/* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */
static char*
sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
{
@@ -302,15 +581,20 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
gboolean looks_binary = FALSE;
int i;
- /* Detecting UTF-16BE, UTF-16LE, or UTF-8 BOMs means it's text/plain */
- if (resource_length >= 4) {
+ /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */
+ if (resource_length >= 2) {
if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
- (resource[0] == 0xFF && resource[1] == 0xFE) ||
- (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
+ (resource[0] == 0xFF && resource[1] == 0xFE))
+ return g_strdup ("text/plain");
+ }
+
+ /* 3. UTF-8 BOM. */
+ if (resource_length >= 3) {
+ if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF)
return g_strdup ("text/plain");
}
- /* Look to see if any of the first n bytes looks binary */
+ /* 4. Look to see if any of the first n bytes looks binary */
for (i = 0; i < resource_length; i++) {
if (byte_looks_binary[resource[i]]) {
looks_binary = TRUE;
@@ -321,40 +605,32 @@ sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer)
if (!looks_binary)
return g_strdup ("text/plain");
+ /* 5. Execute 7.1 Identifying a resource with an unknown MIME type.
+ * TODO: sniff-scriptable needs to be unset.
+ */
return sniff_unknown (sniffer, buffer, TRUE);
}
-static char*
-sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer,
- const char *content_type)
+/* Advances *@pos past any run of tab, space, LF or CR bytes in
+ * @resource, never looking at or beyond index @resource_length.
+ *
+ * Returns TRUE when the end of the examined data was reached (there is
+ * no byte left at *@pos to inspect), FALSE when *@pos now indexes a
+ * non-whitespace byte inside the data.
+ */
+static gboolean
+skip_insignificant_space (const char *resource, int *pos, int resource_length)
{
- const guchar *resource = (const guchar *)buffer->data;
- int resource_length = MIN (512, buffer->length);
- int i;
-
- for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
- SoupContentSnifferPattern *type_row = &(types_table[i]);
-
- if (resource_length < type_row->pattern_length)
- continue;
-
- if (!g_str_has_prefix (type_row->sniffed_type, "image/"))
- continue;
-
- /* All of the image types use all-\xFF for the mask,
- * so we can just memcmp.
- */
- if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0)
- return g_strdup (type_row->sniffed_type);
+	/* Check the bound before every read so that resource[*pos] is
+	 * never evaluated at or past resource_length.
+	 */
+	while (*pos < resource_length &&
+	       ((resource[*pos] == '\x09') ||
+		(resource[*pos] == '\x20') ||
+		(resource[*pos] == '\x0A') ||
+		(resource[*pos] == '\x0D'))) {
+		*pos = *pos + 1;
}
- return g_strdup (content_type);
+
+	return *pos >= resource_length;
}
static char*
sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
{
- const guchar *resource = (const guchar *)buffer->data;
+ const char *resource = (const char *)buffer->data;
int resource_length = MIN (512, buffer->length);
int pos = 0;
@@ -369,19 +645,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
if (pos > resource_length)
goto text_html;
- /* Skip insignificant white space */
- while ((resource[pos] == '\x09') ||
- (resource[pos] == '\x20') ||
- (resource[pos] == '\x0A') ||
- (resource[pos] == '\x0D')) {
- pos++;
-
- if (pos > resource_length)
- goto text_html;
- }
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
- /* != < */
- if (resource[pos] != '\x3C')
+ if (resource[pos] != '<')
return g_strdup ("text/html");
pos++;
@@ -389,73 +656,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer)
if ((pos + 2) > resource_length)
goto text_html;
- /* Skipping comments */
- if ((resource[pos] == '\x2D') ||
- (resource[pos+1] == '\x2D') ||
- (resource[pos+2] == '\x3E')) {
+ /* Skip comments. */
+ if (g_str_has_prefix (resource + pos, "!--")) {
pos = pos + 3;
if ((pos + 2) > resource_length)
goto text_html;
- while ((resource[pos] != '\x2D') &&
- (resource[pos+1] != '\x2D') &&
- (resource[pos+2] != '\x3E')) {
+ while (!g_str_has_prefix (resource + pos, "-->")) {
pos++;
if ((pos + 2) > resource_length)
goto text_html;
}
+ pos = pos + 3;
+
goto look_for_tag;
}
if (pos > resource_length)
goto text_html;
- /* == ! */
- if (resource[pos] == '\x21') {
+ if (resource[pos] == '!') {
do {
pos++;
if (pos > resource_length)
goto text_html;
- } while (resource[pos] != '\x3E');
+ } while (resource[pos] != '>');
pos++;
goto look_for_tag;
- } else if (resource[pos] == '\x3F') { /* ? */
+ } else if (resource[pos] == '?') {
do {
pos++;
if ((pos + 1) > resource_length)
goto text_html;
- } while ((resource[pos] != '\x3F') &&
- (resource[pos+1] != '\x3E'));
+ } while (!g_str_has_prefix (resource + pos, "?>"));
pos = pos + 2;
goto look_for_tag;
}
- if ((pos + 2) > resource_length)
+ if ((pos + 3) > resource_length)
goto text_html;
- if ((resource[pos] == '\x72') &&
- (resource[pos+1] == '\x73') &&
- (resource[pos+2] == '\x73'))
+ if (g_str_has_prefix (resource + pos, "rss"))
return g_strdup ("application/rss+xml");
- if ((pos + 3) > resource_length)
+ if ((pos + 4) > resource_length)
goto text_html;
- if ((resource[pos] == '\x66') &&
- (resource[pos+1] == '\x65') &&
- (resource[pos+2] == '\x65') &&
- (resource[pos+3] == '\x64'))
+ if (g_str_has_prefix (resource + pos, "feed"))
return g_strdup ("application/atom+xml");
+ if ((pos + 7) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "rdf:RDF")) {
+ pos = pos + 7;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 32) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) {
+ pos = pos + 32;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 55) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""))
+ return g_strdup ("application/rss+xml");
+ }
+
+ if ((pos + 55) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) {
+ pos = pos + 55;
+
+ if (skip_insignificant_space (resource, &pos, resource_length))
+ goto text_html;
+
+ if ((pos + 32) > resource_length)
+ goto text_html;
+
+ if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\""))
+ return g_strdup ("application/rss+xml");
+ }
+ }
+
text_html:
return g_strdup ("text/html");
}
@@ -465,43 +765,71 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg,
SoupBuffer *buffer, GHashTable **params)
{
const char *content_type;
+ const char *x_content_type_options;
+ char *sniffed_type = NULL;
+ gboolean no_sniff = FALSE;
content_type = soup_message_headers_get_content_type (msg->response_headers, params);
- /* These comparisons are done in an ASCII-case-insensitive
- * manner because the spec requires it */
+ /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */
+
+ x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options");
+ if (!g_strcmp0 (x_content_type_options, "nosniff"))
+ no_sniff = TRUE;
+
+ /* 1. Unknown/undefined supplied type with sniff-scriptable = !nosniff. */
if ((content_type == NULL) ||
!g_ascii_strcasecmp (content_type, "unknown/unknown") ||
!g_ascii_strcasecmp (content_type, "application/unknown") ||
!g_ascii_strcasecmp (content_type, "*/*"))
- return sniff_unknown (sniffer, buffer, FALSE);
+ return sniff_unknown (sniffer, buffer, !no_sniff);
+
+ /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */
+ if (no_sniff)
+ return g_strdup (content_type);
+
+ /* 3. check-for-apache-bug */
+ if ((content_type != NULL) &&
+ (g_str_equal (content_type, "text/plain") ||
+ g_str_equal (content_type, "text/plain; charset=ISO-8859-1") ||
+ g_str_equal (content_type, "text/plain; charset=iso-8859-1") ||
+ g_str_equal (content_type, "text/plain; charset=UTF-8")))
+ return sniff_text_or_binary (sniffer, buffer);
+ /* 4. XML types sent by the server are always used. */
if (g_str_has_suffix (content_type, "+xml") ||
!g_ascii_strcasecmp (content_type, "text/xml") ||
!g_ascii_strcasecmp (content_type, "application/xml"))
return g_strdup (content_type);
- /* 2.7.5 Content-Type sniffing: image
- * The spec says:
- *
- * If the resource's official type is "image/svg+xml", then
- * the sniffed type of the resource is its official type (an
- * XML type)
- *
- * The XML case is handled by the if above; if you refactor
- * this code, keep this in mind.
+ /* 5. Distinguish feed from HTML. */
+ if (!g_ascii_strcasecmp (content_type, "text/html"))
+ return sniff_feed_or_html (sniffer, buffer);
+
+ /* 6. Image types.
*/
- if (!g_ascii_strncasecmp (content_type, "image/", 6))
- return sniff_images (sniffer, buffer, content_type);
+ if (!g_ascii_strncasecmp (content_type, "image/", 6)) {
+ sniffed_type = sniff_images (sniffer, buffer);
+ if (sniffed_type != NULL)
+ return sniffed_type;
+ return g_strdup (content_type);
+ }
+
+ /* 7. Audio and video types. */
+ if (!g_ascii_strncasecmp (content_type, "audio/", 6) ||
+ !g_ascii_strncasecmp (content_type, "video/", 6) ||
+ !g_ascii_strcasecmp (content_type, "application/ogg")) {
+ sniffed_type = sniff_audio_video (sniffer, buffer);
+ if (sniffed_type != NULL)
+ return sniffed_type;
+ return g_strdup (content_type);
+ }
/* If we got text/plain, use text_or_binary */
if (g_str_equal (content_type, "text/plain")) {
return sniff_text_or_binary (sniffer, buffer);
}
- if (!g_ascii_strcasecmp (content_type, "text/html"))
- return sniff_feed_or_html (sniffer, buffer);
-
return g_strdup (content_type);
}
diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf
new file mode 100644
index 00000000..f3d9e276
--- /dev/null
+++ b/tests/resources/feed.rdf
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<!-- RDF Site Summary (RSS) 1.0
+ http://groups.yahoo.com/group/rss-dev/files/specification.html
+ Section 5.3
+ -->
+
+<rdf:RDF
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns="http://purl.org/rss/1.0/">
+
+ <channel rdf:about="http://www.xml.com/xml/news.rss">
+ <title>XML.com</title>
+ <link>http://xml.com/pub</link>
+ <description>
+ XML.com features a rich mix of information and services
+ for the XML community.
+ </description>
+
+ <image rdf:resource="http://xml.com/universal/images/xml_tiny.gif" />
+
+ <items>
+ <rdf:Seq>
+ <rdf:li resource="http://xml.com/pub/2000/08/09/xslt/xslt.html" />
+ <rdf:li resource="http://xml.com/pub/2000/08/09/rdfdb/index.html" />
+ </rdf:Seq>
+ </items>
+
+ <textinput rdf:resource="http://search.xml.com" />
+ </channel>
+
+</rdf:RDF>
diff --git a/tests/resources/home.jpg b/tests/resources/home.jpg
new file mode 100644
index 00000000..ac1f3bbc
--- /dev/null
+++ b/tests/resources/home.jpg
Binary files differ
diff --git a/tests/resources/home.png b/tests/resources/home.png
new file mode 100644
index 00000000..0bb82bac
--- /dev/null
+++ b/tests/resources/home.png
Binary files differ
diff --git a/tests/resources/html_binary.html b/tests/resources/html_binary.html
index 9200dd42..d443048c 100644
--- a/tests/resources/html_binary.html
+++ b/tests/resources/html_binary.html
@@ -1 +1 @@
-<HTML 
+<HTML
diff --git a/tests/resources/leading_space.html b/tests/resources/leading_space.html
new file mode 100644
index 00000000..a640d653
--- /dev/null
+++ b/tests/resources/leading_space.html
@@ -0,0 +1,12 @@
+
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title></title>
+</head>
+<body>
+<h1>GNOME!</h1>
+</body>
+</html>
diff --git a/tests/resources/test.aiff b/tests/resources/test.aiff
new file mode 100644
index 00000000..9a1ecbb2
--- /dev/null
+++ b/tests/resources/test.aiff
Binary files differ
diff --git a/tests/resources/test.mp4 b/tests/resources/test.mp4
new file mode 100644
index 00000000..d278c8ad
--- /dev/null
+++ b/tests/resources/test.mp4
Binary files differ
diff --git a/tests/resources/test.ogg b/tests/resources/test.ogg
new file mode 100644
index 00000000..e8f49ac3
--- /dev/null
+++ b/tests/resources/test.ogg
Binary files differ
diff --git a/tests/resources/test.wav b/tests/resources/test.wav
new file mode 100644
index 00000000..11660b29
--- /dev/null
+++ b/tests/resources/test.wav
Binary files differ
diff --git a/tests/resources/test.webm b/tests/resources/test.webm
new file mode 100644
index 00000000..7e53d0b4
--- /dev/null
+++ b/tests/resources/test.webm
Binary files differ
diff --git a/tests/resources/text.txt b/tests/resources/text.txt
new file mode 100644
index 00000000..ff7066f6
--- /dev/null
+++ b/tests/resources/text.txt
@@ -0,0 +1 @@
+This is just text.
diff --git a/tests/resources/tux.webp b/tests/resources/tux.webp
new file mode 100644
index 00000000..8764f066
--- /dev/null
+++ b/tests/resources/tux.webp
Binary files differ
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
index 532e6ed5..5b0e6ee7 100644
--- a/tests/sniffing-test.c
+++ b/tests/sniffing-test.c
@@ -49,7 +49,21 @@ server_callback (SoupServer *server, SoupMessage *msg,
"Content-Type", "text/plain");
}
- if (g_str_has_prefix (path, "/text_or_binary/")) {
+ if (g_str_has_prefix (path, "/nosniff/")) {
+ char *base_name = g_path_get_basename (path);
+
+ response = soup_test_load_resource (base_name, &error);
+ g_assert_no_error (error);
+ g_free (base_name);
+
+ soup_message_headers_append (msg->response_headers,
+ "X-Content-Type-Options", "nosniff");
+
+ soup_message_headers_append (msg->response_headers,
+ "Content-Type", "no/sniffing-allowed");
+ }
+
+ if (g_str_has_prefix (path, "/text_or_binary/") || g_str_has_prefix (path, "/apache_bug/")) {
char *base_name = g_path_get_basename (path);
response = soup_test_load_resource (base_name, &error);
@@ -442,6 +456,20 @@ main (int argc, char **argv)
GINT_TO_POINTER (TRUE),
do_signals_tests);
+ /* Test the apache bug sniffing path */
+ g_test_add_data_func ("/sniffing/apache-bug/binary",
+ "/apache_bug/text_binary.txt => application/octet-stream",
+ do_sniffing_test);
+ g_test_add_data_func ("/sniffing/apache-bug/text",
+ "/apache_bug/text.txt => text/plain",
+ do_sniffing_test);
+
+ /* X-Content-Type-Options: nosniff */
+ g_test_add_data_func ("/sniffing/nosniff",
+ "nosniff/home.gif => no/sniffing-allowed",
+ do_sniffing_test);
+
+ /* GIF is a 'safe' type */
g_test_add_data_func ("/sniffing/type/gif",
"text_or_binary/home.gif => image/gif",
do_sniffing_test);
@@ -496,6 +524,9 @@ main (int argc, char **argv)
g_test_add_data_func ("/sniffing/type/unknown-binary",
"unknown/text_binary.txt => application/octet-stream",
do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/unknown-leading-space",
+ "unknown/leading_space.html => text/html",
+ do_sniffing_test);
/* Test the XML sniffing path */
g_test_add_data_func ("/sniffing/type/xml",
@@ -508,11 +539,6 @@ main (int argc, char **argv)
"type/application_xml/home.gif => application/xml",
do_sniffing_test);
- /* Test the image sniffing path */
- g_test_add_data_func ("/sniffing/type/image",
- "type/image_png/home.gif => image/gif",
- do_sniffing_test);
-
/* Test the feed or html path */
g_test_add_data_func ("/sniffing/type/html/html",
"type/text_html/test.html => text/html",
@@ -523,6 +549,42 @@ main (int argc, char **argv)
g_test_add_data_func ("/sniffing/type/html/atom",
"type/text_html/atom.xml => application/atom+xml",
do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/html/rdf",
+ "type/text_html/feed.rdf => application/rss+xml",
+ do_sniffing_test);
+
+ /* Test the image sniffing path */
+ g_test_add_data_func ("/sniffing/type/image/gif",
+ "type/image_png/home.gif => image/gif",
+ do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/image/png",
+ "type/image_gif/home.png => image/png",
+ do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/image/jpeg",
+ "type/image_png/home.jpg => image/jpeg",
+ do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/image/webp",
+ "type/image_png/tux.webp => image/webp",
+ do_sniffing_test);
+
+ /* Test audio and video sniffing path */
+ g_test_add_data_func ("/sniffing/type/audio/wav",
+ "type/audio_mpeg/test.wav => audio/wave",
+ do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/audio/aiff",
+ "type/audio_mpeg/test.aiff => audio/aiff",
+ do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/audio/ogg",
+ "type/audio_mpeg/test.ogg => application/ogg",
+ do_sniffing_test);
+ g_test_add_data_func ("/sniffing/type/video/webm",
+ "type/video_theora/test.webm => video/webm",
+ do_sniffing_test);
+
+ /* Test the MP4 sniffing path */
+ g_test_add_data_func ("/sniffing/type/video/mp4",
+ "unknown/test.mp4 => video/mp4",
+ do_sniffing_test);
/* The spec tells us to only use the last Content-Type header */
g_test_add_data_func ("/sniffing/multiple-headers",
diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml
index 9b580a35..b24a7297 100644
--- a/tests/soup-tests.gresource.xml
+++ b/tests/soup-tests.gresource.xml
@@ -3,15 +3,26 @@
<gresource prefix="/org/gnome/libsoup/tests">
<file>index.txt</file>
<file>resources/atom.xml</file>
+ <file>resources/feed.rdf</file>
<file>resources/home.gif</file>
+ <file>resources/home.jpg</file>
+ <file>resources/home.png</file>
<file>resources/html_binary.html</file>
+ <file>resources/leading_space.html</file>
<file>resources/mbox</file>
<file>resources/mbox.gz</file>
<file>resources/mbox.raw</file>
<file>resources/mbox.zlib</file>
<file>resources/ps_binary.ps</file>
<file>resources/rss20.xml</file>
+ <file>resources/test.aiff</file>
<file>resources/test.html</file>
+ <file>resources/test.mp4</file>
+ <file>resources/test.ogg</file>
+ <file>resources/test.wav</file>
+ <file>resources/test.webm</file>
+ <file>resources/text.txt</file>
<file>resources/text_binary.txt</file>
+ <file>resources/tux.webp</file>
</gresource>
</gresources>