From cd4f6a94f9275670091326a5aec8a07bce7f8d79 Mon Sep 17 00:00:00 2001 From: Gustavo Noronha Silva Date: Mon, 9 Dec 2013 16:20:02 +0100 Subject: sniffing: Bring feed vs HTML up-to-date with the MIMESNIFF spec * decide on that before doing the image sniffing to match the spec * use const char* and g_str_has_prefix for comparisons to make it more legible * deal with rdf:RDF tags --- libsoup/soup-content-sniffer.c | 117 ++++++++++++++++++++++++++++------------- tests/resources/feed.rdf | 32 +++++++++++ tests/sniffing-test.c | 13 +++-- tests/soup-tests.gresource.xml | 1 + 4 files changed, 120 insertions(+), 43 deletions(-) create mode 100644 tests/resources/feed.rdf diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 154df841..5b768bb2 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -491,10 +491,26 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer, return g_strdup (content_type); } +static gboolean +skip_insignificant_space (const char *resource, int *pos, int resource_length) +{ + while ((resource[*pos] == '\x09') || + (resource[*pos] == '\x20') || + (resource[*pos] == '\x0A') || + (resource[*pos] == '\x0D')) { + *pos = *pos + 1; + + if (*pos > resource_length) + return TRUE; + } + + return FALSE; +} + static char* sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) { - const guchar *resource = (const guchar *)buffer->data; + const char *resource = (const char *)buffer->data; int resource_length = MIN (512, buffer->length); int pos = 0; @@ -509,19 +525,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if (pos > resource_length) goto text_html; - /* Skip insignificant white space */ - while ((resource[pos] == '\x09') || - (resource[pos] == '\x20') || - (resource[pos] == '\x0A') || - (resource[pos] == '\x0D')) { - pos++; - - if (pos > resource_length) - goto text_html; - } + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; - /* != < */ - if (resource[pos] != '\x3C') + if (resource[pos] != '<') return g_strdup ("text/html"); pos++; @@ -529,73 +536,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if ((pos + 2) > resource_length) goto text_html; - /* Skipping comments */ - if ((resource[pos] == '\x2D') || - (resource[pos+1] == '\x2D') || - (resource[pos+2] == '\x3E')) { + /* Skip comments. */ + if (g_str_has_prefix (resource + pos, "!--")) { pos = pos + 3; if ((pos + 2) > resource_length) goto text_html; - while ((resource[pos] != '\x2D') && - (resource[pos+1] != '\x2D') && - (resource[pos+2] != '\x3E')) { + while (!g_str_has_prefix (resource + pos, "-->")) { pos++; if ((pos + 2) > resource_length) goto text_html; } + pos = pos + 3; + goto look_for_tag; } if (pos > resource_length) goto text_html; - /* == ! */ - if (resource[pos] == '\x21') { + if (resource[pos] == '!') { do { pos++; if (pos > resource_length) goto text_html; - } while (resource[pos] != '\x3E'); + } while (resource[pos] != '>'); pos++; goto look_for_tag; - } else if (resource[pos] == '\x3F') { /* ? */ + } else if (resource[pos] == '?') { do { pos++; if ((pos + 1) > resource_length) goto text_html; - } while ((resource[pos] != '\x3F') && - (resource[pos+1] != '\x3E')); + } while (!g_str_has_prefix (resource + pos, "?>")); pos = pos + 2; goto look_for_tag; } - if ((pos + 2) > resource_length) + if ((pos + 3) > resource_length) goto text_html; - if ((resource[pos] == '\x72') && - (resource[pos+1] == '\x73') && - (resource[pos+2] == '\x73')) + if (g_str_has_prefix (resource + pos, "rss")) return g_strdup ("application/rss+xml"); - if ((pos + 3) > resource_length) + if ((pos + 4) > resource_length) goto text_html; - if ((resource[pos] == '\x66') && - (resource[pos+1] == '\x65') && - (resource[pos+2] == '\x65') && - (resource[pos+3] == '\x64')) + if (g_str_has_prefix (resource + pos, "feed")) return g_strdup ("application/atom+xml"); + if ((pos + 7) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "rdf:RDF")) { + pos = pos + 7; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) { + pos = pos + 32; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) + return g_strdup ("application/rss+xml"); + } + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) { + pos = pos + 55; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) + return g_strdup ("application/rss+xml"); + } + } + text_html: return g_strdup ("text/html"); } @@ -641,6 +681,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, !g_ascii_strcasecmp (content_type, "application/xml")) return g_strdup (content_type); + /* 5. Distinguish feed from HTML. */ + if (!g_ascii_strcasecmp (content_type, "text/html")) + return sniff_feed_or_html (sniffer, buffer); + /* 2.7.5 Content-Type sniffing: image * The spec says: * @@ -659,9 +703,6 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, return sniff_text_or_binary (sniffer, buffer); } - if (!g_ascii_strcasecmp (content_type, "text/html")) - return sniff_feed_or_html (sniffer, buffer); - return g_strdup (content_type); } diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf new file mode 100644 index 00000000..f3d9e276 --- /dev/null +++ b/tests/resources/feed.rdf @@ -0,0 +1,32 @@ + + + + + + + + XML.com + http://xml.com/pub + + XML.com features a rich mix of information and services + for the XML community. + + + + + + + + + + + + + + + diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index 2dc9fb2c..498df976 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -539,11 +539,6 @@ main (int argc, char **argv) "type/application_xml/home.gif => application/xml", do_sniffing_test); - /* Test the image sniffing path */ - g_test_add_data_func ("/sniffing/type/image", - "type/image_png/home.gif => image/gif", - do_sniffing_test); - /* Test the feed or html path */ g_test_add_data_func ("/sniffing/type/html/html", "type/text_html/test.html => text/html", @@ -554,6 +549,14 @@ main (int argc, char **argv) g_test_add_data_func ("/sniffing/type/html/atom", "type/text_html/atom.xml => application/atom+xml", do_sniffing_test); + g_test_add_data_func ("/sniffing/type/html/rdf", + "type/text_html/feed.rdf => application/rss+xml", + do_sniffing_test); + + /* Test the image sniffing path */ + g_test_add_data_func ("/sniffing/type/image", + "type/image_png/home.gif => image/gif", + do_sniffing_test); /* The spec tells us to only use the last Content-Type header */ g_test_add_data_func ("/sniffing/multiple-headers", diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index d24a04b0..320cd63d 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -3,6 +3,7 @@ index.txt resources/atom.xml + resources/feed.rdf resources/home.gif resources/html_binary.html resources/leading_space.html -- cgit v1.2.1