diff options
author | Gustavo Noronha Silva <gns@gnome.org> | 2013-12-09 16:20:02 +0100 |
---|---|---|
committer | Dan Winship <danw@gnome.org> | 2014-02-17 12:22:22 -0500 |
commit | cd4f6a94f9275670091326a5aec8a07bce7f8d79 (patch) | |
tree | fb9c51cf2d265756068f0d7322c6dc8659e0948f | |
parent | f5498190f9717d1366b4d57a7860899f87f0fc86 (diff) | |
download | libsoup-cd4f6a94f9275670091326a5aec8a07bce7f8d79.tar.gz |
sniffing: Bring feed vs HTML up-to-date with the MIMESNIFF spec
* decide on feed vs. HTML before doing the image sniffing, to match the spec
* use `const char *` and `g_str_has_prefix` for comparisons to make it more legible
* deal with `rdf:RDF` tags
-rw-r--r-- | libsoup/soup-content-sniffer.c | 117 | ||||
-rw-r--r-- | tests/resources/feed.rdf | 32 | ||||
-rw-r--r-- | tests/sniffing-test.c | 13 | ||||
-rw-r--r-- | tests/soup-tests.gresource.xml | 1 |
4 files changed, 120 insertions, 43 deletions
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 154df841..5b768bb2 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -491,10 +491,26 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer, return g_strdup (content_type); } +static gboolean +skip_insignificant_space (const char *resource, int *pos, int resource_length) +{ + while ((resource[*pos] == '\x09') || + (resource[*pos] == '\x20') || + (resource[*pos] == '\x0A') || + (resource[*pos] == '\x0D')) { + *pos = *pos + 1; + + if (*pos > resource_length) + return TRUE; + } + + return FALSE; +} + static char* sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) { - const guchar *resource = (const guchar *)buffer->data; + const char *resource = (const char *)buffer->data; int resource_length = MIN (512, buffer->length); int pos = 0; @@ -509,19 +525,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if (pos > resource_length) goto text_html; - /* Skip insignificant white space */ - while ((resource[pos] == '\x09') || - (resource[pos] == '\x20') || - (resource[pos] == '\x0A') || - (resource[pos] == '\x0D')) { - pos++; - - if (pos > resource_length) - goto text_html; - } + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; - /* != < */ - if (resource[pos] != '\x3C') + if (resource[pos] != '<') return g_strdup ("text/html"); pos++; @@ -529,73 +536,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if ((pos + 2) > resource_length) goto text_html; - /* Skipping comments */ - if ((resource[pos] == '\x2D') || - (resource[pos+1] == '\x2D') || - (resource[pos+2] == '\x3E')) { + /* Skip comments. 
*/ + if (g_str_has_prefix (resource + pos, "!--")) { pos = pos + 3; if ((pos + 2) > resource_length) goto text_html; - while ((resource[pos] != '\x2D') && - (resource[pos+1] != '\x2D') && - (resource[pos+2] != '\x3E')) { + while (!g_str_has_prefix (resource + pos, "-->")) { pos++; if ((pos + 2) > resource_length) goto text_html; } + pos = pos + 3; + goto look_for_tag; } if (pos > resource_length) goto text_html; - /* == ! */ - if (resource[pos] == '\x21') { + if (resource[pos] == '!') { do { pos++; if (pos > resource_length) goto text_html; - } while (resource[pos] != '\x3E'); + } while (resource[pos] != '>'); pos++; goto look_for_tag; - } else if (resource[pos] == '\x3F') { /* ? */ + } else if (resource[pos] == '?') { do { pos++; if ((pos + 1) > resource_length) goto text_html; - } while ((resource[pos] != '\x3F') && - (resource[pos+1] != '\x3E')); + } while (!g_str_has_prefix (resource + pos, "?>")); pos = pos + 2; goto look_for_tag; } - if ((pos + 2) > resource_length) + if ((pos + 3) > resource_length) goto text_html; - if ((resource[pos] == '\x72') && - (resource[pos+1] == '\x73') && - (resource[pos+2] == '\x73')) + if (g_str_has_prefix (resource + pos, "rss")) return g_strdup ("application/rss+xml"); - if ((pos + 3) > resource_length) + if ((pos + 4) > resource_length) goto text_html; - if ((resource[pos] == '\x66') && - (resource[pos+1] == '\x65') && - (resource[pos+2] == '\x65') && - (resource[pos+3] == '\x64')) + if (g_str_has_prefix (resource + pos, "feed")) return g_strdup ("application/atom+xml"); + if ((pos + 7) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "rdf:RDF")) { + pos = pos + 7; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) { + pos = pos + 32; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if 
((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) + return g_strdup ("application/rss+xml"); + } + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) { + pos = pos + 55; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) + return g_strdup ("application/rss+xml"); + } + } + text_html: return g_strdup ("text/html"); } @@ -641,6 +681,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, !g_ascii_strcasecmp (content_type, "application/xml")) return g_strdup (content_type); + /* 5. Distinguish feed from HTML. */ + if (!g_ascii_strcasecmp (content_type, "text/html")) + return sniff_feed_or_html (sniffer, buffer); + /* 2.7.5 Content-Type sniffing: image * The spec says: * @@ -659,9 +703,6 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, return sniff_text_or_binary (sniffer, buffer); } - if (!g_ascii_strcasecmp (content_type, "text/html")) - return sniff_feed_or_html (sniffer, buffer); - return g_strdup (content_type); } diff --git a/tests/resources/feed.rdf b/tests/resources/feed.rdf new file mode 100644 index 00000000..f3d9e276 --- /dev/null +++ b/tests/resources/feed.rdf @@ -0,0 +1,32 @@ +<?xml version="1.0"?> + +<!-- RDF Site Summary (RSS) 1.0 + http://groups.yahoo.com/group/rss-dev/files/specification.html + Section 5.3 + --> + +<rdf:RDF + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns="http://purl.org/rss/1.0/"> + + <channel rdf:about="http://www.xml.com/xml/news.rss"> + <title>XML.com</title> + <link>http://xml.com/pub</link> + <description> + XML.com features a rich mix of information and services + for the 
XML community. + </description> + + <image rdf:resource="http://xml.com/universal/images/xml_tiny.gif" /> + + <items> + <rdf:Seq> + <rdf:li resource="http://xml.com/pub/2000/08/09/xslt/xslt.html" /> + <rdf:li resource="http://xml.com/pub/2000/08/09/rdfdb/index.html" /> + </rdf:Seq> + </items> + + <textinput rdf:resource="http://search.xml.com" /> + </channel> + +</rdf:RDF> diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c index 2dc9fb2c..498df976 100644 --- a/tests/sniffing-test.c +++ b/tests/sniffing-test.c @@ -539,11 +539,6 @@ main (int argc, char **argv) "type/application_xml/home.gif => application/xml", do_sniffing_test); - /* Test the image sniffing path */ - g_test_add_data_func ("/sniffing/type/image", - "type/image_png/home.gif => image/gif", - do_sniffing_test); - /* Test the feed or html path */ g_test_add_data_func ("/sniffing/type/html/html", "type/text_html/test.html => text/html", @@ -554,6 +549,14 @@ main (int argc, char **argv) g_test_add_data_func ("/sniffing/type/html/atom", "type/text_html/atom.xml => application/atom+xml", do_sniffing_test); + g_test_add_data_func ("/sniffing/type/html/rdf", + "type/text_html/feed.rdf => application/rss+xml", + do_sniffing_test); + + /* Test the image sniffing path */ + g_test_add_data_func ("/sniffing/type/image", + "type/image_png/home.gif => image/gif", + do_sniffing_test); /* The spec tells us to only use the last Content-Type header */ g_test_add_data_func ("/sniffing/multiple-headers", diff --git a/tests/soup-tests.gresource.xml b/tests/soup-tests.gresource.xml index d24a04b0..320cd63d 100644 --- a/tests/soup-tests.gresource.xml +++ b/tests/soup-tests.gresource.xml @@ -3,6 +3,7 @@ <gresource prefix="/org/gnome/libsoup/tests"> <file>index.txt</file> <file>resources/atom.xml</file> + <file>resources/feed.rdf</file> <file>resources/home.gif</file> <file>resources/html_binary.html</file> <file>resources/leading_space.html</file> |