diff options
Diffstat (limited to 'libsoup/soup-content-sniffer.c')
-rw-r--r-- | libsoup/soup-content-sniffer.c | 117 |
1 files changed, 79 insertions, 38 deletions
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c index 154df841..5b768bb2 100644 --- a/libsoup/soup-content-sniffer.c +++ b/libsoup/soup-content-sniffer.c @@ -491,10 +491,26 @@ sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer, return g_strdup (content_type); } +static gboolean +skip_insignificant_space (const char *resource, int *pos, int resource_length) +{ + while ((resource[*pos] == '\x09') || + (resource[*pos] == '\x20') || + (resource[*pos] == '\x0A') || + (resource[*pos] == '\x0D')) { + *pos = *pos + 1; + + if (*pos > resource_length) + return TRUE; + } + + return FALSE; +} + static char* sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) { - const guchar *resource = (const guchar *)buffer->data; + const char *resource = (const char *)buffer->data; int resource_length = MIN (512, buffer->length); int pos = 0; @@ -509,19 +525,10 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if (pos > resource_length) goto text_html; - /* Skip insignificant white space */ - while ((resource[pos] == '\x09') || - (resource[pos] == '\x20') || - (resource[pos] == '\x0A') || - (resource[pos] == '\x0D')) { - pos++; - - if (pos > resource_length) - goto text_html; - } + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; - /* != < */ - if (resource[pos] != '\x3C') + if (resource[pos] != '<') return g_strdup ("text/html"); pos++; @@ -529,73 +536,106 @@ sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) if ((pos + 2) > resource_length) goto text_html; - /* Skipping comments */ - if ((resource[pos] == '\x2D') || - (resource[pos+1] == '\x2D') || - (resource[pos+2] == '\x3E')) { + /* Skip comments. */ + if (g_str_has_prefix (resource + pos, "!--")) { pos = pos + 3; if ((pos + 2) > resource_length) goto text_html; - while ((resource[pos] != '\x2D') && - (resource[pos+1] != '\x2D') && - (resource[pos+2] != '\x3E')) { + while (!g_str_has_prefix (resource + pos, "-->")) { pos++; if ((pos + 2) > resource_length) goto text_html; } + pos = pos + 3; + goto look_for_tag; } if (pos > resource_length) goto text_html; - /* == ! */ - if (resource[pos] == '\x21') { + if (resource[pos] == '!') { do { pos++; if (pos > resource_length) goto text_html; - } while (resource[pos] != '\x3E'); + } while (resource[pos] != '>'); pos++; goto look_for_tag; - } else if (resource[pos] == '\x3F') { /* ? */ + } else if (resource[pos] == '?') { do { pos++; if ((pos + 1) > resource_length) goto text_html; - } while ((resource[pos] != '\x3F') && - (resource[pos+1] != '\x3E')); + } while (!g_str_has_prefix (resource + pos, "?>")); pos = pos + 2; goto look_for_tag; } - if ((pos + 2) > resource_length) + if ((pos + 3) > resource_length) goto text_html; - if ((resource[pos] == '\x72') && - (resource[pos+1] == '\x73') && - (resource[pos+2] == '\x73')) + if (g_str_has_prefix (resource + pos, "rss")) return g_strdup ("application/rss+xml"); - if ((pos + 3) > resource_length) + if ((pos + 4) > resource_length) goto text_html; - if ((resource[pos] == '\x66') && - (resource[pos+1] == '\x65') && - (resource[pos+2] == '\x65') && - (resource[pos+3] == '\x64')) + if (g_str_has_prefix (resource + pos, "feed")) return g_strdup ("application/atom+xml"); + if ((pos + 7) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "rdf:RDF")) { + pos = pos + 7; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) { + pos = pos + 32; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) + return g_strdup ("application/rss+xml"); + } + + if ((pos + 55) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) { + pos = pos + 55; + + if (skip_insignificant_space (resource, &pos, resource_length)) + goto text_html; + + if ((pos + 32) > resource_length) + goto text_html; + + if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) + return g_strdup ("application/rss+xml"); + } + } + text_html: return g_strdup ("text/html"); } @@ -641,6 +681,10 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, !g_ascii_strcasecmp (content_type, "application/xml")) return g_strdup (content_type); + /* 5. Distinguish feed from HTML. */ + if (!g_ascii_strcasecmp (content_type, "text/html")) + return sniff_feed_or_html (sniffer, buffer); + /* 2.7.5 Content-Type sniffing: image * The spec says: * @@ -659,9 +703,6 @@ soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, return sniff_text_or_binary (sniffer, buffer); } - if (!g_ascii_strcasecmp (content_type, "text/html")) - return sniff_feed_or_html (sniffer, buffer); - return g_strdup (content_type); } |