/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* * soup-content-sniffer.c * * Copyright (C) 2009, 2013 Gustavo Noronha Silva. * * This code implements the following specification: * * http://mimesniff.spec.whatwg.org/ as of 11 June 2013 */ #ifdef HAVE_CONFIG_H #include #endif #include #include "soup-content-sniffer.h" #include "soup.h" #include "soup-content-processor.h" #include "soup-content-sniffer-stream.h" #include "soup-message-private.h" /** * SECTION:soup-content-sniffer * @short_description: Content sniffing for SoupSession * * A #SoupContentSniffer tries to detect the actual content type of * the files that are being downloaded by looking at some of the data * before the #SoupMessage emits its #SoupMessage::got-headers signal. * #SoupContentSniffer implements #SoupSessionFeature, so you can add * content sniffing to a session with soup_session_add_feature() or * soup_session_add_feature_by_type(). * * Since: 2.28 **/ static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data); static SoupContentProcessorInterface *soup_content_sniffer_default_content_processor_interface; static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *interface, gpointer interface_data); G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT, G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE, soup_content_sniffer_session_feature_init) G_IMPLEMENT_INTERFACE (SOUP_TYPE_CONTENT_PROCESSOR, soup_content_sniffer_content_processor_init)) static GInputStream * soup_content_sniffer_content_processor_wrap_input (SoupContentProcessor *processor, GInputStream *base_stream, SoupMessage *msg, GError **error) { return g_object_new (SOUP_TYPE_CONTENT_SNIFFER_STREAM, "base-stream", base_stream, "message", msg, "sniffer", SOUP_CONTENT_SNIFFER (processor), NULL); } static void soup_content_sniffer_content_processor_init (SoupContentProcessorInterface *processor_interface, gpointer interface_data) { soup_content_sniffer_default_content_processor_interface = g_type_default_interface_peek (SOUP_TYPE_CONTENT_PROCESSOR); processor_interface->processing_stage = SOUP_STAGE_BODY_DATA; processor_interface->wrap_input = soup_content_sniffer_content_processor_wrap_input; } static void soup_content_sniffer_init (SoupContentSniffer *content_sniffer) { } typedef struct { const guchar *mask; const guchar *pattern; guint pattern_length; const char *sniffed_type; } SoupContentSnifferMediaPattern; static char* sniff_media (SoupContentSniffer *sniffer, SoupBuffer *buffer, SoupContentSnifferMediaPattern table[], int table_length) { const guchar *resource = (const guchar *)buffer->data; int resource_length = MIN (512, buffer->length); int i; for (i = 0; i < table_length; i++) { SoupContentSnifferMediaPattern *type_row = &(table[i]); int j; if (resource_length < type_row->pattern_length) continue; for (j = 0; j < type_row->pattern_length; j++) { if ((type_row->mask[j] & resource[j]) != type_row->pattern[j]) break; } /* This means our comparison above matched completely */ if (j == type_row->pattern_length) return g_strdup (type_row->sniffed_type); } return NULL; } /* This table is based on the MIMESNIFF spec; * See 6.1 Matching an image type pattern */ static SoupContentSnifferMediaPattern image_types_table[] = { /* Windows icon signature. */ { (const guchar *)"\xFF\xFF\xFF\xFF", (const guchar *)"\x00\x00\x01\x00", 4, "image/x-icon" }, /* Windows cursor signature. */ { (const guchar *)"\xFF\xFF\xFF\xFF", (const guchar *)"\x00\x00\x02\x00", 4, "image/x-icon" }, /* BMP. */ { (const guchar *)"\xFF\xFF", (const guchar *)"BM", 2, "image/bmp" }, /* GIFs. */ { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", (const guchar *)"GIF87a", 6, "image/gif" }, { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF", (const guchar *)"GIF89a", 6, "image/gif" }, /* WEBP. */ { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF", (const guchar *)"RIFF\x00\x00\x00\x00WEBPVP", 14, "image/webp" }, /* PNG. */ { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", (const guchar *)"\x89PNG\x0D\x0A\x1A\x0A", 8, "image/png" }, /* JPEG. */ { (const guchar *)"\xFF\xFF\xFF", (const guchar *)"\xFF\xD8\xFF", 3, "image/jpeg" }, }; static char* sniff_images (SoupContentSniffer *sniffer, SoupBuffer *buffer) { return sniff_media (sniffer, buffer, image_types_table, G_N_ELEMENTS (image_types_table)); } /* This table is based on the MIMESNIFF spec; * See 6.2 Matching an audio or video type pattern */ static SoupContentSnifferMediaPattern audio_video_types_table[] = { { (const guchar *)"\xFF\xFF\xFF\xFF", (const guchar *)"\x1A\x45\xDF\xA3", 4, "video/webm" }, { (const guchar *)"\xFF\xFF\xFF\xFF", (const guchar *)".snd", 4, "audio/basic" }, { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", (const guchar *)"FORM\0\0\0\0AIFF", 12, "audio/aiff" }, { (const guchar *)"\xFF\xFF\xFF", (const guchar *)"ID3", 3, "audio/mpeg" }, { (const guchar *)"\xFF\xFF\xFF\xFF\xFF", (const guchar *)"OggS\0", 5, "application/ogg" }, { (const guchar *)"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", (const guchar *)"MThd\x00\x00\x00\x06", 8, "audio/midi" }, { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", (const guchar *)"RIFF\x00\x00\x00\x00AVI ", 12, "video/avi" }, { (const guchar *)"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", (const guchar *)"RIFF\x00\x00\x00\x00WAVE", 12, "audio/wave" }, }; static gboolean sniff_mp4 (SoupContentSniffer *sniffer, SoupBuffer *buffer) { const char *resource = (const char *)buffer->data; int resource_length = MIN (512, buffer->length); guint32 box_size = *((guint32*)resource); int i; #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ box_size = ((box_size >> 24) | ((box_size << 8) & 0x00FF0000) | ((box_size >> 8) & 0x0000FF00) | (box_size << 24)); #endif if (resource_length < 12 || resource_length < box_size || box_size % 4 != 0) return FALSE; if (!g_str_has_prefix (resource + 4, "ftyp")) return FALSE; if (!g_str_has_prefix (resource + 8, "mp4")) return FALSE; for (i = 16; i < box_size && i < resource_length; i = i + 4) { if (g_str_has_prefix (resource + i, "mp4")) return TRUE; } return FALSE; } static char* sniff_audio_video (SoupContentSniffer *sniffer, SoupBuffer *buffer) { char *sniffed_type; sniffed_type = sniff_media (sniffer, buffer, audio_video_types_table, G_N_ELEMENTS (audio_video_types_table)); if (sniffed_type != NULL) return sniffed_type; if (sniff_mp4 (sniffer, buffer)) return g_strdup ("video/mp4"); return NULL; } /* This table is based on the MIMESNIFF spec; * See 7.1 Identifying a resource with an unknown MIME type */ typedef struct { /* @has_ws is TRUE if @pattern contains "generic" whitespace */ gboolean has_ws; /* @has_tag_termination is TRUE if we should check for a tag-terminating * byte (0x20 " " or 0x3E ">") after the pattern match. */ gboolean has_tag_termination; const guchar *mask; const guchar *pattern; guint pattern_length; const char *sniffed_type; gboolean scriptable; } SoupContentSnifferPattern; /* When has_ws is TRUE, spaces in the pattern will indicate where insignificant space * is allowed. Those spaces are marked with \x00 on the mask. */ static SoupContentSnifferPattern types_table[] = { /* Scriptable types. */ { TRUE, TRUE, (const guchar *)"\x00\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", (const guchar *)" data; int resource_length = MIN (512, buffer->length); int i; for (i = 0; i < G_N_ELEMENTS (types_table); i++) { SoupContentSnifferPattern *type_row = &(types_table[i]); if (!sniff_scriptable && type_row->scriptable) continue; if (type_row->has_ws) { int index_stream = 0; int index_pattern = 0; gboolean skip_row = FALSE; while ((index_stream < resource_length) && (index_pattern <= type_row->pattern_length)) { /* Skip insignificant white space ("WS" in the spec) */ if (type_row->pattern[index_pattern] == ' ') { if (resource[index_stream] == '\x09' || resource[index_stream] == '\x0a' || resource[index_stream] == '\x0c' || resource[index_stream] == '\x0d' || resource[index_stream] == '\x20') index_stream++; else index_pattern++; } else { if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) { skip_row = TRUE; break; } index_pattern++; index_stream++; } } if (skip_row) continue; if (index_pattern > type_row->pattern_length) { if (type_row->has_tag_termination && resource[index_stream] != '\x20' && resource[index_stream] != '\x3E') continue; return g_strdup (type_row->sniffed_type); } } else { int j; if (resource_length < type_row->pattern_length) continue; for (j = 0; j < type_row->pattern_length; j++) { if ((type_row->mask[j] & resource[j]) != type_row->pattern[j]) break; } /* This means our comparison above matched completely */ if (j == type_row->pattern_length) return g_strdup (type_row->sniffed_type); } } sniffed_type = sniff_images (sniffer, buffer); if (sniffed_type != NULL) return sniffed_type; sniffed_type = sniff_audio_video (sniffer, buffer); if (sniffed_type != NULL) return sniffed_type; for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) return g_strdup ("application/octet-stream"); } return g_strdup ("text/plain"); } /* MIMESNIFF: 7.2 Sniffing a mislabeled binary resource */ static char* sniff_text_or_binary (SoupContentSniffer *sniffer, SoupBuffer *buffer) { const guchar *resource = (const guchar *)buffer->data; int resource_length = MIN (512, buffer->length); gboolean looks_binary = FALSE; int i; /* 2. Detecting UTF-16BE, UTF-16LE BOMs means it's text/plain */ if (resource_length >= 2) { if ((resource[0] == 0xFE && resource[1] == 0xFF) || (resource[0] == 0xFF && resource[1] == 0xFE)) return g_strdup ("text/plain"); } /* 3. UTF-8 BOM. */ if (resource_length >= 3) { if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF) return g_strdup ("text/plain"); } /* 4. Look to see if any of the first n bytes looks binary */ for (i = 0; i < resource_length; i++) { if (byte_looks_binary[resource[i]]) { looks_binary = TRUE; break; } } if (!looks_binary) return g_strdup ("text/plain"); /* 5. Execute 7.1 Identifying a resource with an unknown MIME type. * TODO: sniff-scriptable needs to be unset. */ return sniff_unknown (sniffer, buffer, TRUE); } static gboolean skip_insignificant_space (const char *resource, int *pos, int resource_length) { while ((resource[*pos] == '\x09') || (resource[*pos] == '\x20') || (resource[*pos] == '\x0A') || (resource[*pos] == '\x0D')) { *pos = *pos + 1; if (*pos > resource_length) return TRUE; } return FALSE; } static char* sniff_feed_or_html (SoupContentSniffer *sniffer, SoupBuffer *buffer) { const char *resource = (const char *)buffer->data; int resource_length = MIN (512, buffer->length); int pos = 0; if (resource_length < 3) goto text_html; /* Skip a leading UTF-8 BOM */ if (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF) pos = 3; look_for_tag: if (pos > resource_length) goto text_html; if (skip_insignificant_space (resource, &pos, resource_length)) goto text_html; if (resource[pos] != '<') return g_strdup ("text/html"); pos++; if ((pos + 2) > resource_length) goto text_html; /* Skip comments. */ if (g_str_has_prefix (resource + pos, "!--")) { pos = pos + 3; if ((pos + 2) > resource_length) goto text_html; while (!g_str_has_prefix (resource + pos, "-->")) { pos++; if ((pos + 2) > resource_length) goto text_html; } pos = pos + 3; goto look_for_tag; } if (pos > resource_length) goto text_html; if (resource[pos] == '!') { do { pos++; if (pos > resource_length) goto text_html; } while (resource[pos] != '>'); pos++; goto look_for_tag; } else if (resource[pos] == '?') { do { pos++; if ((pos + 1) > resource_length) goto text_html; } while (!g_str_has_prefix (resource + pos, "?>")); pos = pos + 2; goto look_for_tag; } if ((pos + 3) > resource_length) goto text_html; if (g_str_has_prefix (resource + pos, "rss")) return g_strdup ("application/rss+xml"); if ((pos + 4) > resource_length) goto text_html; if (g_str_has_prefix (resource + pos, "feed")) return g_strdup ("application/atom+xml"); if ((pos + 7) > resource_length) goto text_html; if (g_str_has_prefix (resource + pos, "rdf:RDF")) { pos = pos + 7; if (skip_insignificant_space (resource, &pos, resource_length)) goto text_html; if ((pos + 32) > resource_length) goto text_html; if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) { pos = pos + 32; if (skip_insignificant_space (resource, &pos, resource_length)) goto text_html; if ((pos + 55) > resource_length) goto text_html; if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) return g_strdup ("application/rss+xml"); } if ((pos + 55) > resource_length) goto text_html; if (g_str_has_prefix (resource + pos, "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"")) { pos = pos + 55; if (skip_insignificant_space (resource, &pos, resource_length)) goto text_html; if ((pos + 32) > resource_length) goto text_html; if (g_str_has_prefix (resource + pos, "xmlns=\"http://purl.org/rss/1.0/\"")) return g_strdup ("application/rss+xml"); } } text_html: return g_strdup ("text/html"); } static char * soup_content_sniffer_real_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params) { const char *content_type; const char *x_content_type_options; char *sniffed_type = NULL; gboolean no_sniff = FALSE; content_type = soup_message_headers_get_content_type (msg->response_headers, params); /* MIMESNIFF: 7 Determining the sniffed MIME type of a resource. */ x_content_type_options = soup_message_headers_get_one (msg->response_headers, "X-Content-Type-Options"); if (!g_strcmp0 (x_content_type_options, "nosniff")) no_sniff = TRUE; /* 1. Unknown/undefined supplied type with sniff-scritable = !nosniff. */ if ((content_type == NULL) || !g_ascii_strcasecmp (content_type, "unknown/unknown") || !g_ascii_strcasecmp (content_type, "application/unknown") || !g_ascii_strcasecmp (content_type, "*/*")) return sniff_unknown (sniffer, buffer, !no_sniff); /* 2. If nosniff is specified in X-Content-Type-Options use the supplied MIME type. */ if (no_sniff) return g_strdup (content_type); /* 3. check-for-apache-bug */ if ((content_type != NULL) && (g_str_equal (content_type, "text/plain") || g_str_equal (content_type, "text/plain; charset=ISO-8859-1") || g_str_equal (content_type, "text/plain; charset=iso-8859-1") || g_str_equal (content_type, "text/plain; charset=UTF-8"))) return sniff_text_or_binary (sniffer, buffer); /* 4. XML types sent by the server are always used. */ if (g_str_has_suffix (content_type, "+xml") || !g_ascii_strcasecmp (content_type, "text/xml") || !g_ascii_strcasecmp (content_type, "application/xml")) return g_strdup (content_type); /* 5. Distinguish feed from HTML. */ if (!g_ascii_strcasecmp (content_type, "text/html")) return sniff_feed_or_html (sniffer, buffer); /* 6. Image types. */ if (!g_ascii_strncasecmp (content_type, "image/", 6)) { sniffed_type = sniff_images (sniffer, buffer); if (sniffed_type != NULL) return sniffed_type; return g_strdup (content_type); } /* 7. Audio and video types. */ if (!g_ascii_strncasecmp (content_type, "audio/", 6) || !g_ascii_strncasecmp (content_type, "video/", 6) || !g_ascii_strcasecmp (content_type, "application/ogg")) { sniffed_type = sniff_audio_video (sniffer, buffer); if (sniffed_type != NULL) return sniffed_type; return g_strdup (content_type); } /* If we got text/plain, use text_or_binary */ if (g_str_equal (content_type, "text/plain")) { return sniff_text_or_binary (sniffer, buffer); } return g_strdup (content_type); } static gsize soup_content_sniffer_real_get_buffer_size (SoupContentSniffer *sniffer) { return 512; } static void soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer) { SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg); priv->bytes_for_sniffing = soup_content_sniffer_get_buffer_size (sniffer); } static void soup_content_sniffer_request_queued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg) { SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg); priv->sniffer = g_object_ref (feature); g_signal_connect (msg, "got-headers", G_CALLBACK (soup_content_sniffer_got_headers_cb), feature); } static void soup_content_sniffer_request_unqueued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg) { SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg); g_object_unref (priv->sniffer); priv->sniffer = NULL; g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature); } static void soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class) { content_sniffer_class->sniff = soup_content_sniffer_real_sniff; content_sniffer_class->get_buffer_size = soup_content_sniffer_real_get_buffer_size; } static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data) { feature_interface->request_queued = soup_content_sniffer_request_queued; feature_interface->request_unqueued = soup_content_sniffer_request_unqueued; } /** * soup_content_sniffer_new: * * Creates a new #SoupContentSniffer. * * Returns: a new #SoupContentSniffer * * Since: 2.28 **/ SoupContentSniffer * soup_content_sniffer_new () { return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL); } /** * soup_content_sniffer_sniff: * @sniffer: a #SoupContentSniffer * @msg: the message to sniff * @buffer: a buffer containing the start of @msg's response body * @params: (element-type utf8 utf8) (out) (transfer full) (allow-none): return * location for Content-Type parameters (eg, "charset"), or %NULL * * Sniffs @buffer to determine its Content-Type. The result may also * be influenced by the Content-Type declared in @msg's response * headers. * * Return value: the sniffed Content-Type of @buffer; this will never be %NULL, * but may be "application/octet-stream". * * Since: 2.28 */ char * soup_content_sniffer_sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params) { g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL); g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL); g_return_val_if_fail (buffer != NULL, NULL); return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params); } /** * soup_content_sniffer_get_buffer_size: * @sniffer: a #SoupContentSniffer * * Gets the number of bytes @sniffer needs in order to properly sniff * a buffer. * * Return value: the number of bytes to sniff * * Since: 2.28 */ gsize soup_content_sniffer_get_buffer_size (SoupContentSniffer *sniffer) { g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), 0); return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->get_buffer_size (sniffer); }