/* gmarkup.c - Simple XML-like parser * * Copyright 2000, 2003 Red Hat, Inc. * Copyright 2007, 2008 Ryan Lortie * * GLib is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * GLib is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with GLib; see the file COPYING.LIB. If not, * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ #include "config.h" #include #include #include #include #include #include "gmarkup.h" #include "gatomic.h" #include "gslice.h" #include "galloca.h" #include "gstrfuncs.h" #include "gstring.h" #include "gtestutils.h" #include "glibintl.h" #include "gthread.h" /** * SECTION:markup * @Title: Simple XML Subset Parser * @Short_description: parses a subset of XML * @See_also: XML * Specification * * The "GMarkup" parser is intended to parse a simple markup format * that's a subset of XML. This is a small, efficient, easy-to-use * parser. It should not be used if you expect to interoperate with * other applications generating full-scale XML. However, it's very * useful for application data files, config files, etc. where you * know your application will be the only one writing the file. * Full-scale XML parsers should be able to parse the subset used by * GMarkup, so you can easily migrate to full-scale XML at a later * time if the need arises. * * GMarkup is not guaranteed to signal an error on all invalid XML; * the parser may accept documents that an XML parser would not. * However, XML documents which are not well-formedBeing wellformed is a weaker condition than being * valid. See the XML * specification for definitions of these terms. * are not considered valid GMarkup documents. * * Simplifications to XML include: * * Only UTF-8 encoding is allowed * No user-defined entities * Processing instructions, comments and the doctype declaration * are "passed through" but are not interpreted in any way * No DTD or validation. * * * The markup format does support: * * Elements * Attributes * 5 standard entities: * & < > " ' * * Character references * Sections marked as CDATA * */ G_DEFINE_QUARK (g-markup-error-quark, g_markup_error) typedef enum { STATE_START, STATE_AFTER_OPEN_ANGLE, STATE_AFTER_CLOSE_ANGLE, STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */ STATE_INSIDE_OPEN_TAG_NAME, STATE_INSIDE_ATTRIBUTE_NAME, STATE_AFTER_ATTRIBUTE_NAME, STATE_BETWEEN_ATTRIBUTES, STATE_AFTER_ATTRIBUTE_EQUALS_SIGN, STATE_INSIDE_ATTRIBUTE_VALUE_SQ, STATE_INSIDE_ATTRIBUTE_VALUE_DQ, STATE_INSIDE_TEXT, STATE_AFTER_CLOSE_TAG_SLASH, STATE_INSIDE_CLOSE_TAG_NAME, STATE_AFTER_CLOSE_TAG_NAME, STATE_INSIDE_PASSTHROUGH, STATE_ERROR } GMarkupParseState; typedef struct { const char *prev_element; const GMarkupParser *prev_parser; gpointer prev_user_data; } GMarkupRecursionTracker; struct _GMarkupParseContext { const GMarkupParser *parser; volatile gint ref_count; GMarkupParseFlags flags; gint line_number; gint char_number; GMarkupParseState state; gpointer user_data; GDestroyNotify dnotify; /* A piece of character data or an element that * hasn't "ended" yet so we haven't yet called * the callback for it. */ GString *partial_chunk; GSList *spare_chunks; GSList *tag_stack; GSList *tag_stack_gstr; GSList *spare_list_nodes; GString **attr_names; GString **attr_values; gint cur_attr; gint alloc_attrs; const gchar *current_text; gssize current_text_len; const gchar *current_text_end; /* used to save the start of the last interesting thingy */ const gchar *start; const gchar *iter; guint document_empty : 1; guint parsing : 1; guint awaiting_pop : 1; gint balance; /* subparser support */ GSList *subparser_stack; /* (GMarkupRecursionTracker *) */ const char *subparser_element; gpointer held_user_data; }; /* * Helpers to reduce our allocation overhead, we have * a well defined allocation lifecycle. */ static GSList * get_list_node (GMarkupParseContext *context, gpointer data) { GSList *node; if (context->spare_list_nodes != NULL) { node = context->spare_list_nodes; context->spare_list_nodes = g_slist_remove_link (context->spare_list_nodes, node); } else node = g_slist_alloc(); node->data = data; return node; } static void free_list_node (GMarkupParseContext *context, GSList *node) { node->data = NULL; context->spare_list_nodes = g_slist_concat (node, context->spare_list_nodes); } static inline void string_blank (GString *string) { string->str[0] = '\0'; string->len = 0; } /** * g_markup_parse_context_new: * @parser: a #GMarkupParser * @flags: one or more #GMarkupParseFlags * @user_data: user data to pass to #GMarkupParser functions * @user_data_dnotify: user data destroy notifier called when * the parse context is freed * * Creates a new parse context. A parse context is used to parse * marked-up documents. You can feed any number of documents into * a context, as long as no errors occur; once an error occurs, * the parse context can't continue to parse text (you have to * free it and create a new parse context). * * Return value: a new #GMarkupParseContext **/ GMarkupParseContext * g_markup_parse_context_new (const GMarkupParser *parser, GMarkupParseFlags flags, gpointer user_data, GDestroyNotify user_data_dnotify) { GMarkupParseContext *context; g_return_val_if_fail (parser != NULL, NULL); context = g_new (GMarkupParseContext, 1); context->ref_count = 1; context->parser = parser; context->flags = flags; context->user_data = user_data; context->dnotify = user_data_dnotify; context->line_number = 1; context->char_number = 1; context->partial_chunk = NULL; context->spare_chunks = NULL; context->spare_list_nodes = NULL; context->state = STATE_START; context->tag_stack = NULL; context->tag_stack_gstr = NULL; context->attr_names = NULL; context->attr_values = NULL; context->cur_attr = -1; context->alloc_attrs = 0; context->current_text = NULL; context->current_text_len = -1; context->current_text_end = NULL; context->start = NULL; context->iter = NULL; context->document_empty = TRUE; context->parsing = FALSE; context->awaiting_pop = FALSE; context->subparser_stack = NULL; context->subparser_element = NULL; /* this is only looked at if awaiting_pop = TRUE. initialise anyway. */ context->held_user_data = NULL; context->balance = 0; return context; } /** * g_markup_parse_context_ref: * @context: a #GMarkupParseContext * * Increases the reference count of @context. * * Returns: the same @context * * Since: 2.36 **/ GMarkupParseContext * g_markup_parse_context_ref (GMarkupParseContext *context) { g_return_val_if_fail (context != NULL, NULL); g_return_val_if_fail (context->ref_count > 0, NULL); g_atomic_int_inc (&context->ref_count); return context; } /** * g_markup_parse_context_unref: * @context: a #GMarkupParseContext * * Decreases the reference count of @context. When its reference count * drops to 0, it is freed. * * Since: 2.36 **/ void g_markup_parse_context_unref (GMarkupParseContext *context) { g_return_if_fail (context != NULL); g_return_if_fail (context->ref_count > 0); if (g_atomic_int_dec_and_test (&context->ref_count)) g_markup_parse_context_free (context); } static void string_full_free (gpointer ptr) { g_string_free (ptr, TRUE); } static void clear_attributes (GMarkupParseContext *context); /** * g_markup_parse_context_free: * @context: a #GMarkupParseContext * * Frees a #GMarkupParseContext. * * This function can't be called from inside one of the * #GMarkupParser functions or while a subparser is pushed. */ void g_markup_parse_context_free (GMarkupParseContext *context) { g_return_if_fail (context != NULL); g_return_if_fail (!context->parsing); g_return_if_fail (!context->subparser_stack); g_return_if_fail (!context->awaiting_pop); if (context->dnotify) (* context->dnotify) (context->user_data); clear_attributes (context); g_free (context->attr_names); g_free (context->attr_values); g_slist_free_full (context->tag_stack_gstr, string_full_free); g_slist_free (context->tag_stack); g_slist_free_full (context->spare_chunks, string_full_free); g_slist_free (context->spare_list_nodes); if (context->partial_chunk) g_string_free (context->partial_chunk, TRUE); g_free (context); } static void pop_subparser_stack (GMarkupParseContext *context); static void mark_error (GMarkupParseContext *context, GError *error) { context->state = STATE_ERROR; if (context->parser->error) (*context->parser->error) (context, error, context->user_data); /* report the error all the way up to free all the user-data */ while (context->subparser_stack) { pop_subparser_stack (context); context->awaiting_pop = FALSE; /* already been freed */ if (context->parser->error) (*context->parser->error) (context, error, context->user_data); } } static void set_error (GMarkupParseContext *context, GError **error, GMarkupError code, const gchar *format, ...) G_GNUC_PRINTF (4, 5); static void set_error_literal (GMarkupParseContext *context, GError **error, GMarkupError code, const gchar *message) { GError *tmp_error; tmp_error = g_error_new_literal (G_MARKUP_ERROR, code, message); g_prefix_error (&tmp_error, _("Error on line %d char %d: "), context->line_number, context->char_number); mark_error (context, tmp_error); g_propagate_error (error, tmp_error); } G_GNUC_PRINTF(4, 5) static void set_error (GMarkupParseContext *context, GError **error, GMarkupError code, const gchar *format, ...) { gchar *s; gchar *s_valid; va_list args; va_start (args, format); s = g_strdup_vprintf (format, args); va_end (args); /* Make sure that the GError message is valid UTF-8 * even if it is complaining about invalid UTF-8 in the markup */ s_valid = _g_utf8_make_valid (s); set_error_literal (context, error, code, s); g_free (s); g_free (s_valid); } static void propagate_error (GMarkupParseContext *context, GError **dest, GError *src) { if (context->flags & G_MARKUP_PREFIX_ERROR_POSITION) g_prefix_error (&src, _("Error on line %d char %d: "), context->line_number, context->char_number); mark_error (context, src); g_propagate_error (dest, src); } #define IS_COMMON_NAME_END_CHAR(c) \ ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ') static gboolean slow_name_validate (GMarkupParseContext *context, const gchar *name, GError **error) { const gchar *p = name; if (!g_utf8_validate (name, strlen (name), NULL)) { set_error (context, error, G_MARKUP_ERROR_BAD_UTF8, _("Invalid UTF-8 encoded text in name - not valid '%s'"), name); return FALSE; } if (!(g_ascii_isalpha (*p) || (!IS_COMMON_NAME_END_CHAR (*p) && (*p == '_' || *p == ':' || g_unichar_isalpha (g_utf8_get_char (p)))))) { set_error (context, error, G_MARKUP_ERROR_PARSE, _("'%s' is not a valid name"), name); return FALSE; } for (p = g_utf8_next_char (name); *p != '\0'; p = g_utf8_next_char (p)) { /* is_name_char */ if (!(g_ascii_isalnum (*p) || (!IS_COMMON_NAME_END_CHAR (*p) && (*p == '.' || *p == '-' || *p == '_' || *p == ':' || g_unichar_isalpha (g_utf8_get_char (p)))))) { set_error (context, error, G_MARKUP_ERROR_PARSE, _("'%s' is not a valid name: '%c'"), name, *p); return FALSE; } } return TRUE; } /* * Use me for elements, attributes etc. */ static gboolean name_validate (GMarkupParseContext *context, const gchar *name, GError **error) { char mask; const char *p; /* name start char */ p = name; if (G_UNLIKELY (IS_COMMON_NAME_END_CHAR (*p) || !(g_ascii_isalpha (*p) || *p == '_' || *p == ':'))) goto slow_validate; for (mask = *p++; *p != '\0'; p++) { mask |= *p; /* is_name_char */ if (G_UNLIKELY (!(g_ascii_isalnum (*p) || (!IS_COMMON_NAME_END_CHAR (*p) && (*p == '.' || *p == '-' || *p == '_' || *p == ':'))))) goto slow_validate; } if (mask & 0x80) /* un-common / non-ascii */ goto slow_validate; return TRUE; slow_validate: return slow_name_validate (context, name, error); } static gboolean text_validate (GMarkupParseContext *context, const gchar *p, gint len, GError **error) { if (!g_utf8_validate (p, len, NULL)) { set_error (context, error, G_MARKUP_ERROR_BAD_UTF8, _("Invalid UTF-8 encoded text in name - not valid '%s'"), p); return FALSE; } else return TRUE; } static gchar* char_str (gunichar c, gchar *buf) { memset (buf, 0, 8); g_unichar_to_utf8 (c, buf); return buf; } static gchar* utf8_str (const gchar *utf8, gchar *buf) { char_str (g_utf8_get_char (utf8), buf); return buf; } G_GNUC_PRINTF(5, 6) static void set_unescape_error (GMarkupParseContext *context, GError **error, const gchar *remaining_text, GMarkupError code, const gchar *format, ...) { GError *tmp_error; gchar *s; va_list args; gint remaining_newlines; const gchar *p; remaining_newlines = 0; p = remaining_text; while (*p != '\0') { if (*p == '\n') ++remaining_newlines; ++p; } va_start (args, format); s = g_strdup_vprintf (format, args); va_end (args); tmp_error = g_error_new (G_MARKUP_ERROR, code, _("Error on line %d: %s"), context->line_number - remaining_newlines, s); g_free (s); mark_error (context, tmp_error); g_propagate_error (error, tmp_error); } /* * re-write the GString in-place, unescaping anything that escaped. * most XML does not contain entities, or escaping. */ static gboolean unescape_gstring_inplace (GMarkupParseContext *context, GString *string, gboolean *is_ascii, GError **error) { char mask, *to; int line_num = 1; const char *from; gboolean normalize_attribute; *is_ascii = FALSE; /* are we unescaping an attribute or not ? */ if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ || context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ) normalize_attribute = TRUE; else normalize_attribute = FALSE; /* * Meeks' theorum: unescaping can only shrink text. * for < etc. this is obvious, for more * thought is required, but this is patently so. */ mask = 0; for (from = to = string->str; *from != '\0'; from++, to++) { *to = *from; mask |= *to; if (*to == '\n') line_num++; if (normalize_attribute && (*to == '\t' || *to == '\n')) *to = ' '; if (*to == '\r') { *to = normalize_attribute ? ' ' : '\n'; if (from[1] == '\n') from++; } if (*from == '&') { from++; if (*from == '#') { gboolean is_hex = FALSE; gulong l; gchar *end = NULL; from++; if (*from == 'x') { is_hex = TRUE; from++; } /* digit is between start and p */ errno = 0; if (is_hex) l = strtoul (from, &end, 16); else l = strtoul (from, &end, 10); if (end == from || errno != 0) { set_unescape_error (context, error, from, G_MARKUP_ERROR_PARSE, _("Failed to parse '%-.*s', which " "should have been a digit " "inside a character reference " "(ê for example) - perhaps " "the digit is too large"), (int)(end - from), from); return FALSE; } else if (*end != ';') { set_unescape_error (context, error, from, G_MARKUP_ERROR_PARSE, _("Character reference did not end with a " "semicolon; " "most likely you used an ampersand " "character without intending to start " "an entity - escape ampersand as &")); return FALSE; } else { /* characters XML 1.1 permits */ if ((0 < l && l <= 0xD7FF) || (0xE000 <= l && l <= 0xFFFD) || (0x10000 <= l && l <= 0x10FFFF)) { gchar buf[8]; char_str (l, buf); strcpy (to, buf); to += strlen (buf) - 1; from = end; if (l >= 0x80) /* not ascii */ mask |= 0x80; } else { set_unescape_error (context, error, from, G_MARKUP_ERROR_PARSE, _("Character reference '%-.*s' does not " "encode a permitted character"), (int)(end - from), from); return FALSE; } } } else if (strncmp (from, "lt;", 3) == 0) { *to = '<'; from += 2; } else if (strncmp (from, "gt;", 3) == 0) { *to = '>'; from += 2; } else if (strncmp (from, "amp;", 4) == 0) { *to = '&'; from += 3; } else if (strncmp (from, "quot;", 5) == 0) { *to = '"'; from += 4; } else if (strncmp (from, "apos;", 5) == 0) { *to = '\''; from += 4; } else { if (*from == ';') set_unescape_error (context, error, from, G_MARKUP_ERROR_PARSE, _("Empty entity '&;' seen; valid " "entities are: & " < > '")); else { const char *end = strchr (from, ';'); if (end) set_unescape_error (context, error, from, G_MARKUP_ERROR_PARSE, _("Entity name '%-.*s' is not known"), (int)(end - from), from); else set_unescape_error (context, error, from, G_MARKUP_ERROR_PARSE, _("Entity did not end with a semicolon; " "most likely you used an ampersand " "character without intending to start " "an entity - escape ampersand as &")); } return FALSE; } } } g_assert (to - string->str <= string->len); if (to - string->str != string->len) g_string_truncate (string, to - string->str); *is_ascii = !(mask & 0x80); return TRUE; } static inline gboolean advance_char (GMarkupParseContext *context) { context->iter++; context->char_number++; if (G_UNLIKELY (context->iter == context->current_text_end)) return FALSE; else if (G_UNLIKELY (*context->iter == '\n')) { context->line_number++; context->char_number = 1; } return TRUE; } static inline gboolean xml_isspace (char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } static void skip_spaces (GMarkupParseContext *context) { do { if (!xml_isspace (*context->iter)) return; } while (advance_char (context)); } static void advance_to_name_end (GMarkupParseContext *context) { do { if (IS_COMMON_NAME_END_CHAR (*(context->iter))) return; if (xml_isspace (*(context->iter))) return; } while (advance_char (context)); } static void release_chunk (GMarkupParseContext *context, GString *str) { GSList *node; if (!str) return; if (str->allocated_len > 256) { /* large strings are unusual and worth freeing */ g_string_free (str, TRUE); return; } string_blank (str); node = get_list_node (context, str); context->spare_chunks = g_slist_concat (node, context->spare_chunks); } static void add_to_partial (GMarkupParseContext *context, const gchar *text_start, const gchar *text_end) { if (context->partial_chunk == NULL) { /* allocate a new chunk to parse into */ if (context->spare_chunks != NULL) { GSList *node = context->spare_chunks; context->spare_chunks = g_slist_remove_link (context->spare_chunks, node); context->partial_chunk = node->data; free_list_node (context, node); } else context->partial_chunk = g_string_sized_new (MAX (28, text_end - text_start)); } if (text_start != text_end) g_string_insert_len (context->partial_chunk, -1, text_start, text_end - text_start); } static inline void truncate_partial (GMarkupParseContext *context) { if (context->partial_chunk != NULL) string_blank (context->partial_chunk); } static inline const gchar* current_element (GMarkupParseContext *context) { return context->tag_stack->data; } static void pop_subparser_stack (GMarkupParseContext *context) { GMarkupRecursionTracker *tracker; g_assert (context->subparser_stack); tracker = context->subparser_stack->data; context->awaiting_pop = TRUE; context->held_user_data = context->user_data; context->user_data = tracker->prev_user_data; context->parser = tracker->prev_parser; context->subparser_element = tracker->prev_element; g_slice_free (GMarkupRecursionTracker, tracker); context->subparser_stack = g_slist_delete_link (context->subparser_stack, context->subparser_stack); } static void push_partial_as_tag (GMarkupParseContext *context) { GString *str = context->partial_chunk; /* sadly, this is exported by gmarkup_get_element_stack as-is */ context->tag_stack = g_slist_concat (get_list_node (context, str->str), context->tag_stack); context->tag_stack_gstr = g_slist_concat (get_list_node (context, str), context->tag_stack_gstr); context->partial_chunk = NULL; } static void pop_tag (GMarkupParseContext *context) { GSList *nodea, *nodeb; nodea = context->tag_stack; nodeb = context->tag_stack_gstr; release_chunk (context, nodeb->data); context->tag_stack = g_slist_remove_link (context->tag_stack, nodea); context->tag_stack_gstr = g_slist_remove_link (context->tag_stack_gstr, nodeb); free_list_node (context, nodea); free_list_node (context, nodeb); } static void possibly_finish_subparser (GMarkupParseContext *context) { if (current_element (context) == context->subparser_element) pop_subparser_stack (context); } static void ensure_no_outstanding_subparser (GMarkupParseContext *context) { if (context->awaiting_pop) g_critical ("During the first end_element call after invoking a " "subparser you must pop the subparser stack and handle " "the freeing of the subparser user_data. This can be " "done by calling the end function of the subparser. " "Very probably, your program just leaked memory."); /* let valgrind watch the pointer disappear... */ context->held_user_data = NULL; context->awaiting_pop = FALSE; } static const gchar* current_attribute (GMarkupParseContext *context) { g_assert (context->cur_attr >= 0); return context->attr_names[context->cur_attr]->str; } static void add_attribute (GMarkupParseContext *context, GString *str) { if (context->cur_attr + 2 >= context->alloc_attrs) { context->alloc_attrs += 5; /* silly magic number */ context->attr_names = g_realloc (context->attr_names, sizeof(GString*)*context->alloc_attrs); context->attr_values = g_realloc (context->attr_values, sizeof(GString*)*context->alloc_attrs); } context->cur_attr++; context->attr_names[context->cur_attr] = str; context->attr_values[context->cur_attr] = NULL; context->attr_names[context->cur_attr+1] = NULL; context->attr_values[context->cur_attr+1] = NULL; } static void clear_attributes (GMarkupParseContext *context) { /* Go ahead and free the attributes. */ for (; context->cur_attr >= 0; context->cur_attr--) { int pos = context->cur_attr; release_chunk (context, context->attr_names[pos]); release_chunk (context, context->attr_values[pos]); context->attr_names[pos] = context->attr_values[pos] = NULL; } g_assert (context->cur_attr == -1); g_assert (context->attr_names == NULL || context->attr_names[0] == NULL); g_assert (context->attr_values == NULL || context->attr_values[0] == NULL); } /* This has to be a separate function to ensure the alloca's * are unwound on exit - otherwise we grow & blow the stack * with large documents */ static inline void emit_start_element (GMarkupParseContext *context, GError **error) { int i; const gchar *start_name; const gchar **attr_names; const gchar **attr_values; GError *tmp_error; attr_names = g_newa (const gchar *, context->cur_attr + 2); attr_values = g_newa (const gchar *, context->cur_attr + 2); for (i = 0; i < context->cur_attr + 1; i++) { attr_names[i] = context->attr_names[i]->str; attr_values[i] = context->attr_values[i]->str; } attr_names[i] = NULL; attr_values[i] = NULL; /* Call user callback for element start */ tmp_error = NULL; start_name = current_element (context); if (context->parser->start_element && name_validate (context, start_name, error)) (* context->parser->start_element) (context, start_name, (const gchar **)attr_names, (const gchar **)attr_values, context->user_data, &tmp_error); clear_attributes (context); if (tmp_error != NULL) propagate_error (context, error, tmp_error); } /** * g_markup_parse_context_parse: * @context: a #GMarkupParseContext * @text: chunk of text to parse * @text_len: length of @text in bytes * @error: return location for a #GError * * Feed some data to the #GMarkupParseContext. * * The data need not be valid UTF-8; an error will be signaled if * it's invalid. The data need not be an entire document; you can * feed a document into the parser incrementally, via multiple calls * to this function. Typically, as you receive data from a network * connection or file, you feed each received chunk of data into this * function, aborting the process if an error occurs. Once an error * is reported, no further data may be fed to the #GMarkupParseContext; * all errors are fatal. * * Return value: %FALSE if an error occurred, %TRUE on success */ gboolean g_markup_parse_context_parse (GMarkupParseContext *context, const gchar *text, gssize text_len, GError **error) { g_return_val_if_fail (context != NULL, FALSE); g_return_val_if_fail (text != NULL, FALSE); g_return_val_if_fail (context->state != STATE_ERROR, FALSE); g_return_val_if_fail (!context->parsing, FALSE); if (text_len < 0) text_len = strlen (text); if (text_len == 0) return TRUE; context->parsing = TRUE; context->current_text = text; context->current_text_len = text_len; context->current_text_end = context->current_text + text_len; context->iter = context->current_text; context->start = context->iter; while (context->iter != context->current_text_end) { switch (context->state) { case STATE_START: /* Possible next state: AFTER_OPEN_ANGLE */ g_assert (context->tag_stack == NULL); /* whitespace is ignored outside of any elements */ skip_spaces (context); if (context->iter != context->current_text_end) { if (*context->iter == '<') { /* Move after the open angle */ advance_char (context); context->state = STATE_AFTER_OPEN_ANGLE; /* this could start a passthrough */ context->start = context->iter; /* document is now non-empty */ context->document_empty = FALSE; } else { set_error_literal (context, error, G_MARKUP_ERROR_PARSE, _("Document must begin with an element (e.g. )")); } } break; case STATE_AFTER_OPEN_ANGLE: /* Possible next states: INSIDE_OPEN_TAG_NAME, * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH */ if (*context->iter == '?' || *context->iter == '!') { /* include < in the passthrough */ const gchar *openangle = "<"; add_to_partial (context, openangle, openangle + 1); context->start = context->iter; context->balance = 1; context->state = STATE_INSIDE_PASSTHROUGH; } else if (*context->iter == '/') { /* move after it */ advance_char (context); context->state = STATE_AFTER_CLOSE_TAG_SLASH; } else if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) { context->state = STATE_INSIDE_OPEN_TAG_NAME; /* start of tag name */ context->start = context->iter; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("'%s' is not a valid character following " "a '<' character; it may not begin an " "element name"), utf8_str (context->iter, buf)); } break; /* The AFTER_CLOSE_ANGLE state is actually sort of * broken, because it doesn't correspond to a range * of characters in the input stream as the others do, * and thus makes things harder to conceptualize */ case STATE_AFTER_CLOSE_ANGLE: /* Possible next states: INSIDE_TEXT, STATE_START */ if (context->tag_stack == NULL) { context->start = NULL; context->state = STATE_START; } else { context->start = context->iter; context->state = STATE_INSIDE_TEXT; } break; case STATE_AFTER_ELISION_SLASH: /* Possible next state: AFTER_CLOSE_ANGLE */ { /* We need to pop the tag stack and call the end_element * function, since this is the close tag */ GError *tmp_error = NULL; g_assert (context->tag_stack != NULL); possibly_finish_subparser (context); tmp_error = NULL; if (context->parser->end_element) (* context->parser->end_element) (context, current_element (context), context->user_data, &tmp_error); ensure_no_outstanding_subparser (context); if (tmp_error) { mark_error (context, tmp_error); g_propagate_error (error, tmp_error); } else { if (*context->iter == '>') { /* move after the close angle */ advance_char (context); context->state = STATE_AFTER_CLOSE_ANGLE; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("Odd character '%s', expected a '>' character " "to end the empty-element tag '%s'"), utf8_str (context->iter, buf), current_element (context)); } } pop_tag (context); } break; case STATE_INSIDE_OPEN_TAG_NAME: /* Possible next states: BETWEEN_ATTRIBUTES */ /* if there's a partial chunk then it's the first part of the * tag name. If there's a context->start then it's the start * of the tag name in current_text, the partial chunk goes * before that start though. */ advance_to_name_end (context); if (context->iter == context->current_text_end) { /* The name hasn't necessarily ended. Merge with * partial chunk, leave state unchanged. */ add_to_partial (context, context->start, context->iter); } else { /* The name has ended. Combine it with the partial chunk * if any; push it on the stack; enter next state. */ add_to_partial (context, context->start, context->iter); push_partial_as_tag (context); context->state = STATE_BETWEEN_ATTRIBUTES; context->start = NULL; } break; case STATE_INSIDE_ATTRIBUTE_NAME: /* Possible next states: AFTER_ATTRIBUTE_NAME */ advance_to_name_end (context); add_to_partial (context, context->start, context->iter); /* read the full name, if we enter the equals sign state * then add the attribute to the list (without the value), * otherwise store a partial chunk to be prepended later. */ if (context->iter != context->current_text_end) context->state = STATE_AFTER_ATTRIBUTE_NAME; break; case STATE_AFTER_ATTRIBUTE_NAME: /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */ skip_spaces (context); if (context->iter != context->current_text_end) { /* The name has ended. Combine it with the partial chunk * if any; push it on the stack; enter next state. */ if (!name_validate (context, context->partial_chunk->str, error)) break; add_attribute (context, context->partial_chunk); context->partial_chunk = NULL; context->start = NULL; if (*context->iter == '=') { advance_char (context); context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("Odd character '%s', expected a '=' after " "attribute name '%s' of element '%s'"), utf8_str (context->iter, buf), current_attribute (context), current_element (context)); } } break; case STATE_BETWEEN_ATTRIBUTES: /* Possible next states: AFTER_CLOSE_ANGLE, * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME */ skip_spaces (context); if (context->iter != context->current_text_end) { if (*context->iter == '/') { advance_char (context); context->state = STATE_AFTER_ELISION_SLASH; } else if (*context->iter == '>') { advance_char (context); context->state = STATE_AFTER_CLOSE_ANGLE; } else if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) { context->state = STATE_INSIDE_ATTRIBUTE_NAME; /* start of attribute name */ context->start = context->iter; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("Odd character '%s', expected a '>' or '/' " "character to end the start tag of " "element '%s', or optionally an attribute; " "perhaps you used an invalid character in " "an attribute name"), utf8_str (context->iter, buf), current_element (context)); } /* If we're done with attributes, invoke * the start_element callback */ if (context->state == STATE_AFTER_ELISION_SLASH || context->state == STATE_AFTER_CLOSE_ANGLE) emit_start_element (context, error); } break; case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN: /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */ skip_spaces (context); if (context->iter != context->current_text_end) { if (*context->iter == '"') { advance_char (context); context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ; context->start = context->iter; } else if (*context->iter == '\'') { advance_char (context); context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ; context->start = context->iter; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("Odd character '%s', expected an open quote mark " "after the equals sign when giving value for " "attribute '%s' of element '%s'"), utf8_str (context->iter, buf), current_attribute (context), current_element (context)); } } break; case STATE_INSIDE_ATTRIBUTE_VALUE_SQ: case STATE_INSIDE_ATTRIBUTE_VALUE_DQ: /* Possible next states: BETWEEN_ATTRIBUTES */ { gchar delim; if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ) { delim = '\''; } else { delim = '"'; } do { if (*context->iter == delim) break; } while (advance_char (context)); } if (context->iter == context->current_text_end) { /* The value hasn't necessarily ended. Merge with * partial chunk, leave state unchanged. */ add_to_partial (context, context->start, context->iter); } else { gboolean is_ascii; /* The value has ended at the quote mark. Combine it * with the partial chunk if any; set it for the current * attribute. */ add_to_partial (context, context->start, context->iter); g_assert (context->cur_attr >= 0); if (unescape_gstring_inplace (context, context->partial_chunk, &is_ascii, error) && (is_ascii || text_validate (context, context->partial_chunk->str, context->partial_chunk->len, error))) { /* success, advance past quote and set state. */ context->attr_values[context->cur_attr] = context->partial_chunk; context->partial_chunk = NULL; advance_char (context); context->state = STATE_BETWEEN_ATTRIBUTES; context->start = NULL; } truncate_partial (context); } break; case STATE_INSIDE_TEXT: /* Possible next states: AFTER_OPEN_ANGLE */ do { if (*context->iter == '<') break; } while (advance_char (context)); /* The text hasn't necessarily ended. Merge with * partial chunk, leave state unchanged. */ add_to_partial (context, context->start, context->iter); if (context->iter != context->current_text_end) { gboolean is_ascii; /* The text has ended at the open angle. Call the text * callback. */ if (unescape_gstring_inplace (context, context->partial_chunk, &is_ascii, error) && (is_ascii || text_validate (context, context->partial_chunk->str, context->partial_chunk->len, error))) { GError *tmp_error = NULL; if (context->parser->text) (*context->parser->text) (context, context->partial_chunk->str, context->partial_chunk->len, context->user_data, &tmp_error); if (tmp_error == NULL) { /* advance past open angle and set state. */ advance_char (context); context->state = STATE_AFTER_OPEN_ANGLE; /* could begin a passthrough */ context->start = context->iter; } else propagate_error (context, error, tmp_error); } truncate_partial (context); } break; case STATE_AFTER_CLOSE_TAG_SLASH: /* Possible next state: INSIDE_CLOSE_TAG_NAME */ if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) { context->state = STATE_INSIDE_CLOSE_TAG_NAME; /* start of tag name */ context->start = context->iter; } else { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("'%s' is not a valid character following " "the characters 'iter, buf), utf8_str (context->iter, buf)); } break; case STATE_INSIDE_CLOSE_TAG_NAME: /* Possible next state: AFTER_CLOSE_TAG_NAME */ advance_to_name_end (context); add_to_partial (context, context->start, context->iter); if (context->iter != context->current_text_end) context->state = STATE_AFTER_CLOSE_TAG_NAME; break; case STATE_AFTER_CLOSE_TAG_NAME: /* Possible next state: AFTER_CLOSE_TAG_SLASH */ skip_spaces (context); if (context->iter != context->current_text_end) { GString *close_name; close_name = context->partial_chunk; context->partial_chunk = NULL; if (*context->iter != '>') { gchar buf[8]; set_error (context, error, G_MARKUP_ERROR_PARSE, _("'%s' is not a valid character following " "the close element name '%s'; the allowed " "character is '>'"), utf8_str (context->iter, buf), close_name->str); } else if (context->tag_stack == NULL) { set_error (context, error, G_MARKUP_ERROR_PARSE, _("Element '%s' was closed, no element " "is currently open"), close_name->str); } else if (strcmp (close_name->str, current_element (context)) != 0) { set_error (context, error, G_MARKUP_ERROR_PARSE, _("Element '%s' was closed, but the currently " "open element is '%s'"), close_name->str, current_element (context)); } else { GError *tmp_error; advance_char (context); context->state = STATE_AFTER_CLOSE_ANGLE; context->start = NULL; possibly_finish_subparser (context); /* call the end_element callback */ tmp_error = NULL; if (context->parser->end_element) (* context->parser->end_element) (context, close_name->str, context->user_data, &tmp_error); ensure_no_outstanding_subparser (context); pop_tag (context); if (tmp_error) propagate_error (context, error, tmp_error); } context->partial_chunk = close_name; truncate_partial (context); } break; case STATE_INSIDE_PASSTHROUGH: /* Possible next state: AFTER_CLOSE_ANGLE */ do { if (*context->iter == '<') context->balance++; if (*context->iter == '>') { gchar *str; gsize len; context->balance--; add_to_partial (context, context->start, context->iter); context->start = context->iter; str = context->partial_chunk->str; len = context->partial_chunk->len; if (str[1] == '?' && str[len - 1] == '?') break; if (strncmp (str, "