summaryrefslogtreecommitdiff
path: root/glib/src/markup.hg
blob: 14c4163bc89cc56d85c6378cebd2941cbb55fabc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
/* Copyright (C) 2002 The gtkmm Development Team
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

_DEFS(glibmm,glib)

#include <glibmm/error.h>
#include <sigc++/sigc++.h>
#include <map>

#ifndef DOXYGEN_SHOULD_SKIP_THIS
extern "C" { typedef struct _GMarkupParseContext GMarkupParseContext; }
#endif

namespace Glib
{

/** @defgroup Markup Simple XML Subset Parser
 *
 * The Glib::Markup parser is intended to parse a simple markup format that's a
 * subset of XML. This is a small, efficient, easy-to-use parser. It should not
 * be used if you expect to interoperate with other applications generating
 * full-scale XML. However, it's very useful for application data files, config
 * files, etc. where you know your application will be the only one writing the
 * file. Full-scale XML parsers should be able to parse the subset used by
 * Glib::Markup parser, so you can easily migrate to full-scale XML at a later
 * time if the need arises.
 *
 * Glib::Markup is not guaranteed to signal an error on all invalid XML;
 * the parser may accept documents that an XML parser would not. However,
 * invalid XML documents are not considered valid Glib::Markup documents.
 *
 * @par Simplifications to XML include:
 *
 * - Only UTF-8 encoding is allowed.
 * - No user-defined entities.
 * - Processing instructions, comments and the doctype declaration are "passed
 *   through" but are not interpreted in any way.
 * - No DTD or validation.
 *
 * @par The markup format does support:
 *
 * - Elements
 * - Attributes
 * - 5 standard entities: <tt>\&amp; \&lt; \&gt; \&quot; \&apos;</tt>
 * - Character references
 * - Sections marked as <tt>CDATA</tt>
 *
 * @{
 */

/** %Exception class for markup parsing errors.
 */
_WRAP_GERROR(MarkupError, GMarkupError, G_MARKUP_ERROR, NO_GTYPE)

/*! @var MarkupError::Code MarkupError::BAD_UTF8
 * Text being parsed was not valid UTF-8.
 */
/*! @var MarkupError::Code MarkupError::EMPTY
 * Document contained nothing, or only whitespace.
 */
/*! @var MarkupError::Code MarkupError::PARSE
 * Document was ill-formed.
 */
/*! @var MarkupError::Code MarkupError::UNKNOWN_ELEMENT
 * This error should be set by Glib::Markup::Parser virtual methods;
 * element wasn't known.
 */
/*! @var MarkupError::Code MarkupError::UNKNOWN_ATTRIBUTE
 * This error should be set by Glib::Markup::Parser virtual methods;
 * attribute wasn't known.
 */
/*! @var MarkupError::Code MarkupError::INVALID_CONTENT
 * This error should be set by Glib::Markup::Parser virtual methods;
 * something was wrong with contents of the document, e.g. invalid attribute value.
 */

/** @} group Markup */


namespace Markup
{

class ParseContext;

/** @ingroup Markup */
typedef Glib::MarkupError Error;


/** Escapes text so that the markup parser will parse it verbatim.
 * Less than, greater than, ampersand, etc. are replaced with the corresponding
 * entities.  This function would typically be used when writing out a file to
 * be parsed with the markup parser.
 *
 * The argument is taken by const reference and is left untouched; the escaped
 * result is handed back as a new string.
 * @ingroup Markup
 * @param text Some valid UTF-8 text.
 * @return Escaped text.
 */
Glib::ustring escape_text(const Glib::ustring& text);


/** There are no flags right now. Pass <tt>Glib::Markup::ParseFlags(0)</tt> for
 * the flags argument to all functions (this should be the default argument
 * anyway).
 */
_WRAP_ENUM(ParseFlags, GMarkupParseFlags, NO_GTYPE, s#^MARKUP_##)

/*! @var Markup::ParseFlags DO_NOT_USE_THIS_UNSUPPORTED_FLAG
 * Flag you should not use.
 */


/** Comparison functor that orders the keys of Markup::Parser::AttributeMap.
 * @ingroup Markup
 * The default <tt>std::less<></tt> predicate would fall back to
 * <tt>operator<(const ustring& lhs, const ustring& rhs)</tt>.  In contrast
 * to that operator, the AttributeKeyLess predicate is locale-independent,
 * which is both more correct and much more efficient.
 *
 * The nested type names describe the two argument types and the result type
 * of operator()().
 */
class AttributeKeyLess
{
public:
  using first_argument_type  = Glib::ustring;
  using second_argument_type = Glib::ustring;
  using result_type          = bool;

  /** Compares two attribute names.
   * @param lhs The left-hand attribute name.
   * @param rhs The right-hand attribute name.
   * @return Whether @a lhs is ordered before @a rhs.
   */
  bool operator()(const Glib::ustring& lhs, const Glib::ustring& rhs) const;
};


#ifndef DOXYGEN_SHOULD_SKIP_THIS
class ParserCallbacks;
#endif

/** The abstract markup parser base class.
 * @ingroup Markup
 * To implement a parser for your markup format, derive from
 * Glib::Markup::Parser and implement the virtual methods.
 *
 * You don't have to override all of the virtual methods.  If a particular
 * method is not implemented, the data passed to it will be ignored.  Except
 * for the error method, any of these callbacks can throw an error exception;
 * in particular the MarkupError::UNKNOWN_ELEMENT,
 * MarkupError::UNKNOWN_ATTRIBUTE, and MarkupError::INVALID_CONTENT errors
 * are intended to be thrown from these overridden methods. If you throw an
 * error from a method, Glib::Markup::ParseContext::parse() will report that
 * error back to its caller.
 *
 * Parser objects cannot be copied; the copy constructor and the copy
 * assignment operator are deleted.
 */
class Parser : public sigc::trackable
{
public:
  /** The container type handed to on_start_element(): attribute names mapped
   * to attribute values, ordered by Markup::AttributeKeyLess.
   */
  typedef std::map<Glib::ustring, Glib::ustring, Glib::Markup::AttributeKeyLess> AttributeMap;

  /** Pure virtual destructor; Parser can only be used as a base class.
   */
  virtual ~Parser() = 0;

protected:
  /** Constructs a Parser object.
   * Note that Markup::Parser is an abstract class which can't be instantiated
   * directly.  To implement the parser for your markup format, derive from
   * Markup::Parser and implement the virtual methods.
   */
  Parser();

  // Copying a Parser is not allowed.
  Parser(const Parser&) = delete;
  Parser& operator=(const Parser&) = delete;

  /** Move constructor, available to derived classes only.
   * @param other The Parser to move from.
   */
  Parser(Parser&& other) noexcept;

  /** Move assignment operator, available to derived classes only.
   * @param other The Parser to move from.
   * @return A reference to this Parser.
   */
  Parser& operator=(Parser&& other) noexcept;

  /** Called for open tags <tt>\<foo bar="baz"\></tt>.
   * This virtual method is invoked when the opening tag of an element is seen.
   * @param context The Markup::ParseContext object the parsed data belongs to.
   * @param element_name The name of the element.
   * @param attributes A map of attribute name/value pairs.
   * @throw Glib::MarkupError An exception <em>you</em> should throw if
   * something went wrong, for instance if an unknown attribute name was
   * encountered.  In particular the MarkupError::UNKNOWN_ELEMENT,
   * MarkupError::UNKNOWN_ATTRIBUTE, and MarkupError::INVALID_CONTENT
   * errors are intended to be thrown from user-implemented methods.
   */
  virtual void on_start_element(ParseContext&        context,
                                const Glib::ustring& element_name,
                                const AttributeMap&  attributes);

  /** Called for close tags <tt>\</foo\></tt>.
   * This virtual method is invoked when the closing tag of an element is seen.
   * @param context The Markup::ParseContext object the parsed data belongs to.
   * @param element_name The name of the element.
   * @throw Glib::MarkupError An exception <em>you</em> should throw if
   * something went wrong, for instance if the element is not one your
   * format knows about.  In particular the MarkupError::UNKNOWN_ELEMENT,
   * MarkupError::UNKNOWN_ATTRIBUTE, and MarkupError::INVALID_CONTENT
   * errors are intended to be thrown from user-implemented methods.
   */
  virtual void on_end_element(ParseContext& context, const Glib::ustring& element_name);

  /** Called for character data.
   * This virtual method is invoked when some text is seen (text is always
   * inside an element).
   * @param context The Markup::ParseContext object the parsed data belongs to.
   * @param text The parsed text in UTF-8 encoding.
   * @throw Glib::MarkupError An exception <em>you</em> should throw if
   * something went wrong, for instance if the text is not acceptable content
   * for your format.  In particular the MarkupError::UNKNOWN_ELEMENT,
   * MarkupError::UNKNOWN_ATTRIBUTE, and MarkupError::INVALID_CONTENT
   * errors are intended to be thrown from user-implemented methods.
   */
  virtual void on_text(ParseContext& context, const Glib::ustring& text);

  /** Called for strings that should be re-saved verbatim in this same
   * position, but are not otherwise interpretable.
   * This virtual method is invoked for comments, processing instructions and
   * doctype declarations; if you're re-writing the parsed document, write the
   * passthrough text back out in the same position.
   * @param context The Markup::ParseContext object the parsed data belongs to.
   * @param passthrough_text The text that should be passed through.
   * @throw Glib::MarkupError An exception <em>you</em> should throw if
   * something went wrong, for instance if the passthrough text is not
   * acceptable in your format.  In particular the MarkupError::UNKNOWN_ELEMENT,
   * MarkupError::UNKNOWN_ATTRIBUTE, and MarkupError::INVALID_CONTENT
   * errors are intended to be thrown from user-implemented methods.
   */
  virtual void on_passthrough(ParseContext& context, const Glib::ustring& passthrough_text);

  /** Called on error, including one thrown by an overridden virtual method.
   * Unlike the other handlers, this one is not meant to throw an error
   * exception itself.
   * @param context The Markup::ParseContext object the parsed data belongs to.
   * @param error A MarkupError object with detailed information about the error.
   */
  virtual void on_error(ParseContext& context, const MarkupError& error);

private:

#ifndef DOXYGEN_SHOULD_SKIP_THIS
  // Lets ParserCallbacks reach the protected on_*() handlers declared above.
  // NOTE(review): ParserCallbacks is only forward-declared in this file;
  // presumably it is the glue that forwards the C parser callbacks to these
  // virtual methods -- confirm against the implementation file.
  friend class Glib::Markup::ParserCallbacks;
#endif
};


/** A parse context is used to parse marked-up documents.
 * @ingroup Markup
 * You can feed any number of documents into a context, as long as no errors
 * occur; once an error occurs, the parse context can't continue to parse text
 * (you have to destroy it and create a new parse context).
 *
 * A ParseContext cannot be copied; the copy constructor and the copy
 * assignment operator are deleted.  It can be moved.
 */
class ParseContext : public sigc::trackable
{
public:
  /** Creates a new parse context.
   * @param parser A Markup::Parser instance.
   * @param flags Bitwise combination of Markup::ParseFlags.
   */
  // NOTE(review): the class stores a Markup::Parser pointer (parser_), so the
  // parser presumably has to stay alive for as long as this context uses it --
  // confirm against the implementation file.
  explicit ParseContext(Parser& parser, ParseFlags flags = ParseFlags(0));

  // Copying a ParseContext is not allowed.
  ParseContext(const ParseContext&) = delete;
  ParseContext& operator=(const ParseContext&) = delete;

  /** Move constructor.
   * @param other The ParseContext to move from.
   */
  ParseContext(ParseContext&& other) noexcept;

  /** Move assignment operator.
   * @param other The ParseContext to move from.
   * @return A reference to this ParseContext.
   */
  ParseContext& operator=(ParseContext&& other) noexcept;

  virtual ~ParseContext();

  /** Feed some data to the ParseContext.
   * The data need not be valid UTF-8; an error will be signalled if it's
   * invalid. The data need not be an entire document; you can feed a document
   * into the parser incrementally, via multiple calls to this function.
   * Typically, as you receive data from a network connection or file, you feed
   * each received chunk of data into this function, aborting the process if an
   * error occurs. Once an error is reported, no further data may be fed to the
   * ParseContext; all errors are fatal.
   * @param text Chunk of text to parse.
   * @throw Glib::MarkupError
   */
  void parse(const Glib::ustring& text);

  /** Feed some data to the ParseContext.
   * The data need not be valid UTF-8; an error will be signalled if it's
   * invalid. The data need not be an entire document; you can feed a document
   * into the parser incrementally, via multiple calls to this function.
   * Typically, as you receive data from a network connection or file, you feed
   * each received chunk of data into this function, aborting the process if an
   * error occurs. Once an error is reported, no further data may be fed to the
   * ParseContext; all errors are fatal.
   * @param text_begin Begin of chunk of text to parse.
   * @param text_end End of chunk of text to parse.
   * @throw Glib::MarkupError
   */
  void parse(const char* text_begin, const char* text_end);

  /** Signals to the ParseContext that all data has been fed into the parse
   * context with parse(). This method reports an error if the document isn't
   * complete, for example if elements are still open.
   * @throw Glib::MarkupError
   */
  void end_parse();

  /** Retrieves the name of the currently open element.
   * @return The name of the currently open element, or <tt>""</tt>.
   */
  Glib::ustring get_element() const;

  /** Retrieves the current line number.
   * Intended for use in error messages; there are no strict semantics for what
   * constitutes the "current" line number other than "the best number we could
   * come up with for error messages."
   * @return The current line number.
   */
  int get_line_number() const;

  /** Retrieves the number of the current character on the current line.
   * Intended for use in error messages; there are no strict semantics for what
   * constitutes the "current" character number other than "the best number we
   * could come up with for error messages."
   * @return The current character number on the current line.
   */
  int get_char_number() const;

  /** Retrieves the parser pointer held by this context.
   * @return The stored Markup::Parser pointer.
   */
  Parser*       get_parser()       { return parser_; }

  /** Retrieves the parser pointer held by this context.
   * @return The stored Markup::Parser pointer, as a pointer to const.
   */
  const Parser* get_parser() const { return parser_; }

#ifndef DOXYGEN_SHOULD_SKIP_THIS
  // Accessors for the GMarkupParseContext pointer held by this wrapper.
  GMarkupParseContext*       gobj()       { return gobject_; }
  const GMarkupParseContext* gobj() const { return gobject_; }
#endif

private:
  // The parser pointer returned by get_parser().
  Markup::Parser*       parser_;
  // The C parse context pointer returned by gobj().
  GMarkupParseContext*  gobject_;

  // NOTE(review): only declared here; presumably handed to the C API as the
  // destroy notification for the user data of gobject_ -- confirm against the
  // implementation file.
  static void destroy_notify_callback(void* data);
};

} // namespace Markup

} // namespace Glib