diff options
Diffstat (limited to 'glib/gregex.c')
-rw-r--r-- | glib/gregex.c | 60 |
1 files changed, 60 insertions, 0 deletions
diff --git a/glib/gregex.c b/glib/gregex.c index bf4380a38..f067b0000 100644 --- a/glib/gregex.c +++ b/glib/gregex.c @@ -33,6 +33,66 @@ #include "pcre/pcre.h" #endif +/** + * SECTION:gregex + * @title: Perl-compatible regular expressions + * @short_description: matches strings against regular expressions + * @see_also: <xref linkend="glib-regex-syntax"/> + * + * The <function>g_regex_*()</function> functions implement regular + * expression pattern matching using syntax and semantics similar to + * Perl regular expressions. + * + * Some functions accept a @start_position argument; setting it differs + * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL + * in the case of a pattern that begins with any kind of lookbehind assertion. + * For example, consider the pattern "\Biss\B" which finds occurrences of "iss" + * in the middle of words. ("\B" matches only if the current position in the + * subject is not a word boundary.) When applied to the string "Mississippi" + * from the fifth byte, namely "issippi", it does not match, because "\B" is + * always false at the start of the subject, which is deemed to be a word + * boundary. However, if the entire string is passed, but with + * @start_position set to 4, it finds the second occurrence of "iss" because + * it is able to look behind the starting point to discover that it is + * preceded by a letter. + * + * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed + * to these functions must be encoded in UTF-8. The lengths and the positions + * inside the strings are in bytes and not in characters, so, for instance, + * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a + * single character. If you set #G_REGEX_RAW the strings can be non-valid + * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two + * bytes and two characters long. 
+ * + * When matching a pattern, "\n" matches only against a "\n" character in + * the string, and "\r" matches only a "\r" character. To match any newline + * sequence use "\R". This particular group matches either the two-character + * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed, + * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), + * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line + * separator, U+2028), or PS (paragraph separator, U+2029). + * + * The behaviour of the dot, circumflex, and dollar metacharacters is + * affected by newline characters; the default is to recognize any newline + * character (the same characters recognized by "\R"). This can be changed + * with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF + * compile options, and with #G_REGEX_MATCH_NEWLINE_ANY, + * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and + * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also + * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an + * unescaped "#" outside a character class is encountered. This indicates + * a comment that lasts until after the next newline. + * + * Creating and manipulating the same #GRegex structure from different + * threads is not a problem as #GRegex does not modify its internal + * state between creation and destruction; on the other hand, #GMatchInfo + * is not threadsafe. + * + * The low-level regular expression functionality is provided by + * the excellent <ulink url="http://www.pcre.org/">PCRE</ulink> library + * written by Philip Hazel. + */ + /* Mask of all the possible values for GRegexCompileFlags. */ #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \ G_REGEX_MULTILINE | \ |