summary refs log tree commit diff
path: root/lib/regexprops.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/regexprops.c')
-rw-r--r-- lib/regexprops.c 614
1 files changed, 614 insertions, 0 deletions
diff --git a/lib/regexprops.c b/lib/regexprops.c
new file mode 100644
index 0000000..3409b4c
--- /dev/null
+++ b/lib/regexprops.c
@@ -0,0 +1,614 @@
+/* regexprops.c -- document the properties of the regular expressions
+ understood by gnulib.
+
+ Copyright 2005, 2007, 2010, 2011, 2015 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+/*
+ The output of this program is included in the GNU findutils source
+ distribution. The copying conditions for that file are generated
+ by the copying() function below.
+*/
+
+/* Written by James Youngman, <jay@gnu.org>. */
+
+/* config.h must be included first. */
+#include <config.h>
+
+/* system headers */
+#include <errno.h>
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+/* gnulib headers */
+#include "progname.h"
+
+/* find headers */
+#include "regextype.h"
+
+/* Write the string S to standard output exactly as given.
+   ESCAPE lets callers mark text that might need quoting, but it is
+   currently ignored: nothing is escaped.  */
+static void
+output (const char *s, int escape)
+{
+  (void) escape;
+  fprintf (stdout, "%s", s);
+}
+
+
+/* Emit a single line break.  */
+static void
+newline (void)
+{
+  static const char line_break[] = "\n";
+
+  output (line_break, 0);
+}
+
+/* Emit S as running text.  It is passed to output() with the escape
+   flag set.  */
+static void
+content (const char *s)
+{
+  const int escape = 1;
+
+  output (s, escape);
+}
+
+/* Emit S verbatim.  It is passed to output() with the escape flag
+   clear.  */
+static void
+literal (const char *s)
+{
+  const int escape = 0;
+
+  output (s, escape);
+}
+
+/* Emit the markup command S.  It is passed to output() with the
+   escape flag clear.  */
+static void
+directive (const char *s)
+{
+  const int escape = 0;
+
+  output (s, escape);
+}
+
+/* Emit S as one comment line: the "@c " prefix, then S, then a line
+   break.  */
+static void
+comment (const char *s)
+{
+  static const char comment_prefix[] = "@c ";
+
+  directive (comment_prefix);
+  literal (s);
+  newline ();
+}
+
+/* Start a new "@item" entry whose first line is S.  A line break is
+   emitted both before the "@item " prefix and after S.  */
+static void
+enum_item (const char *s)
+{
+  static const char item_prefix[] = "@item ";
+
+  newline ();
+  directive (item_prefix);
+  literal (s);
+  newline ();
+}
+
+/* Emit the "@node" and "@subsection" header lines for the regular
+   expression type called NAME.  Both lines carry the title
+   "NAME regular expression syntax"; in the subsection line NAME is
+   wrapped in @samp{}.  NEXT, PREV and UP are accepted but unused.  */
+static void
+begin_subsection (const char *name,
+                  const char *next,
+                  const char *prev,
+                  const char *up)
+{
+  static const char title_suffix[] = " regular expression syntax";
+
+  (void) next;
+  (void) prev;
+  (void) up;
+
+  newline ();
+
+  /* Node line.  */
+  directive ("@node ");
+  content (name);
+  content (title_suffix);
+  newline ();
+
+  /* Subsection heading.  */
+  directive ("@subsection ");
+  literal ("@samp{");
+  content (name);
+  literal ("}");
+  content (title_suffix);
+  newline ();
+}
+
+/* Open a "@table" block.  MARKUP is written verbatim after "@table "
+   (for example "@samp").  The line is surrounded by line breaks.  */
+static void
+begintable_markup (char const *markup)
+{
+  static const char table_prefix[] = "@table ";
+
+  newline ();
+  directive (table_prefix);
+  literal (markup);
+  newline ();
+}
+
+/* Close a block opened by begintable_markup(): emit "@end table" on a
+   line of its own.  */
+static void
+endtable (void)
+{
+  newline ();
+  directive ("@end table");
+  newline ();
+}
+
+/* Open an enumerated list: emit "@enumerate" on a line of its own.  */
+static void
+beginenum (void)
+{
+  newline ();
+  directive ("@enumerate");
+  newline ();
+}
+
+/* Close a list opened by beginenum(): emit "@end enumerate" on a line
+   of its own.  */
+static void
+endenum (void)
+{
+  newline ();
+  directive ("@end enumerate");
+  newline ();
+}
+
+/* Separate paragraphs by emitting two line breaks.  */
+static void
+newpara (void)
+{
+  content ("\n\n");
+}
+
+
+/* Emit a description of the regular expression dialect selected by
+   OPTIONS, a bit mask that is tested against the RE_* syntax flags.
+
+   The text is written to standard output through the markup helpers
+   above and covers, in this order: the "." character, the + and ?
+   operators, bracket expressions, GNU extensions, grouping and
+   back-references, alternation, the ^ and $ anchors, where the
+   repetition operators are special, intervals, and the matching
+   policy.  */
+static void
+describe_regex_syntax (int options)
+{
+  /* The "." character.  */
+  newpara ();
+  content ("The character @samp{.} matches any single character");
+  if ((options & RE_DOT_NEWLINE) == 0)
+    {
+      content (" except newline");
+    }
+  if (options & RE_DOT_NOT_NULL)
+    {
+      /* Join onto " except newline" when that was printed, otherwise
+         start the exception list here.  */
+      if ((options & RE_DOT_NEWLINE) == 0)
+        content (" and");
+      else
+        content (" except");
+
+      content (" the null character");
+    }
+  content (". ");
+  newpara ();
+
+  /* The + and ? operators.  */
+  if (!(options & RE_LIMITED_OPS))
+    {
+      begintable_markup ("@samp");
+      if (options & RE_BK_PLUS_QM)
+        {
+          enum_item ("\\+");
+          content ("indicates that the regular expression should match one"
+                   " or more occurrences of the previous atom or regexp. ");
+          enum_item ("\\?");
+          content ("indicates that the regular expression should match zero"
+                   " or one occurrence of the previous atom or regexp. ");
+          enum_item ("+ and ? ");
+          content ("match themselves. ");
+        }
+      else
+        {
+          enum_item ("+");
+          content ("indicates that the regular expression should match one"
+                   " or more occurrences of the previous atom or regexp. ");
+          enum_item ("?");
+          content ("indicates that the regular expression should match zero"
+                   " or one occurrence of the previous atom or regexp. ");
+          enum_item ("\\+");
+          /* Terminate the sentence, as the \? entry below does.  */
+          literal ("matches a @samp{+}. ");
+          enum_item ("\\?");
+          literal ("matches a @samp{?}. ");
+        }
+      endtable ();
+    }
+
+  newpara ();
+
+  /* Bracket expressions.  */
+  content ("Bracket expressions are used to match ranges of characters. ");
+  literal ("Bracket expressions where the range is backward, for example @samp{[z-a]}, are ");
+  if (options & RE_NO_EMPTY_RANGES)
+    content ("invalid");
+  else
+    content ("ignored");
+  content (". ");
+
+  if (options & RE_BACKSLASH_ESCAPE_IN_LISTS)
+    literal ("Within square brackets, @samp{\\} can be used to quote "
+             "the following character. ");
+  else
+    literal ("Within square brackets, @samp{\\} is taken literally. ");
+
+  if (options & RE_CHAR_CLASSES)
+    content ("Character classes are supported; for example "
+             "@samp{[[:digit:]]} will match a single decimal digit. ");
+  else
+    literal ("Character classes are not supported, so for example "
+             "you would need to use @samp{[0-9]} "
+             "instead of @samp{[[:digit:]]}. ");
+
+  if (options & RE_HAT_LISTS_NOT_NEWLINE)
+    {
+      literal ("Non-matching lists @samp{[^@dots{}]} do not ever match newline. ");
+    }
+  newpara ();
+
+  /* GNU extensions.  */
+  if (options & RE_NO_GNU_OPS)
+    {
+      content ("GNU extensions are not supported and so "
+               "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} "
+               "match "
+               "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively. ");
+    }
+  else
+    {
+      content ("GNU extensions are supported:");
+      beginenum ();
+      enum_item ("@samp{\\w} matches a character within a word");
+      enum_item ("@samp{\\W} matches a character which is not within a word");
+      enum_item ("@samp{\\<} matches the beginning of a word");
+      enum_item ("@samp{\\>} matches the end of a word");
+      enum_item ("@samp{\\b} matches a word boundary");
+      enum_item ("@samp{\\B} matches characters which are not a word boundary");
+      enum_item ("@samp{\\`} matches the beginning of the whole input");
+      enum_item ("@samp{\\'} matches the end of the whole input");
+      endenum ();
+    }
+
+  newpara ();
+
+  /* Grouping and back-references.  */
+  if (options & RE_NO_BK_PARENS)
+    {
+      literal ("Grouping is performed with parentheses @samp{()}. ");
+
+      if (options & RE_UNMATCHED_RIGHT_PAREN_ORD)
+        literal ("An unmatched @samp{)} matches just itself. ");
+    }
+  else
+    {
+      literal ("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}. ");
+    }
+
+  if (options & RE_NO_BK_REFS)
+    {
+      content ("A backslash followed by a digit matches that digit. ");
+    }
+  else
+    {
+      literal ("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number. For example @samp{\\2} matches the second group expression. The order of group expressions is determined by the position of their opening parenthesis ");
+      if (options & RE_NO_BK_PARENS)
+        literal ("@samp{(}");
+      else
+        literal ("@samp{\\(}");
+      content (". ");
+    }
+
+  newpara ();
+
+  /* Alternation.  */
+  if (!(options & RE_LIMITED_OPS))
+    {
+      if (options & RE_NO_BK_VBAR)
+        literal ("The alternation operator is @samp{|}. ");
+      else
+        literal ("The alternation operator is @samp{\\|}. ");
+    }
+  newpara ();
+
+  /* The ^ and $ anchors.  */
+  if (options & RE_CONTEXT_INDEP_ANCHORS)
+    {
+      literal ("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets. Within brackets, @samp{^} can be used to invert the membership of the character class being specified. ");
+    }
+  else
+    {
+      literal ("The character @samp{^} only represents the beginning of a string when it appears:");
+      beginenum ();
+      enum_item ("\nAt the beginning of a regular expression");
+      enum_item ("After an open-group, signified by ");
+      if (options & RE_NO_BK_PARENS)
+        {
+          literal ("@samp{(}");
+        }
+      else
+        {
+          literal ("@samp{\\(}");
+        }
+      newline ();
+      if (!(options & RE_LIMITED_OPS))
+        {
+          if (options & RE_NEWLINE_ALT)
+            enum_item ("After a newline");
+
+          if (options & RE_NO_BK_VBAR)
+            enum_item ("After the alternation operator @samp{|}");
+          else
+            enum_item ("After the alternation operator @samp{\\|}");
+        }
+      endenum ();
+
+      newpara ();
+      literal ("The character @samp{$} only represents the end of a string when it appears:");
+      beginenum ();
+      enum_item ("At the end of a regular expression");
+      enum_item ("Before a close-group, signified by ");
+      if (options & RE_NO_BK_PARENS)
+        {
+          literal ("@samp{)}");
+        }
+      else
+        {
+          literal ("@samp{\\)}");
+        }
+      if (!(options & RE_LIMITED_OPS))
+        {
+          if (options & RE_NEWLINE_ALT)
+            enum_item ("Before a newline");
+
+          if (options & RE_NO_BK_VBAR)
+            enum_item ("Before the alternation operator @samp{|}");
+          else
+            enum_item ("Before the alternation operator @samp{\\|}");
+        }
+      endenum ();
+    }
+  newpara ();
+
+  /* Where the repetition operators are special.  */
+  if (!(options & RE_LIMITED_OPS))
+    {
+      if ((options & RE_CONTEXT_INDEP_OPS)
+          && !(options & RE_CONTEXT_INVALID_OPS))
+        {
+          literal ("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression. ");
+        }
+      else
+        {
+          /* RE_BK_PLUS_QM moves only + and ? behind a backslash; the
+             star operator is always written as a plain "*".  */
+          if (options & RE_BK_PLUS_QM)
+            literal ("@samp{*}, @samp{\\+} and @samp{\\?} ");
+          else
+            literal ("@samp{*}, @samp{+} and @samp{?} ");
+
+          if (options & RE_CONTEXT_INVALID_OPS)
+            {
+              content ("are special at any point in a regular expression except the following places, where they are not allowed:");
+            }
+          else
+            {
+              content ("are special at any point in a regular expression except:");
+            }
+
+          beginenum ();
+          enum_item ("At the beginning of a regular expression");
+          enum_item ("After an open-group, signified by ");
+          if (options & RE_NO_BK_PARENS)
+            {
+              literal ("@samp{(}");
+            }
+          else
+            {
+              literal ("@samp{\\(}");
+            }
+
+          /* RE_LIMITED_OPS is already known to be clear here.  */
+          if (options & RE_NEWLINE_ALT)
+            enum_item ("After a newline");
+
+          if (options & RE_NO_BK_VBAR)
+            enum_item ("After the alternation operator @samp{|}");
+          else
+            enum_item ("After the alternation operator @samp{\\|}");
+
+          endenum ();
+        }
+    }
+
+  newpara ();
+
+  /* Intervals.  Braces are written as @{ and @} in the output.  */
+  if (options & RE_INTERVALS)
+    {
+      if (options & RE_NO_BK_BRACES)
+        {
+          literal ("Intervals are specified by @samp{@{} and @samp{@}}. ");
+          if (options & RE_INVALID_INTERVAL_ORD)
+            {
+              literal ("Invalid intervals are treated as literals, for example @samp{a@{1} is treated as @samp{a\\@{1}. ");
+            }
+          else
+            {
+              literal ("Invalid intervals such as @samp{a@{1z} are not accepted. ");
+            }
+        }
+      else
+        {
+          literal ("Intervals are specified by @samp{\\@{} and @samp{\\@}}. ");
+          if (options & RE_INVALID_INTERVAL_ORD)
+            {
+              literal ("Invalid intervals are treated as literals, for example @samp{a\\@{1} is treated as @samp{a@{1}. ");
+            }
+          else
+            {
+              literal ("Invalid intervals such as @samp{a\\@{1z} are not accepted. ");
+            }
+        }
+    }
+
+  newpara ();
+
+  /* Matching policy.  */
+  if (options & RE_NO_POSIX_BACKTRACKING)
+    {
+      content ("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match. ");
+    }
+  else
+    {
+      content ("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups. ");
+    }
+  newpara ();
+}
+
+
+/* Emit the copying conditions of the generated document, one comment
+   line per entry of the table below.  */
+static void
+copying (void)
+{
+  /* Each entry becomes one comment line; NULL terminates the table.  */
+  static const char *const copy_para[] =
+    {
+      "Copyright (C) 1994, 1996, 1998, 2000, 2001, 2003, 2004, 2005, 2006,",
+      "2007, 2009, 2010, 2011 Free Software Foundation, Inc.",
+      "",
+      "Permission is granted to copy, distribute and/or modify this document",
+      "under the terms of the GNU Free Documentation License, Version 1.3 or",
+      "any later version published by the Free Software Foundation; with no",
+      "Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.",
+      "A copy of the license is included in the ``GNU Free",
+      "Documentation License'' file as part of this distribution.",
+      NULL
+    };
+  const char *const *para;
+
+  for (para = copy_para; *para != NULL; ++para)
+    comment (*para);
+}
+
+/* Return nonzero if the regular expression type with index IX should
+   be left out for CONTEXT, that is, when the value returned by
+   get_regex_type_context (IX) has no bit in common with CONTEXT.  */
+static int
+ignore (int ix, const unsigned int context)
+{
+  if (get_regex_type_context (ix) & context)
+    return 0;
+  return 1;
+}
+
+/* Emit a "@menu" block with one entry of the form
+   "* NAME regular expression syntax::" for every regular expression
+   type that is not ignored for CONTEXT.  Types are visited by
+   increasing index until get_regex_type_name() returns NULL.  */
+static void
+menu (unsigned int context)
+{
+  int i;
+  const char *name;
+
+  output ("@menu\n", 0);
+  for (i = 0; (name = get_regex_type_name (i)) != NULL; ++i)
+    {
+      if (!ignore (i, context))
+        {
+          output ("* ", 0);
+          output (name, 0);
+          content (" regular expression syntax");
+          output ("::", 0);
+          newline ();
+        }
+    }
+  output ("@end menu\n", 0);
+}
+
+
+
+/* Return the name of the first regular expression type at index IX or
+   later that is not ignored for CONTEXT.  Return the empty string,
+   never NULL, when there is no such type.  */
+static const char *
+get_next (unsigned int ix, unsigned int context)
+{
+  const char *name;
+
+  for (; (name = get_regex_type_name (ix)) != NULL; ++ix)
+    {
+      if (!ignore (ix, context))
+        return name;
+    }
+  return "";
+}
+
+
+/* Emit the whole document for one context: the copying conditions, a
+   comment naming CONTEXTNAME, the menu, and then one subsection per
+   regular expression type that is not ignored for CONTEXT.
+
+   A type for which get_regex_type_synonym() returns a non-negative
+   index is described as a synonym for that other type; any other type
+   gets a full description of its syntax flags.  Ignored types are
+   reported on standard error.  UP is passed on to begin_subsection().  */
+static void
+describe_all (const char *contextname,
+              unsigned int context,
+              const char *up)
+{
+  const char *name, *next, *previous;
+  int options;
+  int i, parent;
+
+  copying ();
+  newline ();
+  literal ("@c this regular expression description is for: ");
+  literal (contextname);
+  newline ();
+  newline ();
+  menu (context);
+
+  /* PREVIOUS tracks the most recent type that was actually emitted.  */
+  previous = "";
+
+  for (i = 0; (name = get_regex_type_name (i)) != NULL; ++i)
+    {
+      if (ignore (i, context))
+        {
+          fprintf (stderr,
+                   "Skipping regexp type %s for context %s\n",
+                   name, contextname);
+          continue;
+        }
+
+      options = get_regex_type_flags (i);
+
+      /* get_next() returns "" rather than NULL when nothing follows.  */
+      next = get_next (i + 1, context);
+      begin_subsection (name, next, previous, up);
+
+      parent = get_regex_type_synonym (i);
+      if (parent >= 0)
+        {
+          content ("This is a synonym for ");
+          content (get_regex_type_name (parent));
+          content (".");
+        }
+      else
+        {
+          describe_regex_syntax (options);
+        }
+      previous = name;
+    }
+}
+
+
+
+/* Entry point.  Usage: regexprops [UP [CONTEXT]]
+
+   UP is passed on to describe_all() and defaults to the empty string.
+   CONTEXT must be "findutils", "generic" or "all" and defaults to
+   "all"; it selects which regular expression types are described.
+
+   Return 0 on success, or 1 if CONTEXT is not recognised or the
+   output could not be written.  */
+int
+main (int argc, char *argv[])
+{
+  const char *up = "";
+  unsigned int context = CONTEXT_ALL;
+  const char *contextname = "all";
+
+  /* argv[0] is only available when argc is nonzero.  */
+  if (argc)
+    set_program_name (argv[0]);
+  else
+    set_program_name ("regexprops");
+
+  if (argc > 1)
+    {
+      up = argv[1];
+    }
+  if (argc > 2)
+    {
+      contextname = argv[2];
+      if (0 == strcmp (contextname, "findutils"))
+        context = CONTEXT_FINDUTILS;
+      else if (0 == strcmp (contextname, "generic"))
+        context = CONTEXT_GENERIC;
+      else if (0 == strcmp (contextname, "all"))
+        context = CONTEXT_ALL;
+      else
+        {
+          fprintf (stderr, "Unexpected context %s\n",
+                   contextname);
+          return 1;
+        }
+    }
+
+  describe_all (contextname, context, up);
+
+  /* The generated text is the whole purpose of this program, so do not
+     report success if it could not be written.  */
+  if (fflush (stdout) != 0 || ferror (stdout))
+    {
+      fprintf (stderr, "Failed to write to standard output\n");
+      return 1;
+    }
+  return 0;
+}