summaryrefslogtreecommitdiff
path: root/info/search.c
diff options
context:
space:
mode:
Diffstat (limited to 'info/search.c')
-rw-r--r--info/search.c677
1 files changed, 677 insertions, 0 deletions
diff --git a/info/search.c b/info/search.c
new file mode 100644
index 0000000..44010bf
--- /dev/null
+++ b/info/search.c
@@ -0,0 +1,677 @@
+/* search.c -- searching large bodies of text.
+ $Id: search.c,v 1.9 2008/06/11 09:55:42 gray Exp $
+
+ Copyright (C) 1993, 1997, 1998, 2002, 2004, 2007, 2008
+ Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Originally written by Brian Fox (bfox@ai.mit.edu). */
+
+#include "info.h"
+#include <regex.h>
+
+#include "search.h"
+#include "nodes.h"
+
+/* The search functions take two arguments:
+
+ 1) a string to search for, and
+
+ 2) a pointer to a SEARCH_BINDING which contains the buffer, start,
+ and end of the search.
+
+ They return a long, which is the offset from the start of the buffer
+ at which the match was found. An offset of -1 indicates failure. */
+
+/* A function which makes a binding with buffer and bounds. */
+SEARCH_BINDING *
+make_binding (char *buffer, long int start, long int end)
+{
+ SEARCH_BINDING *binding;
+
+ binding = xmalloc (sizeof (SEARCH_BINDING));
+ binding->buffer = buffer;
+ binding->start = start;
+ binding->end = end;
+ binding->flags = 0;
+
+ return binding;
+}
+
+/* Make a copy of BINDING without duplicating the data. */
+SEARCH_BINDING *
+copy_binding (SEARCH_BINDING *binding)
+{
+ SEARCH_BINDING *copy;
+
+ copy = make_binding (binding->buffer, binding->start, binding->end);
+ copy->flags = binding->flags;
+ return copy;
+}
+
+
+/* **************************************************************** */
+/* */
+/* The Actual Searching Functions */
+/* */
+/* **************************************************************** */
+
+/* Search forwards or backwards for the text delimited by BINDING.
+ The search is forwards if BINDING->start is greater than BINDING->end. */
+long
+search (char *string, SEARCH_BINDING *binding)
+{
+ long result;
+
+ /* If the search is backwards, then search backwards, otherwise forwards. */
+ if (binding->start > binding->end)
+ result = search_backward (string, binding);
+ else
+ result = search_forward (string, binding);
+
+ return result;
+}
+
+/* Search forwards or backwards for anything matching the regexp in the text
+ delimited by BINDING. The search is forwards if BINDING->start is greater
+ than BINDING->end.
+
+ If PRET is specified, it receives a copy of BINDING at the end of a
+ succeded search. Its START and END fields contain bounds of the found
+ string instance.
+*/
+long
+regexp_search (char *regexp, SEARCH_BINDING *binding, long length,
+ SEARCH_BINDING *pret)
+{
+ static char *previous_regexp = NULL;
+ static char *previous_content = NULL;
+ static int was_insensitive = 0;
+ static regex_t preg;
+ static regmatch_t *matches;
+ static int match_alloc = 0;
+ static int match_count = 0;
+ regoff_t pos;
+
+ if (previous_regexp == NULL
+ || ((binding->flags & S_FoldCase) != was_insensitive)
+ || (strcmp (previous_regexp, regexp) != 0))
+ {
+ /* need to compile a new regexp */
+ int result;
+ char *unescaped_regexp;
+ char *p, *q;
+
+ previous_content = NULL;
+
+ if (previous_regexp != NULL)
+ {
+ free (previous_regexp);
+ previous_regexp = NULL;
+ regfree (&preg);
+ }
+
+ was_insensitive = binding->flags & S_FoldCase;
+
+ /* expand the \n and \t in regexp */
+ unescaped_regexp = xmalloc (1 + strlen (regexp));
+ for (p = regexp, q = unescaped_regexp; *p != '\0'; p++, q++)
+ {
+ if (*p == '\\')
+ switch(*++p)
+ {
+ case 'n':
+ *q = '\n';
+ break;
+ case 't':
+ *q = '\t';
+ break;
+ case '\0':
+ *q = '\\';
+ p--;
+ break;
+ default:
+ *q++ = '\\';
+ *q = *p;
+ break;
+ }
+ else
+ *q = *p;
+ }
+ *q = '\0';
+
+ result = regcomp (&preg, unescaped_regexp,
+ REG_EXTENDED|
+ REG_NEWLINE|
+ (was_insensitive ? REG_ICASE : 0));
+ free (unescaped_regexp);
+
+ if (result != 0)
+ {
+ int size = regerror (result, &preg, NULL, 0);
+ char *buf = xmalloc (size);
+ regerror (result, &preg, buf, size);
+ info_error (_("regexp error: %s"), buf, NULL);
+ return -1;
+ }
+
+ previous_regexp = xstrdup(regexp);
+ }
+
+ if (previous_content != binding->buffer)
+ {
+ /* new buffer to search in, let's scan it */
+ regoff_t start = 0;
+ char saved_char;
+
+ previous_content = binding->buffer;
+ saved_char = previous_content[length-1];
+ previous_content[length-1] = '\0';
+
+ for (match_count = 0; start < length; )
+ {
+ int result = 0;
+ if (match_count >= match_alloc)
+ {
+ /* match list full. Initially allocate 256 entries, then double
+ every time we fill it */
+ match_alloc = (match_alloc > 0 ? match_alloc * 2 : 256);
+ matches = xrealloc (matches,
+ match_alloc * sizeof(regmatch_t));
+ }
+
+ result = regexec (&preg, &previous_content[start],
+ 1, &matches[match_count], 0);
+ if (result == 0)
+ {
+ if (matches[match_count].rm_eo == 0)
+ {
+ /* ignore empty matches */
+ start++;
+ }
+ else
+ {
+ matches[match_count].rm_so += start;
+ matches[match_count].rm_eo += start;
+ start = matches[match_count++].rm_eo;
+ }
+ }
+ else
+ {
+ break;
+ }
+ }
+ previous_content[length-1] = saved_char;
+ }
+
+ pos = binding->start;
+ if (pos > binding->end)
+ {
+ /* searching backward */
+ int i;
+ for (i = match_count - 1; i >= 0; i--)
+ {
+ if (matches[i].rm_so <= pos)
+ {
+ if (pret)
+ {
+ pret->buffer = binding->buffer;
+ pret->flags = binding->flags;
+ pret->start = matches[i].rm_so;
+ pret->end = matches[i].rm_eo;
+ }
+ return matches[i].rm_so;
+ }
+ }
+ }
+ else
+ {
+ /* searching forward */
+ int i;
+ for (i = 0; i < match_count; i++)
+ {
+ if (matches[i].rm_so >= pos)
+ {
+ if (pret)
+ {
+ pret->buffer = binding->buffer;
+ pret->flags = binding->flags;
+ pret->start = matches[i].rm_so;
+ pret->end = matches[i].rm_eo;
+ }
+ if (binding->flags & S_SkipDest)
+ return matches[i].rm_eo;
+ else
+ return matches[i].rm_so;
+ }
+ }
+ }
+
+ /* not found */
+ return -1;
+}
+
+/* Search forwards for STRING through the text delimited in BINDING. */
+long
+search_forward (char *string, SEARCH_BINDING *binding)
+{
+ register int c, i, len;
+ register char *buff, *end;
+ char *alternate = NULL;
+
+ len = strlen (string);
+
+ /* We match characters in the search buffer against STRING and ALTERNATE.
+ ALTERNATE is a case reversed version of STRING; this is cheaper than
+ case folding each character before comparison. Alternate is only
+ used if the case folding bit is turned on in the passed BINDING. */
+
+ if (binding->flags & S_FoldCase)
+ {
+ alternate = xstrdup (string);
+
+ for (i = 0; i < len; i++)
+ {
+ if (islower (alternate[i]))
+ alternate[i] = toupper (alternate[i]);
+ else if (isupper (alternate[i]))
+ alternate[i] = tolower (alternate[i]);
+ }
+ }
+
+ buff = binding->buffer + binding->start;
+ end = binding->buffer + binding->end + 1;
+
+ while (buff < (end - len))
+ {
+ for (i = 0; i < len; i++)
+ {
+ c = buff[i];
+
+ if ((c != string[i]) && (!alternate || c != alternate[i]))
+ break;
+ }
+
+ if (!string[i])
+ {
+ if (alternate)
+ free (alternate);
+ if (binding->flags & S_SkipDest)
+ buff += len;
+ return buff - binding->buffer;
+ }
+
+ buff++;
+ }
+
+ if (alternate)
+ free (alternate);
+
+ return -1;
+}
+
+/* Search for STRING backwards through the text delimited in BINDING. */
+long
+search_backward (char *input_string, SEARCH_BINDING *binding)
+{
+ register int c, i, len;
+ register char *buff, *end;
+ char *string;
+ char *alternate = NULL;
+
+ len = strlen (input_string);
+
+ /* Reverse the characters in the search string. */
+ string = xmalloc (1 + len);
+ for (c = 0, i = len - 1; input_string[c]; c++, i--)
+ string[i] = input_string[c];
+
+ string[c] = '\0';
+
+ /* We match characters in the search buffer against STRING and ALTERNATE.
+ ALTERNATE is a case reversed version of STRING; this is cheaper than
+ case folding each character before comparison. ALTERNATE is only
+ used if the case folding bit is turned on in the passed BINDING. */
+
+ if (binding->flags & S_FoldCase)
+ {
+ alternate = xstrdup (string);
+
+ for (i = 0; i < len; i++)
+ {
+ if (islower (alternate[i]))
+ alternate[i] = toupper (alternate[i]);
+ else if (isupper (alternate[i]))
+ alternate[i] = tolower (alternate[i]);
+ }
+ }
+
+ buff = binding->buffer + binding->start - 1;
+ end = binding->buffer + binding->end;
+
+ while (buff > (end + len))
+ {
+ for (i = 0; i < len; i++)
+ {
+ c = *(buff - i);
+
+ if (c != string[i] && (!alternate || c != alternate[i]))
+ break;
+ }
+
+ if (!string[i])
+ {
+ free (string);
+ if (alternate)
+ free (alternate);
+
+ if (binding->flags & S_SkipDest)
+ buff -= len;
+ return 1 + buff - binding->buffer;
+ }
+
+ buff--;
+ }
+
+ free (string);
+ if (alternate)
+ free (alternate);
+
+ return -1;
+}
+
+/* Find STRING in LINE, returning the offset of the end of the string.
+ Return an offset of -1 if STRING does not appear in LINE. The search
+ is bound by the end of the line (i.e., either NEWLINE or 0). */
+int
+string_in_line (char *string, char *line)
+{
+ register int end;
+ SEARCH_BINDING binding;
+
+ /* Find the end of the line. */
+ for (end = 0; line[end] && line[end] != '\n'; end++);
+
+ /* Search for STRING within these confines. */
+ binding.buffer = line;
+ binding.start = 0;
+ binding.end = end;
+ binding.flags = S_FoldCase | S_SkipDest;
+
+ return search_forward (string, &binding);
+}
+
+/* Return non-zero if STRING is the first text to appear at BINDING. */
+int
+looking_at (char *string, SEARCH_BINDING *binding)
+{
+ long search_end;
+
+ search_end = search (string, binding);
+
+ /* If the string was not found, SEARCH_END is -1. If the string was found,
+ but not right away, SEARCH_END is != binding->start. Otherwise, the
+ string was found at binding->start. */
+ return search_end == binding->start;
+}
+
+/* **************************************************************** */
+/* */
+/* Small String Searches */
+/* */
+/* **************************************************************** */
+
+/* Function names that start with "skip" are passed a string, and return
+ an offset from the start of that string. Function names that start
+ with "find" are passed a SEARCH_BINDING, and return an absolute position
+ marker of the item being searched for. "Find" functions return a value
+ of -1 if the item being looked for couldn't be found. */
+
+/* Return the index of the first non-whitespace character in STRING. */
+int
+skip_whitespace (char *string)
+{
+ register int i;
+
+ for (i = 0; string && whitespace (string[i]); i++);
+ return i;
+}
+
+/* Return the index of the first non-whitespace or newline character in
+ STRING. */
+int
+skip_whitespace_and_newlines (char *string)
+{
+ register int i;
+
+ for (i = 0; string && whitespace_or_newline (string[i]); i++);
+ return i;
+}
+
+/* Return the index of the first whitespace character in STRING. */
+int
+skip_non_whitespace (char *string)
+{
+ register int i;
+
+ for (i = 0; string && string[i] && !whitespace (string[i]); i++);
+ return i;
+}
+
+/* Return the index of the first non-node character in STRING. Note that
+ this function contains quite a bit of hair to ignore periods in some
+ special cases. This is because we here at GNU ship some info files which
+ contain nodenames that contain periods. No such nodename can start with
+ a period, or continue with whitespace, newline, or ')' immediately following
+ the period. If second argument NEWLINES_OKAY is non-zero, newlines should
+ be skipped while parsing out the nodename specification. */
+int
+skip_node_characters (char *string, int newlines_okay)
+{
+ register int c, i = 0;
+ int paren_seen = 0;
+ int paren = 0;
+
+ /* Handle special case. This is when another function has parsed out the
+ filename component of the node name, and we just want to parse out the
+ nodename proper. In that case, a period at the start of the nodename
+ indicates an empty nodename. */
+ if (string && *string == '.')
+ return 0;
+
+ if (string && *string == '(')
+ {
+ paren++;
+ paren_seen++;
+ i++;
+ }
+
+ for (; string && (c = string[i]); i++)
+ {
+ if (paren)
+ {
+ if (c == '(')
+ paren++;
+ else if (c == ')')
+ paren--;
+
+ continue;
+ }
+
+ /* If the character following the close paren is a space or period,
+ then this node name has no more characters associated with it. */
+ if (c == '\t' ||
+ c == ',' ||
+ c == INFO_TAGSEP ||
+ ((!newlines_okay) && (c == '\n')) ||
+ ((paren_seen && string[i - 1] == ')') &&
+ (c == ' ' || c == '.')) ||
+ (c == '.' &&
+ (
+#if 0
+/* This test causes a node name ending in a period, like `This.', not to
+ be found. The trailing . is stripped. This occurs in the jargon
+ file (`I see no X here.' is a node name). */
+ (!string[i + 1]) ||
+#endif
+ (whitespace_or_newline (string[i + 1])) ||
+ (string[i + 1] == ')'))))
+ break;
+ }
+ return i;
+}
+
+
+/* **************************************************************** */
+/* */
+/* Searching FILE_BUFFER's */
+/* */
+/* **************************************************************** */
+
+/* Return the absolute position of the first occurence of a node separator in
+ BINDING-buffer. The search starts at BINDING->start. Return -1 if no node
+ separator was found. */
+long
+find_node_separator (SEARCH_BINDING *binding)
+{
+ register long i;
+ char *body;
+
+ body = binding->buffer;
+
+ /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are
+ optional, but the DELETE and NEWLINE are not. This separator holds
+ true for all separated elements in an Info file, including the tags
+ table (if present) and the indirect tags table (if present). */
+ for (i = binding->start; i < binding->end - 1; i++)
+ if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
+ (body[i + 2] == '\n' ||
+ (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
+ ((body[i] == INFO_COOKIE) &&
+ (body[i + 1] == '\n' ||
+ (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
+ return i;
+ return -1;
+}
+
+/* Return the length of the node separator characters that BODY is
+ currently pointing at. */
+int
+skip_node_separator (char *body)
+{
+ register int i;
+
+ i = 0;
+
+ if (body[i] == INFO_FF)
+ i++;
+
+ if (body[i++] != INFO_COOKIE)
+ return 0;
+
+ if (body[i] == INFO_FF)
+ i++;
+
+ if (body[i++] != '\n')
+ return 0;
+
+ return i;
+}
+
+/* Return the number of characters from STRING to the start of
+ the next line. */
+int
+skip_line (char *string)
+{
+ register int i;
+
+ for (i = 0; string && string[i] && string[i] != '\n'; i++);
+
+ if (string[i] == '\n')
+ i++;
+
+ return i;
+}
+
+/* Return the absolute position of the beginning of a tags table in this
+ binding starting the search at binding->start. */
+long
+find_tags_table (SEARCH_BINDING *binding)
+{
+ SEARCH_BINDING tmp_search;
+ long position;
+
+ tmp_search.buffer = binding->buffer;
+ tmp_search.start = binding->start;
+ tmp_search.end = binding->end;
+ tmp_search.flags = S_FoldCase;
+
+ while ((position = find_node_separator (&tmp_search)) != -1 )
+ {
+ tmp_search.start = position;
+ tmp_search.start += skip_node_separator (tmp_search.buffer
+ + tmp_search.start);
+
+ if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
+ return position;
+ }
+ return -1;
+}
+
+/* Return the absolute position of the node named NODENAME in BINDING.
+ This is a brute force search, and we wish to avoid it when possible.
+ This function is called when a tag (indirect or otherwise) doesn't
+ really point to the right node. It returns the absolute position of
+ the separator preceding the node. */
+long
+find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
+{
+ long position;
+ int offset, namelen;
+ SEARCH_BINDING tmp_search;
+
+ namelen = strlen (nodename);
+
+ tmp_search.buffer = binding->buffer;
+ tmp_search.start = binding->start;
+ tmp_search.end = binding->end;
+ tmp_search.flags = 0;
+
+ while ((position = find_node_separator (&tmp_search)) != -1)
+ {
+ tmp_search.start = position;
+ tmp_search.start += skip_node_separator
+ (tmp_search.buffer + tmp_search.start);
+
+ offset = string_in_line
+ (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
+
+ if (offset == -1)
+ continue;
+
+ tmp_search.start += offset;
+ tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
+ offset = skip_node_characters
+ (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
+
+ /* Notice that this is an exact match. You cannot grovel through
+ the buffer with this function looking for random nodes. */
+ if ((offset == namelen) &&
+ (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
+ (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
+ return position;
+ }
+ return -1;
+}