summaryrefslogtreecommitdiff
path: root/locate/code.c
diff options
context:
space:
mode:
Diffstat (limited to 'locate/code.c')
-rw-r--r--locate/code.c291
1 files changed, 291 insertions, 0 deletions
diff --git a/locate/code.c b/locate/code.c
new file mode 100644
index 0000000..eae02ce
--- /dev/null
+++ b/locate/code.c
@@ -0,0 +1,291 @@
+/* code -- bigram- and front-encode filenames for locate
+ Copyright (C) 1994, 2005, 2007, 2008, 2010, 2011 Free Software
+ Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Compress a sorted list.
+ Works with `find' to encode a filename database to save space
+ and search time.
+
+ Usage:
+
+ bigram < file_list > bigrams
+ process-bigrams > most_common_bigrams
+ code most_common_bigrams < file_list > squeezed_list
+
+ Uses `front compression' (see ";login:", March 1983, p. 8).
+ The output begins with the 128 most common bigrams.
+ After that, the output format is, for each line,
+ an offset (from the previous line) differential count byte
+ followed by a (partially bigram-encoded) ASCII remainder.
+ The output lines have no terminating byte; the start of the next line
+ is indicated by its first byte having a value <= 30.
+
+ The encoding of the output bytes is:
+
+ 0-28 likeliest differential counts + offset (14) to make nonnegative
+ 30 escape code for out-of-range count to follow in next halfword
+ 128-255 bigram codes (the 128 most common, as determined by `updatedb')
+ 32-127 single character (printable) ASCII remainder
+
+ Written by James A. Woods <jwoods@adobe.com>.
+ Modified by David MacKenzie <djm@gnu.org>. */
+
+/* config.h should always be included first. */
+#include <config.h>
+
+/* system headers. */
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+/* gnulib headers. */
+#include "closeout.h"
+#include "error.h"
+#include "gettext.h"
+#include "progname.h"
+#include "xalloc.h"
+
+/* find headers. */
+#include "findutils-version.h"
+#include "locatedb.h"
+
+#if ENABLE_NLS
+# include <libintl.h>
+# define _(Text) gettext (Text)
+#else
+# define _(Text) Text
+#define textdomain(Domain)
+#define bindtextdomain(Package, Directory)
+#endif
+#ifdef gettext_noop
+# define N_(String) gettext_noop (String)
+#else
+/* See locate.c for explanation as to why not use (String) */
+# define N_(String) String
+#endif
+
+
+#ifndef ATTRIBUTE_NORETURN
+# define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
+#endif
+
+
+/* The 128 most common bigrams in the file list, padded with NULs
+ if there are fewer. */
+static char bigrams[257] = {0};
+
+/* Return the offset of PATTERN in STRING, or -1 if not found. */
+
+static int
+strindex (char *string, char *pattern)
+{
+ register char *s;
+
+ for (s = string; *s != '\0'; s++)
+ /* Fast first char check. */
+ if (*s == *pattern)
+ {
+ register char *p2 = pattern + 1, *s2 = s + 1;
+ while (*p2 != '\0' && *p2 == *s2)
+ p2++, s2++;
+ if (*p2 == '\0')
+ return s2 - strlen (pattern) - string;
+ }
+ return -1;
+}
+
+/* Return the length of the longest common prefix of strings S1 and S2. */
+
+static int
+prefix_length (char *s1, char *s2)
+{
+ register char *start;
+
+ for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
+ ;
+ return s1 - start;
+}
+
+extern char *version_string;
+
+static void
+usage (FILE *stream)
+{
+ fprintf (stream, _("\
+Usage: %s [--version | --help]\n\
+or %s most_common_bigrams < file-list > locate-database\n"),
+ program_name, program_name);
+ fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
+}
+
+
+static void inerr (const char *filename) ATTRIBUTE_NORETURN;
+static void outerr (void) ATTRIBUTE_NORETURN;
+
+static void
+inerr (const char *filename)
+{
+ error (EXIT_FAILURE, errno, "%s", filename);
+ /*NOTREACHED*/
+ abort ();
+}
+
+static void
+outerr (void)
+{
+ error (EXIT_FAILURE, errno, _("write error"));
+ /*NOTREACHED*/
+ abort ();
+}
+
+
+int
+main (int argc, char **argv)
+{
+ char *path; /* The current input entry. */
+ char *oldpath; /* The previous input entry. */
+ size_t pathsize, oldpathsize; /* Amounts allocated for them. */
+ int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
+ char bigram[3]; /* Bigram to search for in table. */
+ int code; /* Index of `bigram' in bigrams table. */
+ FILE *fp; /* Most common bigrams file. */
+ int line_len; /* Length of input line. */
+
+ set_program_name (argv[0]);
+ if (atexit (close_stdout))
+ {
+ error (EXIT_FAILURE, errno, _("The atexit library function failed"));
+ }
+
+ bigram[2] = '\0';
+
+ if (argc != 2)
+ {
+ usage (stderr);
+ return 2;
+ }
+
+ if (0 == strcmp (argv[1], "--help"))
+ {
+ usage (stdout);
+ return 0;
+ }
+ else if (0 == strcmp (argv[1], "--version"))
+ {
+ display_findutils_version ("code");
+ return 0;
+ }
+
+ fp = fopen (argv[1], "r");
+ if (fp == NULL)
+ {
+ fprintf (stderr, "%s: ", argv[0]);
+ perror (argv[1]);
+ return 1;
+ }
+
+ pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
+ path = xmalloc (pathsize);
+ oldpath = xmalloc (oldpathsize);
+
+ /* Set to empty string, to force the first prefix count to 0. */
+ oldpath[0] = '\0';
+ oldcount = 0;
+
+ /* Copy the list of most common bigrams to the output,
+ padding with NULs if there are <128 of them. */
+ if (NULL == fgets (bigrams, 257, fp))
+ inerr (argv[1]);
+
+ if (256 != fwrite (bigrams, 1, 256, stdout))
+ outerr ();
+
+ if (EOF == fclose (fp))
+ inerr (argv[1]);
+
+ while ((line_len = getline (&path, &pathsize, stdin)) > 0)
+ {
+ char *pp;
+
+ path[line_len - 1] = '\0'; /* Remove newline. */
+
+ /* Squelch unprintable chars in path so as not to botch decoding. */
+ for (pp = path; *pp != '\0'; pp++)
+ {
+ if (!(*pp >= 040 && *pp < 0177))
+ *pp = '?';
+ }
+
+ count = prefix_length (oldpath, path);
+ diffcount = count - oldcount;
+ oldcount = count;
+ /* If the difference is small, it fits in one byte;
+ otherwise, two bytes plus a marker noting that fact. */
+ if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
+ {
+ if (EOF ==- putc (LOCATEDB_OLD_ESCAPE, stdout))
+ outerr ();
+
+ if (!putword (stdout,
+ diffcount+LOCATEDB_OLD_OFFSET,
+ GetwordEndianStateNative))
+ outerr ();
+ }
+ else
+ {
+ if (EOF == putc (diffcount + LOCATEDB_OLD_OFFSET, stdout))
+ outerr ();
+ }
+
+ /* Look for bigrams in the remainder of the path. */
+ for (pp = path + count; *pp != '\0'; pp += 2)
+ {
+ if (pp[1] == '\0')
+ {
+ /* No bigram is possible; only one char is left. */
+ putchar (*pp);
+ break;
+ }
+ bigram[0] = *pp;
+ bigram[1] = pp[1];
+ /* Linear search for specific bigram in string table. */
+ code = strindex (bigrams, bigram);
+ if (code % 2 == 0)
+ putchar ((code / 2) | 0200); /* It's a common bigram. */
+ else
+ fputs (bigram, stdout); /* Write the text as printable ASCII. */
+ }
+
+ {
+ /* Swap path and oldpath and their sizes. */
+ char *tmppath = oldpath;
+ size_t tmppathsize = oldpathsize;
+ oldpath = path;
+ oldpathsize = pathsize;
+ path = tmppath;
+ pathsize = tmppathsize;
+ }
+ }
+
+ free (path);
+ free (oldpath);
+
+ return 0;
+}