1 files changed, 291 insertions, 0 deletions
diff --git a/locate/code.c b/locate/code.c
new file mode 100644
index 0000000..eae02ce
--- /dev/null
+++ b/locate/code.c
@@ -0,0 +1,291 @@
+/* code -- bigram- and front-encode filenames for locate
+   Copyright (C) 1994, 2005, 2007, 2008, 2010, 2011 Free Software
+   Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Compress a sorted list.
+   Works with `find' to encode a filename database to save space
+   and search time.
+
+   Usage:
+
+   bigram < file_list > bigrams
+   process-bigrams > most_common_bigrams
+   code most_common_bigrams < file_list > squeezed_list
+
+   Uses `front compression' (see ";login:", March 1983, p. 8).
+   The output begins with the 128 most common bigrams.
+   After that, the output format is, for each line,
+   an offset (from the previous line) differential count byte
+   followed by a (partially bigram-encoded) ASCII remainder.
+   The output lines have no terminating byte; the start of the next line
+   is indicated by its first byte having a value <= 30.
+
+   The encoding of the output bytes is:
+
+   0-28		likeliest differential counts + offset (14) to make nonnegative
+   30		escape code for out-of-range count to follow in next halfword
+   128-255      bigram codes (the 128 most common, as determined by `updatedb')
+   32-127       single character (printable) ASCII remainder
+
+   Written by James A. Woods <jwoods@adobe.com>.
+   Modified by David MacKenzie <djm@gnu.org>.  */
+
+/* config.h should always be included first. */
+#include <config.h>
+
+/* system headers. */
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+/* gnulib headers. */
+#include "closeout.h"
+#include "error.h"
+#include "gettext.h"
+#include "progname.h"
+#include "xalloc.h"
+
+/* find headers. */
+#include "findutils-version.h"
+#include "locatedb.h"
+
+#if ENABLE_NLS
+# include <libintl.h>
+# define _(Text) gettext (Text)
+#else
+# define _(Text) Text
+#define textdomain(Domain)
+#define bindtextdomain(Package, Directory)
+#endif
+#ifdef gettext_noop
+# define N_(String) gettext_noop (String)
+#else
+/* See locate.c for explanation as to why not use (String) */
+# define N_(String) String
+#endif
+
+
+#ifndef ATTRIBUTE_NORETURN
+# define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__))
+#endif
+
+
+/* The 128 most common bigrams in the file list, padded with NULs
+   if there are fewer.  */
+static char bigrams[257] = {0};
+
+/* Return the offset of PATTERN in STRING, or -1 if not found. */
+
+static int
+strindex (char *string, char *pattern)
+{
+  register char *s;
+
+  for (s = string; *s != '\0'; s++)
+    /* Fast first char check. */
+    if (*s == *pattern)
+      {
+	register char *p2 = pattern + 1, *s2 = s + 1;
+	while (*p2 != '\0' && *p2 == *s2)
+	  p2++, s2++;
+	if (*p2 == '\0')
+	  return s2 - strlen (pattern) - string;
+      }
+  return -1;
+}
+
+/* Return the length of the longest common prefix of strings S1 and S2. */
+
+static int
+prefix_length (char *s1, char *s2)
+{
+  register char *start;
+
+  for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
+    ;
+  return s1 - start;
+}
+
+extern char *version_string;
+
+static void
+usage (FILE *stream)
+{
+  fprintf (stream, _("\
+Usage: %s [--version | --help]\n\
+or     %s most_common_bigrams < file-list > locate-database\n"),
+	   program_name, program_name);
+  fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
+}
+
+
+static void inerr (const char *filename) ATTRIBUTE_NORETURN;
+static void outerr (void)                 ATTRIBUTE_NORETURN;
+
+static void
+inerr (const char *filename)
+{
+  error (EXIT_FAILURE, errno, "%s", filename);
+  /*NOTREACHED*/
+  abort ();
+}
+
+static void
+outerr (void)
+{
+  error (EXIT_FAILURE, errno, _("write error"));
+  /*NOTREACHED*/
+  abort ();
+}
+
+
+int
+main (int argc, char **argv)
+{
+  char *path;			/* The current input entry.  */
+  char *oldpath;		/* The previous input entry.  */
+  size_t pathsize, oldpathsize;	/* Amounts allocated for them.  */
+  int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
+  char bigram[3];		/* Bigram to search for in table.  */
+  int code;			/* Index of `bigram' in bigrams table.  */
+  FILE *fp;			/* Most common bigrams file.  */
+  int line_len;			/* Length of input line.  */
+
+  set_program_name (argv[0]);
+  if (atexit (close_stdout))
+    {
+      error (EXIT_FAILURE, errno, _("The atexit library function failed"));
+    }
+
+  bigram[2] = '\0';
+
+  if (argc != 2)
+    {
+      usage (stderr);
+      return 2;
+    }
+
+  if (0 == strcmp (argv[1], "--help"))
+    {
+      usage (stdout);
+      return 0;
+    }
+  else if (0 == strcmp (argv[1], "--version"))
+    {
+      display_findutils_version ("code");
+      return 0;
+    }
+
+  fp = fopen (argv[1], "r");
+  if (fp == NULL)
+    {
+      fprintf (stderr, "%s: ", argv[0]);
+      perror (argv[1]);
+      return 1;
+    }
+
+  pathsize = oldpathsize = 1026; /* Increased as necessary by getline.  */
+  path = xmalloc (pathsize);
+  oldpath = xmalloc (oldpathsize);
+
+  /* Set to empty string, to force the first prefix count to 0.  */
+  oldpath[0] = '\0';
+  oldcount = 0;
+
+  /* Copy the list of most common bigrams to the output,
+     padding with NULs if there are <128 of them.  */
+  if (NULL == fgets (bigrams, 257, fp))
+    inerr (argv[1]);
+
+  if (256 != fwrite (bigrams, 1, 256, stdout))
+     outerr ();
+
+  if (EOF == fclose (fp))
+     inerr (argv[1]);
+
+  while ((line_len = getline (&path, &pathsize, stdin)) > 0)
+    {
+      char *pp;
+
+      path[line_len - 1] = '\0'; /* Remove newline. */
+
+      /* Squelch unprintable chars in path so as not to botch decoding.  */
+      for (pp = path; *pp != '\0'; pp++)
+	{
+	  if (!(*pp >= 040 && *pp < 0177))
+	    *pp = '?';
+	}
+
+      count = prefix_length (oldpath, path);
+      diffcount = count - oldcount;
+      oldcount = count;
+      /* If the difference is small, it fits in one byte;
+	 otherwise, two bytes plus a marker noting that fact.  */
+      if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
+	{
+	  if (EOF ==- putc (LOCATEDB_OLD_ESCAPE, stdout))
+	    outerr ();
+
+	  if (!putword (stdout,
+			diffcount+LOCATEDB_OLD_OFFSET,
+			GetwordEndianStateNative))
+	    outerr ();
+	}
+      else
+	{
+	  if (EOF == putc (diffcount + LOCATEDB_OLD_OFFSET, stdout))
+	    outerr ();
+	}
+
+      /* Look for bigrams in the remainder of the path.  */
+      for (pp = path + count; *pp != '\0'; pp += 2)
+	{
+	  if (pp[1] == '\0')
+	    {
+	      /* No bigram is possible; only one char is left.  */
+	      putchar (*pp);
+	      break;
+	    }
+	  bigram[0] = *pp;
+	  bigram[1] = pp[1];
+	  /* Linear search for specific bigram in string table. */
+	  code = strindex (bigrams, bigram);
+	  if (code % 2 == 0)
+	    putchar ((code / 2) | 0200); /* It's a common bigram.  */
+	  else
+	    fputs (bigram, stdout); /* Write the text as printable ASCII.  */
+	}
+
+      {
+	/* Swap path and oldpath and their sizes.  */
+	char *tmppath = oldpath;
+	size_t tmppathsize = oldpathsize;
+	oldpath = path;
+	oldpathsize = pathsize;
+	path = tmppath;
+	pathsize = tmppathsize;
+      }
+    }
+
+  free (path);
+  free (oldpath);
+
+  return 0;
+}