Update tesseract traineddata loader with new path search.

First, we look in TESSDATA_PREFIX (if defined). If not found there, we look in ROMFS (in tessdata). If not found there, we look at the configured "tessdata" path (which defaults to ${datadir}/tessdata). (${datadir} defaults to ${prefix}/share on unix, and ${gsrootdir} on windows.) If not found there, we look in the current directory. Update doc/Devices.html (and fix some indexing).
author: Robin Watts <Robin.Watts@artifex.com> 2020-10-12 13:19:09 +0100
committer: Robin Watts <Robin.Watts@artifex.com> 2020-10-13 11:30:41 +0100
commit: 5af4f31bcda18c6fb7d14501c0a22697a7f49ac0 (patch)
tree: 5592359aa1cc75d3c277eba3d706ce2b636a831a /base/tessocr.cpp
parent: 848077c4f7b8b9131263b483ba90b05e0ee4e9d2 (diff)
download: ghostpdl-5af4f31bcda18c6fb7d14501c0a22697a7f49ac0.tar.gz
1 files changed, 57 insertions, 7 deletions
diff --git a/base/tessocr.cpp b/base/tessocr.cpp
index 225b75764..8ce19a14e 100644
--- a/base/tessocr.cpp
+++ b/base/tessocr.cpp
@@ -147,22 +147,71 @@ fail:
 }
 
 static bool
+load_file_from_path(const char *path, const char *file, GenericVector<char> *out)
+{
+    /* Try 'file' in each directory of the list 'path' (entries split on
+     * gp_file_name_list_separator); true as soon as load_file succeeds. */
+    const char *sep = gp_file_name_directory_separator();
+    size_t seplen = strlen(sep);
+    size_t bufsize = strlen(path) + seplen + strlen(file) + 1;
+    const char *s = path, *e;
+    bool ret = false;
+    char *buf = (char *)gs_alloc_bytes(leptonica_mem, bufsize, "load_file_from_path");
+    if (buf == NULL)
+        return false;
+    do {
+        /* Scan from the current entry; restarting at 'path' would put e
+         * before s on later entries and make the copy length e-s wrap. */
+        e = s;
+        while (*e && *e != gp_file_name_list_separator)
+            e++;
+        memcpy(buf, s, e-s);
+        memcpy(&buf[e-s], sep, seplen);
+        strcpy(&buf[e-s+seplen], file);
+        ret = load_file(buf, out);
+        if (ret)
+            break;
+        s = e;
+        while (*s == gp_file_name_list_separator)
+            s++;
+    } while (*s != 0);
+    gs_free_object(leptonica_mem, buf, "load_file_from_path");
+    return ret;
+}
+
+#ifndef TESSDATA
+#define TESSDATA tessdata /* fallback when the build does not define TESSDATA */
+#endif
+#define STRINGIFY2(S) #S /* stringify S exactly as written */
+#define STRINGIFY(S) STRINGIFY2(S) /* extra level so S is macro-expanded first */
+static char *tessdata_prefix = STRINGIFY(TESSDATA); /* path list searched via load_file_from_path */
+
+static bool
 tess_file_reader(const char *fname, GenericVector<char> *out)
 {
     const char *file = fname;
     const char *s;
     char text[PATH_MAX];
     int code = 0;
+    bool found;
     stream *ps;
     gx_io_device *iodev;
 
+    /* fname, as supplied to us by Tesseract has TESSDATA_PREFIX prepended
+     * to it. Check that first. */
+    found = load_file(fname, out);
+    if (found)
+            return found;
+
+    /* Find file, fname with any prefix removed, and use that in
+     * the rest of the searches. */
     for (s = fname; *s; s++)
         if (*s == '\\' || *s == '/')
             file = s+1;
 
-    /* FIXME: Try loading 'file' from gs specific paths */
+    /* Next look in romfs in the tessdata directory. */
     iodev = gs_findiodevice(leptonica_mem, (const byte *)"%rom", 4);
-    gs_snprintf(text, sizeof(text), "Resource/Tesseract/%s", file);
+    gs_snprintf(text, sizeof(text), "tessdata/%s", file);
     if (iodev) {
         long size;
         long i;
@@ -195,12 +244,13 @@ tess_file_reader(const char *fname, GenericVector<char> *out)
         }
     }
 
-    /* Fall back to gp_file access, first under Resource/Tesseract */
-    if (load_file(text, out))
-        return true;
+    /* Fall back to gp_file access under our configured tessdata path. */
+    found = load_file_from_path(tessdata_prefix, file, out);
+    if (found)
+        return found;
 
-    /* Then under TESSDATA */
-    return load_file(fname, out);
+    /* If all else fails, look in the current directory. */
+    return load_file(file, out);
 }
 
 int
author	Robin Watts <Robin.Watts@artifex.com>	2020-10-12 13:19:09 +0100
committer	Robin Watts <Robin.Watts@artifex.com>	2020-10-13 11:30:41 +0100
commit	5af4f31bcda18c6fb7d14501c0a22697a7f49ac0 (patch)
tree	5592359aa1cc75d3c277eba3d706ce2b636a831a /base/tessocr.cpp
parent	848077c4f7b8b9131263b483ba90b05e0ee4e9d2 (diff)
download	ghostpdl-5af4f31bcda18c6fb7d14501c0a22697a7f49ac0.tar.gz