diff options
author | Robin Watts <Robin.Watts@artifex.com> | 2020-10-12 13:19:09 +0100 |
---|---|---|
committer | Robin Watts <Robin.Watts@artifex.com> | 2020-10-13 11:30:41 +0100 |
commit | 5af4f31bcda18c6fb7d14501c0a22697a7f49ac0 (patch) | |
tree | 5592359aa1cc75d3c277eba3d706ce2b636a831a /base/tessocr.cpp | |
parent | 848077c4f7b8b9131263b483ba90b05e0ee4e9d2 (diff) | |
download | ghostpdl-5af4f31bcda18c6fb7d14501c0a22697a7f49ac0.tar.gz |
Update tesseract traineddata loader with new path search.
First, we look in TESSDATA_PREFIX (if defined).
If not found there, we look in ROMFS (in tessdata).
If not found there, we look at the configured "tessdata" path
(which defaults to ${datadir}/tessdata). (${datadir} defaults to
${prefix}/share on unix, and ${gsrootdir} on windows.)
If not found there, we look in the current directory.
Update doc/Devices.html (and fix some indexing).
Diffstat (limited to 'base/tessocr.cpp')
-rw-r--r-- | base/tessocr.cpp | 64 |
1 files changed, 57 insertions, 7 deletions
diff --git a/base/tessocr.cpp b/base/tessocr.cpp
index 225b75764..8ce19a14e 100644
--- a/base/tessocr.cpp
+++ b/base/tessocr.cpp
@@ -147,22 +147,71 @@ fail:
 }
 
 static bool
+load_file_from_path(const char *path, const char *file, GenericVector<char> *out)
+{
+    const char *sep = gp_file_name_directory_separator();
+    size_t seplen = strlen(sep);
+    size_t bufsize = strlen(path) + seplen + strlen(file) + 1; /* no element is longer than the whole list */
+    const char *s, *e;
+    bool ret = 0;
+    char *buf = (char *)gs_alloc_bytes(leptonica_mem, bufsize, "load_file_from_path");
+    if (buf == NULL)
+        return 0;
+    /* Try "<element><sep><file>" for each element of the list in 'path', stopping at the first that loads. */
+    s = path;
+    do {
+        e = s; /* scan from the start of the current element, not the start of the list */
+        while (*e && *e != gp_file_name_list_separator)
+            e++;
+        memcpy(buf, s, e-s);
+        memcpy(&buf[e-s], sep, seplen);
+        strcpy(&buf[e-s+seplen], file);
+        ret = load_file(buf, out);
+        if (ret)
+            break;
+        s = e;
+        while (*s == gp_file_name_list_separator)
+            s++;
+    } while (*s != 0);
+
+    gs_free_object(leptonica_mem, buf, "load_file_from_path");
+
+    return ret;
+}
+
+#ifndef TESSDATA
+#define TESSDATA tessdata
+#endif
+#define STRINGIFY2(S) #S
+#define STRINGIFY(S) STRINGIFY2(S)
+static const char *tessdata_prefix = STRINGIFY(TESSDATA);
+
+static bool
 tess_file_reader(const char *fname, GenericVector<char> *out)
 {
     const char *file = fname;
     const char *s;
     char text[PATH_MAX];
     int code = 0;
+    bool found;
     stream *ps;
     gx_io_device *iodev;
 
+    /* fname, as supplied to us by Tesseract has TESSDATA_PREFIX prepended
+     * to it. Check that first. */
+    found = load_file(fname, out);
+    if (found)
+        return found;
+
+    /* Find file, fname with any prefix removed, and use that in
+     * the rest of the searches. */
     for (s = fname; *s; s++)
         if (*s == '\\' || *s == '/')
             file = s+1;
 
-    /* FIXME: Try loading 'file' from gs specific paths */
+    /* Next look in romfs in the tessdata directory. */
     iodev = gs_findiodevice(leptonica_mem, (const byte *)"%rom", 4);
-    gs_snprintf(text, sizeof(text), "Resource/Tesseract/%s", file);
+    gs_snprintf(text, sizeof(text), "tessdata/%s", file);
     if (iodev) {
         long size;
         long i;
@@ -195,12 +244,13 @@ tess_file_reader(const char *fname, GenericVector<char> *out)
         }
     }
 
-    /* Fall back to gp_file access, first under Resource/Tesseract */
-    if (load_file(text, out))
-        return true;
+    /* Fall back to gp_file access under our configured tessdata path. */
+    found = load_file_from_path(tessdata_prefix, file, out);
+    if (found)
+        return found;
 
-    /* Then under TESSDATA */
-    return load_file(fname, out);
+    /* If all else fails, look in the current directory. */
+    return load_file(file, out);
 }
 
 int