summaryrefslogtreecommitdiff
path: root/charsetConv.c
diff options
context:
space:
mode:
Diffstat (limited to 'charsetConv.c')
-rw-r--r--charsetConv.c428
1 files changed, 428 insertions, 0 deletions
diff --git a/charsetConv.c b/charsetConv.c
new file mode 100644
index 0000000..0dc1b6d
--- /dev/null
+++ b/charsetConv.c
@@ -0,0 +1,428 @@
+/* Copyright 2008,2009 Alain Knaff.
+ * This file is part of mtools.
+ *
+ * Mtools is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Mtools is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Mtools. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Various character set conversions used by mtools
+ */
+#include "sysincludes.h"
+#include "msdos.h"
+#include "mtools.h"
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include "file_name.h"
+
+
+#ifdef HAVE_ICONV_H
+#include <iconv.h>
+
+struct doscp_t {
+ iconv_t from;
+ iconv_t to;
+};
+
+static const char *wcharCp=NULL;
+
+static const char* wcharTries[] = {
+ "WCHAR_T",
+ "UTF-32BE", "UTF-32LE",
+ "UTF-16BE", "UTF-16LE",
+ "UTF-32", "UTF-16",
+ "UCS-4BE", "UCS-4LE",
+ "UCS-2BE", "UCS-2LE",
+ "UCS-4", "UCS-2"
+};
+
+static const char *asciiTries[] = {
+ "ASCII", "ASCII-GR", "ISO8859-1"
+};
+
+static const wchar_t *testString = L"ab";
+
+static int try(const char *testCp) {
+ size_t res;
+ char *inbuf = (char *)testString;
+ size_t inbufLen = 2*sizeof(wchar_t);
+ char outbuf[3];
+ char *outbufP = outbuf;
+ size_t outbufLen = 2*sizeof(char);
+ iconv_t test;
+ int i;
+
+ for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) {
+ test = iconv_open(asciiTries[i], testCp);
+ if(test != (iconv_t) -1)
+ break;
+ }
+ if(test == (iconv_t) -1)
+ goto fail0;
+ res = iconv(test,
+ &inbuf, &inbufLen,
+ &outbufP, &outbufLen);
+ if(res != 0 || outbufLen != 0 || inbufLen != 0)
+ goto fail;
+ if(memcmp(outbuf, "ab", 2))
+ goto fail;
+ /* fprintf(stderr, "%s ok\n", testCp); */
+ return 1;
+ fail:
+ iconv_close(test);
+ fail0:
+ /*fprintf(stderr, "%s fail\n", testCp);*/
+ return 0;
+}
+
+static const char *getWcharCp(void) {
+ unsigned int i;
+ if(wcharCp != NULL)
+ return wcharCp;
+ for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
+ if(try(wcharTries[i]))
+ return (wcharCp=wcharTries[i]);
+ }
+ fprintf(stderr, "No codepage found for wchar_t\n");
+ return NULL;
+}
+
+
+doscp_t *cp_open(int codepage)
+{
+ char dosCp[17];
+ doscp_t *ret;
+ iconv_t *from;
+ iconv_t *to;
+
+ if(codepage == 0)
+ codepage = mtools_default_codepage;
+ if(codepage < 0 || codepage > 9999) {
+ fprintf(stderr, "Bad codepage %d\n", codepage);
+ return NULL;
+ }
+
+ if(getWcharCp() == NULL)
+ return NULL;
+
+ sprintf(dosCp, "CP%d", codepage);
+ from = iconv_open(wcharCp, dosCp);
+ if(from == (iconv_t)-1) {
+ fprintf(stderr, "Error converting to codepage %d %s\n",
+ codepage, strerror(errno));
+ return NULL;
+ }
+
+ sprintf(dosCp, "CP%d//TRANSLIT", codepage);
+ to = iconv_open(dosCp, wcharCp);
+ if(to == (iconv_t)-1) {
+ /* Transliteration not supported? */
+ sprintf(dosCp, "CP%d", codepage);
+ to = iconv_open(dosCp, wcharCp);
+ }
+ if(to == (iconv_t)-1) {
+ iconv_close(from);
+ fprintf(stderr, "Error converting to codepage %d %s\n",
+ codepage, strerror(errno));
+ return NULL;
+ }
+
+ ret = New(doscp_t);
+ if(ret == NULL)
+ return ret;
+ ret->from = from;
+ ret->to = to;
+ return ret;
+}
+
+void cp_close(doscp_t *cp)
+{
+ iconv_close(cp->to);
+ iconv_close(cp->from);
+ free(cp);
+}
+
+int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
+{
+ int r;
+ size_t in_len=len;
+ size_t out_len=len*sizeof(wchar_t);
+ wchar_t *dptr=wchar;
+ r=iconv(cp->from, &dos, &in_len, (char **)&dptr, &out_len);
+ if(r < 0)
+ return r;
+ *dptr = L'\0';
+ return dptr-wchar;
+}
+
+/**
+ * Converts len wide character to destination. Caller's responsibility to
+ * ensure that dest is large enough.
+ * mangled will be set if there has been an untranslatable character.
+ */
+static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
+ size_t len, int *mangled)
+{
+ int r;
+ unsigned int i;
+ size_t in_len=len*sizeof(wchar_t);
+ size_t out_len=len*4;
+ char *dptr = dest;
+
+ while(in_len > 0) {
+ r=iconv(conv, (char**)&wchar, &in_len, &dptr, &out_len);
+ if(r >= 0 || errno != EILSEQ) {
+ /* everything transformed, or error that is _not_ a bad
+ * character */
+ break;
+ }
+ *mangled |= 1;
+
+ if(dptr)
+ *dptr++ = '_';
+ in_len--;
+
+ wchar++;
+ out_len--;
+ }
+
+ len = dptr-dest; /* how many dest characters have there been
+ generated */
+
+ /* eliminate question marks which might have been formed by
+ untransliterable characters */
+ for(i=0; i<len; i++) {
+ if(dest[i] == '?') {
+ dest[i] = '_';
+ *mangled |= 1;
+ }
+ }
+ return len;
+}
+
+void wchar_to_dos(doscp_t *cp,
+ wchar_t *wchar, char *dos, size_t len, int *mangled)
+{
+ safe_iconv(cp->to, wchar, dos, len, mangled);
+}
+
+#else
+
+#include "codepage.h"
+
+struct doscp_t {
+ unsigned char *from_dos;
+ unsigned char to_dos[0x80];
+};
+
+doscp_t *cp_open(int codepage)
+{
+ doscp_t *ret;
+ int i;
+ Codepage_t *cp;
+
+ if(codepage == 0)
+ codepage = 850;
+
+ ret = New(doscp_t);
+ if(ret == NULL)
+ return ret;
+
+ for(cp=codepages; cp->nr ; cp++)
+ if(cp->nr == codepage) {
+ ret->from_dos = cp->tounix;
+ break;
+ }
+
+ if(ret->from_dos == NULL) {
+ fprintf(stderr, "Bad codepage %d\n", codepage);
+ free(ret);
+ return NULL;
+ }
+
+ for(i=0; i<0x80; i++) {
+ char native = ret->from_dos[i];
+ if(! (native & 0x80))
+ continue;
+ ret->to_dos[native & 0x7f] = 0x80 | i;
+ }
+ return ret;
+}
+
+void cp_close(doscp_t *cp)
+{
+ free(cp);
+}
+
+int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
+{
+ int i;
+
+ for(i=0; i<len && dos[i]; i++) {
+ char c = dos[i];
+ if(c >= ' ' && c <= '~')
+ wchar[i] = c;
+ else {
+ wchar[i] = cp->from_dos[c & 0x7f];
+ }
+ }
+ wchar[i] = '\0';
+ return i;
+}
+
+
+void wchar_to_dos(doscp_t *cp,
+ wchar_t *wchar, char *dos, size_t len, int *mangled)
+{
+ int i;
+ for(i=0; i<len && wchar[i]; i++) {
+ char c = wchar[i];
+ if(c >= ' ' && c <= '~')
+ dos[i] = c;
+ else {
+ dos[i] = cp->to_dos[c & 0x7f];
+ if(dos[i] == '\0') {
+ dos[i]='_';
+ *mangled=1;
+ }
+ }
+ }
+}
+
+#endif
+
+
+#ifndef HAVE_WCHAR_H
+
+typedef int mbstate_t;
+
+static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
+{
+ *s = wc;
+ return 1;
+}
+
+static inline size_t mbrtowc(wchar_t *pwc, const char *s,
+ size_t n, mbstate_t *ps)
+{
+ *pwc = *s;
+ return 1;
+}
+
+#endif
+
+#ifdef HAVE_ICONV_H
+
+#include <langinfo.h>
+
+static iconv_t to_native = NULL;
+
+static void initialize_to_native(void)
+{
+ char *li, *cp;
+ int len;
+ if(to_native != NULL)
+ return;
+ li = nl_langinfo(CODESET);
+ len = strlen(li) + 11;
+ if(getWcharCp() == NULL)
+ exit(1);
+ cp = safe_malloc(len);
+ strcpy(cp, li);
+ strcat(cp, "//TRANSLIT");
+ to_native = iconv_open(cp, wcharCp);
+ if(to_native == (iconv_t) -1)
+ to_native = iconv_open(li, wcharCp);
+ if(to_native == (iconv_t) -1)
+ fprintf(stderr, "Could not allocate iconv for %s\n", cp);
+ free(cp);
+ if(to_native == (iconv_t) -1)
+ exit(1);
+}
+
+
+
+#endif
+
+
+/**
+ * Convert wchar string to native, converting at most len wchar characters
+ * Returns number of generated native characters
+ */
+int wchar_to_native(const wchar_t *wchar, char *native, size_t len)
+{
+#ifdef HAVE_ICONV_H
+ int mangled;
+ int r;
+ initialize_to_native();
+ len = wcsnlen(wchar,len);
+ r=safe_iconv(to_native, wchar, native, len, &mangled);
+ native[r]='\0';
+ return r;
+#else
+ int i;
+ char *dptr = native;
+ mbstate_t ps;
+ memset(&ps, 0, sizeof(ps));
+ for(i=0; i<len && wchar[i] != 0; i++) {
+ int r = wcrtomb(dptr, wchar[i], &ps);
+ if(r < 0 && errno == EILSEQ) {
+ r=1;
+ *dptr='_';
+ }
+ if(r < 0)
+ return r;
+ dptr+=r;
+ }
+ *dptr='\0';
+ return dptr-native;
+#endif
+}
+
+/**
+ * Convert native string to wchar string, generating at most len wchar
+ * characters. If end is supplied, stop conversion when source pointer
+ * exceeds end. Returns number of generated wchars
+ */
+int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
+ const char *end, int *mangled)
+{
+ mbstate_t ps;
+ unsigned int i;
+ memset(&ps, 0, sizeof(ps));
+
+ for(i=0; i<len && (native < end || !end); i++) {
+ int r = mbrtowc(wchar+i, native, len, &ps);
+ if(r < 0) {
+ /* Unconvertible character. Just pretend it's Latin1
+ encoded (if valid Latin1 character) or substitue
+ with an underscore if not
+ */
+ char c = *native;
+ if(c >= '\xa0' && c < '\xff')
+ wchar[i] = c & 0xff;
+ else
+ wchar[i] = '_';
+ memset(&ps, 0, sizeof(ps));
+ r=1;
+ }
+ if(r == 0)
+ break;
+ native += r;
+ }
+ if(mangled && ((end && native < end) || (!end && *native && i == len)))
+ *mangled |= 3;
+ wchar[i]='\0';
+ return i;
+}
+