1 files changed, 428 insertions, 0 deletions
diff --git a/charsetConv.c b/charsetConv.c
new file mode 100644
index 0000000..0dc1b6d
--- /dev/null
+++ b/charsetConv.c
@@ -0,0 +1,428 @@
+/*  Copyright 2008,2009 Alain Knaff.
+ *  This file is part of mtools.
+ *                              
+ *  Mtools is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or   
+ *  (at your option) any later version.                                 
+ *                                                                      
+ *  Mtools is distributed in the hope that it will be useful,           
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of      
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Mtools.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Various character set conversions used by mtools
+ */
+#include "sysincludes.h"
+#include "msdos.h"
+#include "mtools.h"
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include "file_name.h"
+
+
+#ifdef HAVE_ICONV_H
+#include <iconv.h>
+
+/*
+ * Conversion state for one DOS codepage (iconv build): the pair of
+ * iconv descriptors that cp_open() opens and cp_close() closes.
+ */
+struct doscp_t {
+	iconv_t from;	/* DOS codepage -> wchar_t, used by dos_to_wchar() */
+	iconv_t to;	/* wchar_t -> DOS codepage, used by wchar_to_dos() */
+};
+
+/* iconv encoding name matching wchar_t on this system; NULL until
+ * getWcharCp() has found one, then cached for the rest of the run */
+static const char *wcharCp=NULL;
+
+/* Candidate iconv names for wchar_t, probed in this order.  The table
+ * itself is never modified, hence the const pointers. */
+static const char *const wcharTries[] = {
+	"WCHAR_T",
+	"UTF-32BE", "UTF-32LE",
+	"UTF-16BE", "UTF-16LE",
+	"UTF-32", "UTF-16",
+	"UCS-4BE", "UCS-4LE",
+	"UCS-2BE", "UCS-2LE",
+	"UCS-4", "UCS-2"
+};
+
+/* Candidate iconv names for the single-byte target of the probe; the
+ * first one iconv_open() accepts is used */
+static const char *const asciiTries[] = {
+	"ASCII", "ASCII-GR", "ISO8859-1"
+};
+
+/* Probe input: a usable wchar_t encoding must turn this into the two
+ * bytes "ab" */
+static const wchar_t *const testString = L"ab";
+
+/*
+ * Probes whether the iconv encoding name testCp describes this
+ * platform's wchar_t.  The wide string L"ab" is converted from testCp
+ * to the first target in asciiTries that iconv_open() accepts; the
+ * probe succeeds when exactly the two bytes "ab" come out and no input
+ * is left over.
+ *
+ * Returns 1 if testCp is usable, 0 otherwise.
+ */
+static int try(const char *testCp) {
+	size_t res;
+	char *inbuf = (char *)testString;
+	size_t inbufLen = 2*sizeof(wchar_t);
+	char outbuf[3];
+	char *outbufP = outbuf;
+	size_t outbufLen = 2*sizeof(char);
+	iconv_t test = (iconv_t) -1;
+	size_t i;
+	int ok;
+
+	for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) {
+		test = iconv_open(asciiTries[i], testCp);
+		if(test != (iconv_t) -1)
+			break;
+	}
+	if(test == (iconv_t) -1)
+		return 0;
+
+	res = iconv(test,
+		    &inbuf, &inbufLen,
+		    &outbufP, &outbufLen);
+	/* usable only if nothing was converted irreversibly, input and
+	 * output were both used up completely, and the bytes match */
+	ok = res == 0 && outbufLen == 0 && inbufLen == 0 &&
+		memcmp(outbuf, "ab", 2) == 0;
+
+	/* the descriptor only serves the probe: close it on success too,
+	 * otherwise every successful probe leaks one */
+	iconv_close(test);
+	return ok;
+}
+
+/*
+ * Returns the iconv encoding name that matches wchar_t on this system.
+ * The candidates in wcharTries are handed to try() one after the other;
+ * the first accepted name is stored in wcharCp and returned directly on
+ * later calls.  If none is accepted, a message goes to stderr and NULL
+ * is returned.
+ */
+static const char *getWcharCp(void) {
+	const size_t nrTries = sizeof(wcharTries) / sizeof(wcharTries[0]);
+	unsigned int idx = 0;
+
+	if(wcharCp)
+		return wcharCp;
+
+	while(idx < nrTries) {
+		const char *candidate = wcharTries[idx++];
+
+		if(try(candidate)) {
+			wcharCp = candidate;
+			return wcharCp;
+		}
+	}
+
+	fprintf(stderr, "No codepage found for wchar_t\n");
+	return NULL;
+}
+
+
+/*
+ * Opens the two iconv converters used for a DOS codepage.
+ *
+ * codepage: numeric DOS codepage; 0 selects mtools_default_codepage.
+ *           Values outside 0..9999 are rejected.
+ *
+ * Returns a newly allocated doscp_t that the caller releases with
+ * cp_close(), or NULL on failure.  A bad codepage number and an
+ * iconv_open() failure are reported on stderr; NULL is also returned
+ * when getWcharCp() finds no wchar_t encoding or the allocation fails.
+ * No iconv descriptor stays open on any failure path.
+ */
+doscp_t *cp_open(int codepage)
+{
+	/* "CP" + at most 4 digits + "//TRANSLIT" + NUL = 17 bytes */
+	char dosCp[17];
+	doscp_t *ret;
+	/* iconv_open() returns an iconv_t value, not a pointer to one */
+	iconv_t from;
+	iconv_t to;
+
+	if(codepage == 0)
+		codepage = mtools_default_codepage;
+	if(codepage < 0 || codepage > 9999) {
+		fprintf(stderr, "Bad codepage %d\n", codepage);
+		return NULL;
+	}
+
+	if(getWcharCp() == NULL)
+		return NULL;
+
+	snprintf(dosCp, sizeof(dosCp), "CP%d", codepage);
+	from = iconv_open(wcharCp, dosCp);
+	if(from == (iconv_t)-1) {
+		fprintf(stderr, "Error converting to codepage %d %s\n",
+			codepage, strerror(errno));
+		return NULL;
+	}
+
+	/* ask for transliteration first, fall back to the plain codepage */
+	snprintf(dosCp, sizeof(dosCp), "CP%d//TRANSLIT", codepage);
+	to   =  iconv_open(dosCp, wcharCp);
+	if(to == (iconv_t)-1) {
+		/* Transliteration not supported? */
+		snprintf(dosCp, sizeof(dosCp), "CP%d", codepage);
+		to   =  iconv_open(dosCp, wcharCp);
+	}
+	if(to == (iconv_t)-1) {
+		/* remember the reason before iconv_close() can overwrite it */
+		int err = errno;
+		iconv_close(from);
+		fprintf(stderr, "Error converting to codepage %d %s\n",
+			codepage, strerror(err));
+		return NULL;
+	}
+
+	ret = New(doscp_t);
+	if(ret == NULL) {
+		/* nobody else knows about the descriptors yet: release them */
+		iconv_close(to);
+		iconv_close(from);
+		return NULL;
+	}
+	ret->from = from;
+	ret->to   = to;
+	return ret;
+}
+
+/*
+ * Releases a converter pair obtained from cp_open(): closes both iconv
+ * descriptors and frees the structure.  A NULL cp is ignored, in the
+ * same way free(NULL) is, so the result of a failed cp_open() can be
+ * handed in without a check.
+ */
+void cp_close(doscp_t *cp)
+{
+	if(cp == NULL)
+		return;
+	iconv_close(cp->to);
+	iconv_close(cp->from);
+	free(cp);
+}
+
+/*
+ * Converts len bytes of DOS-codepage text to wide characters.
+ *
+ * cp:    converter pair from cp_open()
+ * dos:   input bytes in the DOS codepage
+ * wchar: output buffer; must hold len wide characters plus the
+ *        terminating L'\0' that is appended here
+ * len:   number of input bytes
+ *
+ * Returns the number of wide characters produced, or -1 when iconv
+ * reports an error (no terminator is written in that case).
+ */
+int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
+{
+	size_t res;
+	size_t in_len=len;
+	size_t out_len=len*sizeof(wchar_t);
+	/* iconv advances a char* cursor; use a real char* instead of
+	 * viewing a wchar_t* variable through a (char **) cast */
+	char *out=(char *)wchar;
+	size_t produced;
+
+	res=iconv(cp->from, &dos, &in_len, &out, &out_len);
+	/* iconv returns size_t: failure is (size_t)-1, never "< 0" */
+	if(res == (size_t)-1)
+		return -1;
+
+	produced = (size_t)(out - (char *)wchar) / sizeof(wchar_t);
+	wchar[produced] = L'\0';
+	return (int)produced;
+}
+
+/**
+ * Converts len wide characters to the target encoding of conv.
+ *
+ * conv:    iconv descriptor whose source encoding is the wchar_t one
+ * wchar:   input wide characters
+ * dest:    output buffer; caller's responsibility to provide at least
+ *          len*4 bytes
+ * len:     number of wide characters to convert
+ * mangled: bit 0 is or-ed in whenever a character could not be
+ *          translated and '_' was stored in its place
+ *
+ * Returns the number of bytes written to dest.  No terminator is added.
+ */
+static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
+		      size_t len, int *mangled)
+{
+	size_t r;
+	size_t i;
+	size_t in_len=len*sizeof(wchar_t);
+	size_t out_len=len*4;
+	/* iconv takes a non-const char** input cursor but only reads it */
+	char *in = (char *)wchar;
+	char *dptr = dest;
+
+	while(in_len > 0) {
+		r=iconv(conv, &in, &in_len, &dptr, &out_len);
+		if(r != (size_t)-1 || errno != EILSEQ) {
+			/* everything transformed, or error that is _not_ a bad
+			 * character */
+			break;
+		}
+		/* iconv stopped in front of a character it cannot convert */
+		*mangled |= 1;
+
+		if(out_len == 0 || in_len < sizeof(wchar_t))
+			break;	/* no room for, or nothing to, substitute */
+		*dptr++ = '_';
+		out_len--;
+
+		/* skip exactly one wide character.  in_len counts bytes, so
+		 * it has to shrink by sizeof(wchar_t) together with the
+		 * cursor; shrinking it by 1 would make the next call read
+		 * past the end of the input */
+		in += sizeof(wchar_t);
+		in_len -= sizeof(wchar_t);
+	}
+
+	len = (size_t)(dptr-dest); /* how many dest characters have there
+				      been generated */
+
+	/* eliminate question marks which might have been formed by
+	   untransliterable characters */
+	for(i=0; i<len; i++) {
+		if(dest[i] == '?') {
+			dest[i] = '_';
+			*mangled |= 1;
+		}
+	}
+	return (int)len;
+}
+
+/*
+ * Converts len wide characters to the DOS codepage of cp and stores
+ * the bytes in dos.  The conversion is delegated to safe_iconv() on the
+ * cp->to descriptor, which reports replaced characters through
+ * *mangled.  The byte count computed by safe_iconv() is not passed on.
+ */
+void wchar_to_dos(doscp_t *cp,
+		  wchar_t *wchar, char *dos, size_t len, int *mangled)
+{
+	iconv_t toDos = cp->to;
+
+	(void) safe_iconv(toDos, wchar, dos, len, mangled);
+}
+
+#else
+
+#include "codepage.h"
+
+/*
+ * Conversion state for one DOS codepage (table build, used when iconv
+ * is not available).
+ */
+struct doscp_t {
+	/* set by cp_open() to the tounix table of the matching codepages[]
+	 * entry; indexed with (DOS byte & 0x7f) */
+	unsigned char *from_dos;
+	/* reverse table filled in by cp_open(), indexed with
+	 * (native byte & 0x7f); wchar_to_dos() treats a 0 entry as
+	 * "no DOS equivalent" */
+	unsigned char to_dos[0x80];
+};
+
+/*
+ * Builds the translation tables for a DOS codepage (table build).
+ *
+ * codepage: numeric DOS codepage; 0 selects 850.  The number must be
+ *           present in the codepages[] table.
+ *
+ * Returns a newly allocated doscp_t that the caller releases with
+ * cp_close(), or NULL if the allocation fails or the codepage is
+ * unknown (the latter is reported on stderr).
+ */
+doscp_t *cp_open(int codepage)
+{
+	doscp_t *ret;
+	int i;
+	Codepage_t *cp;
+
+	if(codepage == 0)
+		codepage = 850;
+
+	ret = New(doscp_t);
+	if(ret == NULL)
+		return ret;
+
+	/* Both fields are tested for 0 later on (from_dos below, to_dos
+	 * entries in wchar_to_dos()).  Whether New() hands out zeroed
+	 * memory is not visible here, so start from a known state. */
+	ret->from_dos = NULL;
+	memset(ret->to_dos, 0, sizeof(ret->to_dos));
+
+	for(cp=codepages; cp->nr ; cp++) {
+		if(cp->nr == codepage) {
+			ret->from_dos = cp->tounix;
+			break;
+		}
+	}
+
+	if(ret->from_dos == NULL) {
+		fprintf(stderr, "Bad codepage %d\n", codepage);
+		free(ret);
+		return NULL;
+	}
+
+	/* invert from_dos: for every DOS byte 0x80|i whose native
+	 * counterpart has the high bit set, record the way back */
+	for(i=0; i<0x80; i++) {
+		unsigned char native = ret->from_dos[i];
+		if(! (native & 0x80))
+			continue;
+		ret->to_dos[native & 0x7f] = (unsigned char)(0x80 | i);
+	}
+	return ret;
+}
+
+/*
+ * Frees a doscp_t obtained from cp_open().  In the table build the
+ * structure holds no allocation of its own (from_dos points into the
+ * codepages[] table), so freeing the structure itself is all there is
+ * to do.
+ */
+void cp_close(doscp_t *cp)
+{
+	free(cp);
+}
+
+/*
+ * Converts DOS-codepage bytes to wide characters using the from_dos
+ * table of cp (table build).
+ *
+ * cp:    tables from cp_open()
+ * dos:   input bytes; conversion stops at the first NUL byte or after
+ *        len bytes, whichever comes first
+ * wchar: output buffer; must hold len wide characters plus the
+ *        terminating L'\0' that is appended here
+ * len:   maximum number of bytes to convert
+ *
+ * Bytes below 0x80 are copied unchanged; only bytes with the high bit
+ * set are looked up, because from_dos describes the upper half of the
+ * codepage only.
+ *
+ * Returns the number of wide characters produced.
+ */
+int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len)
+{
+	size_t i;
+
+	for(i=0; i<len && dos[i]; i++) {
+		/* unsigned, so the test does not depend on char signedness */
+		unsigned char c = (unsigned char)dos[i];
+		if(c & 0x80)
+			wchar[i] = cp->from_dos[c & 0x7f];
+		else
+			wchar[i] = c;
+	}
+	wchar[i] = L'\0';
+	return (int)i;
+}
+
+
+/*
+ * Converts wide characters to DOS-codepage bytes using the to_dos
+ * table of cp (table build).
+ *
+ * cp:      tables from cp_open()
+ * wchar:   input; conversion stops at the first L'\0' or after len
+ *          characters, whichever comes first
+ * dos:     output buffer for up to len bytes; no terminator is added
+ * len:     maximum number of characters to convert
+ * mangled: bit 0 is or-ed in whenever a character has no equivalent in
+ *          the codepage and '_' is stored in its place
+ *
+ * Values below 0x80 are copied unchanged, values 0x80..0xff are looked
+ * up in to_dos, and anything larger is replaced: the table only knows
+ * single-byte native characters.
+ */
+void wchar_to_dos(doscp_t *cp,
+		  wchar_t *wchar, char *dos, size_t len, int *mangled)
+{
+	size_t i;
+	for(i=0; i<len && wchar[i]; i++) {
+		long wc = (long)wchar[i];
+		unsigned char d = 0;
+
+		/* a high byte that went through the char-based mbrtowc()
+		 * fallback of this file may arrive sign-extended */
+		if(wc < 0 && wc >= -0x80)
+			wc &= 0xff;
+
+		if(wc >= 0 && wc < 0x80) {
+			dos[i] = (char)wc;
+			continue;
+		}
+		/* do not truncate: a code point above 0xff must not alias
+		 * onto whatever its low byte happens to be */
+		if(wc >= 0x80 && wc <= 0xff)
+			d = cp->to_dos[wc & 0x7f];
+		if(d == 0) {
+			d = '_';
+			*mangled |= 1;
+		}
+		dos[i] = (char)d;
+	}
+}
+
+#endif
+
+
+#ifndef HAVE_WCHAR_H
+
+/* Minimal stand-ins for systems without <wchar.h>: every character is
+ * a single byte and there is no conversion state to keep. */
+typedef int mbstate_t;
+
+/*
+ * Single-byte replacement for wcrtomb(): stores the low byte of wc in
+ * *s and returns 1 (the number of bytes written).  ps is ignored.
+ */
+static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
+{
+	(void) ps;
+	*s = (char) wc;
+	return 1;
+}
+
+/*
+ * Single-byte replacement for mbrtowc(): stores the byte at s, taken
+ * as an unsigned value so that high bytes do not turn negative, in
+ * *pwc.  Returns 0 if that byte is NUL and 1 otherwise, as the standard
+ * function does, so loops that stop on a 0 result end at the string
+ * terminator.  n and ps are ignored.
+ */
+static inline size_t mbrtowc(wchar_t *pwc, const char *s, 
+			     size_t n, mbstate_t *ps)
+{
+	(void) n;
+	(void) ps;
+	*pwc = (unsigned char) *s;
+	return *s ? 1 : 0;
+}
+
+#endif
+
+#ifdef HAVE_ICONV_H
+
+#include <langinfo.h>
+
+/* wchar_t -> locale charset converter; opened on first use by
+ * initialize_to_native() and kept for the rest of the run */
+static iconv_t to_native = NULL;
+
+/*
+ * Opens to_native if that has not happened yet.  The target charset is
+ * the one reported by nl_langinfo(CODESET); it is first tried with
+ * "//TRANSLIT" appended and then without.  The process exits with
+ * status 1 if no wchar_t encoding is known or if neither variant can
+ * be opened (the latter after a message on stderr).
+ */
+static void initialize_to_native(void)
+{
+	char *codeset;
+	char *target;
+	int size;
+
+	if(to_native != NULL)
+		return;
+
+	codeset = nl_langinfo(CODESET);
+	/* room for the codeset name, "//TRANSLIT" and the terminator */
+	size = strlen(codeset) + 11;
+	if(getWcharCp() == NULL)
+		exit(1);
+
+	target = safe_malloc(size);
+	strcpy(target, codeset);
+	strcat(target, "//TRANSLIT");
+
+	to_native = iconv_open(target, wcharCp);
+	if(to_native == (iconv_t) -1) {
+		/* retry without transliteration */
+		to_native = iconv_open(codeset, wcharCp);
+		if(to_native == (iconv_t) -1) {
+			fprintf(stderr, "Could not allocate iconv for %s\n",
+				target);
+			free(target);
+			exit(1);
+		}
+	}
+	free(target);
+}
+
+
+
+#endif
+
+
+/**
+ * Convert wchar string to native, converting at most len wchar characters
+ * (fewer if a L'\0' is found earlier).  The result in native is
+ * NUL-terminated; the caller provides the room for it.  With iconv,
+ * safe_iconv() may use up to 4 bytes per wide character, so native
+ * needs len*4 + 1 bytes.  Characters that cannot be represented are
+ * replaced by '_'.
+ * Returns number of generated native characters, or -1 if the non-iconv
+ * path hits a conversion error other than an unrepresentable character.
+ */
+int wchar_to_native(const wchar_t *wchar, char *native, size_t len)
+{
+#ifdef HAVE_ICONV_H
+	/* safe_iconv() or-s bits into this flag, so it must start out
+	 * with a defined value even though it is not looked at here */
+	int mangled = 0;
+	int r;
+	initialize_to_native();
+	len = wcsnlen(wchar,len);
+	r=safe_iconv(to_native, wchar, native, len, &mangled);
+	native[r]='\0';
+	return r;
+#else
+	size_t i;
+	char *dptr = native;
+	mbstate_t ps;
+	memset(&ps, 0, sizeof(ps));
+	for(i=0; i<len && wchar[i] != 0; i++) {
+		/* wcrtomb returns size_t: failure is (size_t)-1 */
+		size_t r = wcrtomb(dptr, wchar[i], &ps);
+		if(r == (size_t) -1) {
+			if(errno != EILSEQ)
+				return -1;
+			/* unrepresentable character: substitute it and go on
+			 * from a clean conversion state */
+			*dptr='_';
+			memset(&ps, 0, sizeof(ps));
+			r=1;
+		}
+		dptr+=r;
+	}
+	*dptr='\0';
+	return (int)(dptr-native);
+#endif
+}
+
+/**
+ * Convert native string to wchar string, generating at most len wchar
+ * characters. If end is supplied, stop conversion when source pointer
+ * reaches end; otherwise stop at the terminating NUL of native.
+ * wchar must have room for len characters plus the L'\0' appended here.
+ * If mangled is supplied, 3 is or-ed into it when input was left over
+ * (end not reached, or len exhausted before the end of the string).
+ * Returns number of generated wchars
+ */
+int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
+		    const char *end, int *mangled)
+{
+	mbstate_t ps;
+	size_t i;
+	memset(&ps, 0, sizeof(ps));
+
+	/* test end for NULL before comparing against it */
+	for(i=0; i<len && (!end || native < end); i++) {
+		/* Limit on the input bytes mbrtowc may examine: up to end if
+		 * given, else one full multibyte character.  This is
+		 * unrelated to len, which counts output characters. */
+		size_t avail = end ? (size_t)(end - native)
+				   : (size_t)MB_CUR_MAX;
+		size_t r = mbrtowc(wchar+i, native, avail, &ps);
+		if(r == (size_t)-1 || r == (size_t)-2) {
+			/* Unconvertible character. Just pretend it's Latin1
+			   encoded (if valid Latin1 character) or substitute
+			   with an underscore if not
+			*/
+			char c = *native;
+			if(c >= '\xa0' && c < '\xff')
+				wchar[i] = c & 0xff;
+			else
+				wchar[i] = '_';
+			memset(&ps, 0, sizeof(ps));
+			r=1;
+		}
+		if(r == 0)
+			break;	/* converted the terminating NUL */
+		native += r;
+	}
+	if(mangled && ((end && native < end) || (!end && *native &&  i == len)))
+		*mangled |= 3;
+	wchar[i]='\0';
+	return (int)i;
+}
+