diff options
Diffstat (limited to 'charsetConv.c')
-rw-r--r-- | charsetConv.c | 428 |
1 files changed, 428 insertions, 0 deletions
diff --git a/charsetConv.c b/charsetConv.c new file mode 100644 index 0000000..0dc1b6d --- /dev/null +++ b/charsetConv.c @@ -0,0 +1,428 @@ +/* Copyright 2008,2009 Alain Knaff. + * This file is part of mtools. + * + * Mtools is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Mtools is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Mtools. If not, see <http://www.gnu.org/licenses/>. + * + * Various character set conversions used by mtools + */ +#include "sysincludes.h" +#include "msdos.h" +#include "mtools.h" + +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include "file_name.h" + + +#ifdef HAVE_ICONV_H +#include <iconv.h> + +struct doscp_t { + iconv_t from; + iconv_t to; +}; + +static const char *wcharCp=NULL; + +static const char* wcharTries[] = { + "WCHAR_T", + "UTF-32BE", "UTF-32LE", + "UTF-16BE", "UTF-16LE", + "UTF-32", "UTF-16", + "UCS-4BE", "UCS-4LE", + "UCS-2BE", "UCS-2LE", + "UCS-4", "UCS-2" +}; + +static const char *asciiTries[] = { + "ASCII", "ASCII-GR", "ISO8859-1" +}; + +static const wchar_t *testString = L"ab"; + +static int try(const char *testCp) { + size_t res; + char *inbuf = (char *)testString; + size_t inbufLen = 2*sizeof(wchar_t); + char outbuf[3]; + char *outbufP = outbuf; + size_t outbufLen = 2*sizeof(char); + iconv_t test; + int i; + + for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) { + test = iconv_open(asciiTries[i], testCp); + if(test != (iconv_t) -1) + break; + } + if(test == (iconv_t) -1) + goto fail0; + res = iconv(test, + &inbuf, &inbufLen, + &outbufP, &outbufLen); + if(res != 0 || outbufLen != 0 || inbufLen != 0) + goto fail; + if(memcmp(outbuf, "ab", 2)) + goto fail; + /* fprintf(stderr, "%s ok\n", testCp); */ + return 1; + fail: + iconv_close(test); + fail0: + /*fprintf(stderr, "%s fail\n", testCp);*/ + return 0; +} + +static const char *getWcharCp(void) { + unsigned int i; + if(wcharCp != NULL) + return wcharCp; + for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) { + if(try(wcharTries[i])) + return (wcharCp=wcharTries[i]); + } + fprintf(stderr, "No codepage found for wchar_t\n"); + return NULL; +} + + +doscp_t *cp_open(int codepage) +{ + char dosCp[17]; + doscp_t *ret; + iconv_t *from; + iconv_t *to; + + if(codepage == 0) + codepage = mtools_default_codepage; + if(codepage < 0 || codepage > 9999) { + fprintf(stderr, "Bad codepage %d\n", codepage); + return NULL; + } + + if(getWcharCp() == NULL) + return NULL; + + sprintf(dosCp, "CP%d", codepage); + from = iconv_open(wcharCp, dosCp); + if(from == (iconv_t)-1) { + fprintf(stderr, "Error converting to codepage %d %s\n", + codepage, strerror(errno)); + return NULL; + } + + sprintf(dosCp, "CP%d//TRANSLIT", codepage); + to = iconv_open(dosCp, wcharCp); + if(to == (iconv_t)-1) { + /* Transliteration not supported? */ + sprintf(dosCp, "CP%d", codepage); + to = iconv_open(dosCp, wcharCp); + } + if(to == (iconv_t)-1) { + iconv_close(from); + fprintf(stderr, "Error converting to codepage %d %s\n", + codepage, strerror(errno)); + return NULL; + } + + ret = New(doscp_t); + if(ret == NULL) + return ret; + ret->from = from; + ret->to = to; + return ret; +} + +void cp_close(doscp_t *cp) +{ + iconv_close(cp->to); + iconv_close(cp->from); + free(cp); +} + +int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len) +{ + int r; + size_t in_len=len; + size_t out_len=len*sizeof(wchar_t); + wchar_t *dptr=wchar; + r=iconv(cp->from, &dos, &in_len, (char **)&dptr, &out_len); + if(r < 0) + return r; + *dptr = L'\0'; + return dptr-wchar; +} + +/** + * Converts len wide character to destination. Caller's responsibility to + * ensure that dest is large enough. + * mangled will be set if there has been an untranslatable character. + */ +static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest, + size_t len, int *mangled) +{ + int r; + unsigned int i; + size_t in_len=len*sizeof(wchar_t); + size_t out_len=len*4; + char *dptr = dest; + + while(in_len > 0) { + r=iconv(conv, (char**)&wchar, &in_len, &dptr, &out_len); + if(r >= 0 || errno != EILSEQ) { + /* everything transformed, or error that is _not_ a bad + * character */ + break; + } + *mangled |= 1; + + if(dptr) + *dptr++ = '_'; + in_len--; + + wchar++; + out_len--; + } + + len = dptr-dest; /* how many dest characters have there been + generated */ + + /* eliminate question marks which might have been formed by + untransliterable characters */ + for(i=0; i<len; i++) { + if(dest[i] == '?') { + dest[i] = '_'; + *mangled |= 1; + } + } + return len; +} + +void wchar_to_dos(doscp_t *cp, + wchar_t *wchar, char *dos, size_t len, int *mangled) +{ + safe_iconv(cp->to, wchar, dos, len, mangled); +} + +#else + +#include "codepage.h" + +struct doscp_t { + unsigned char *from_dos; + unsigned char to_dos[0x80]; +}; + +doscp_t *cp_open(int codepage) +{ + doscp_t *ret; + int i; + Codepage_t *cp; + + if(codepage == 0) + codepage = 850; + + ret = New(doscp_t); + if(ret == NULL) + return ret; + + for(cp=codepages; cp->nr ; cp++) + if(cp->nr == codepage) { + ret->from_dos = cp->tounix; + break; + } + + if(ret->from_dos == NULL) { + fprintf(stderr, "Bad codepage %d\n", codepage); + free(ret); + return NULL; + } + + for(i=0; i<0x80; i++) { + char native = ret->from_dos[i]; + if(! (native & 0x80)) + continue; + ret->to_dos[native & 0x7f] = 0x80 | i; + } + return ret; +} + +void cp_close(doscp_t *cp) +{ + free(cp); +} + +int dos_to_wchar(doscp_t *cp, char *dos, wchar_t *wchar, size_t len) +{ + int i; + + for(i=0; i<len && dos[i]; i++) { + char c = dos[i]; + if(c >= ' ' && c <= '~') + wchar[i] = c; + else { + wchar[i] = cp->from_dos[c & 0x7f]; + } + } + wchar[i] = '\0'; + return i; +} + + +void wchar_to_dos(doscp_t *cp, + wchar_t *wchar, char *dos, size_t len, int *mangled) +{ + int i; + for(i=0; i<len && wchar[i]; i++) { + char c = wchar[i]; + if(c >= ' ' && c <= '~') + dos[i] = c; + else { + dos[i] = cp->to_dos[c & 0x7f]; + if(dos[i] == '\0') { + dos[i]='_'; + *mangled=1; + } + } + } +} + +#endif + + +#ifndef HAVE_WCHAR_H + +typedef int mbstate_t; + +static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps) +{ + *s = wc; + return 1; +} + +static inline size_t mbrtowc(wchar_t *pwc, const char *s, + size_t n, mbstate_t *ps) +{ + *pwc = *s; + return 1; +} + +#endif + +#ifdef HAVE_ICONV_H + +#include <langinfo.h> + +static iconv_t to_native = NULL; + +static void initialize_to_native(void) +{ + char *li, *cp; + int len; + if(to_native != NULL) + return; + li = nl_langinfo(CODESET); + len = strlen(li) + 11; + if(getWcharCp() == NULL) + exit(1); + cp = safe_malloc(len); + strcpy(cp, li); + strcat(cp, "//TRANSLIT"); + to_native = iconv_open(cp, wcharCp); + if(to_native == (iconv_t) -1) + to_native = iconv_open(li, wcharCp); + if(to_native == (iconv_t) -1) + fprintf(stderr, "Could not allocate iconv for %s\n", cp); + free(cp); + if(to_native == (iconv_t) -1) + exit(1); +} + + + +#endif + + +/** + * Convert wchar string to native, converting at most len wchar characters + * Returns number of generated native characters + */ +int wchar_to_native(const wchar_t *wchar, char *native, size_t len) +{ +#ifdef HAVE_ICONV_H + int mangled; + int r; + initialize_to_native(); + len = wcsnlen(wchar,len); + r=safe_iconv(to_native, wchar, native, len, &mangled); + native[r]='\0'; + return r; +#else + int i; + char *dptr = native; + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + for(i=0; i<len && wchar[i] != 0; i++) { + int r = wcrtomb(dptr, wchar[i], &ps); + if(r < 0 && errno == EILSEQ) { + r=1; + *dptr='_'; + } + if(r < 0) + return r; + dptr+=r; + } + *dptr='\0'; + return dptr-native; +#endif +} + +/** + * Convert native string to wchar string, generating at most len wchar + * characters. If end is supplied, stop conversion when source pointer + * exceeds end. Returns number of generated wchars + */ +int native_to_wchar(const char *native, wchar_t *wchar, size_t len, + const char *end, int *mangled) +{ + mbstate_t ps; + unsigned int i; + memset(&ps, 0, sizeof(ps)); + + for(i=0; i<len && (native < end || !end); i++) { + int r = mbrtowc(wchar+i, native, len, &ps); + if(r < 0) { + /* Unconvertible character. Just pretend it's Latin1 + encoded (if valid Latin1 character) or substitue + with an underscore if not + */ + char c = *native; + if(c >= '\xa0' && c < '\xff') + wchar[i] = c & 0xff; + else + wchar[i] = '_'; + memset(&ps, 0, sizeof(ps)); + r=1; + } + if(r == 0) + break; + native += r; + } + if(mangled && ((end && native < end) || (!end && *native && i == len))) + *mangled |= 3; + wchar[i]='\0'; + return i; +} + |