| /* Copyright 2008,2009 Alain Knaff. |
| * This file is part of mtools. |
| * |
| * Mtools is free software: you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation, either version 3 of the License, or |
| * (at your option) any later version. |
| * |
| * Mtools is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with Mtools. If not, see <http://www.gnu.org/licenses/>. |
| * |
| * Various character set conversions used by mtools |
| */ |
| #include "sysincludes.h" |
| #include "msdos.h" |
| #include "mtools.h" |
| |
| #include <stdio.h> |
| #include <errno.h> |
| #include <stdlib.h> |
| #include "file_name.h" |
| |
| |
| #ifdef HAVE_ICONV_H |
| #include <iconv.h> |
| |
| struct doscp_t { |
| iconv_t from; |
| iconv_t to; |
| }; |
| |
| static const char *wcharCp=NULL; |
| |
| static const char* wcharTries[] = { |
| "WCHAR_T", |
| "UTF-32BE", "UTF-32LE", |
| "UTF-16BE", "UTF-16LE", |
| "UTF-32", "UTF-16", |
| "UCS-4BE", "UCS-4LE", |
| "UCS-2BE", "UCS-2LE", |
| "UCS-4", "UCS-2" |
| }; |
| |
| static const char *asciiTries[] = { |
| "ASCII", "ASCII-GR", "ISO8859-1" |
| }; |
| |
| static const wchar_t *testString = L"ab"; |
| |
| static int try(const char *testCp) { |
| size_t res; |
| char *inbuf = (char *)testString; |
| size_t inbufLen = 2*sizeof(wchar_t); |
| char outbuf[3]; |
| char *outbufP = outbuf; |
| size_t outbufLen = 2*sizeof(char); |
| iconv_t test = 0; |
| size_t i; |
| |
| for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) { |
| test = iconv_open(asciiTries[i], testCp); |
| if(test != (iconv_t) -1) |
| break; |
| } |
| if(test == (iconv_t) -1) |
| goto fail0; |
| res = iconv(test, |
| &inbuf, &inbufLen, |
| &outbufP, &outbufLen); |
| if(res != 0 || outbufLen != 0 || inbufLen != 0) |
| goto fail; |
| if(memcmp(outbuf, "ab", 2)) |
| goto fail; |
| /* fprintf(stderr, "%s ok\n", testCp); */ |
| return 1; |
| fail: |
| iconv_close(test); |
| fail0: |
| /*fprintf(stderr, "%s fail\n", testCp);*/ |
| return 0; |
| } |
| |
| static const char *getWcharCp(void) { |
| unsigned int i; |
| if(wcharCp != NULL) |
| return wcharCp; |
| for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) { |
| if(try(wcharTries[i])) |
| return (wcharCp=wcharTries[i]); |
| } |
| fprintf(stderr, "No codepage found for wchar_t\n"); |
| return NULL; |
| } |
| |
| |
| doscp_t *cp_open(unsigned int codepage) |
| { |
| char dosCp[17]; |
| doscp_t *ret; |
| iconv_t from; |
| iconv_t to; |
| |
| if(codepage == 0) |
| codepage = mtools_default_codepage; |
| if(codepage > 9999) { |
| fprintf(stderr, "Bad codepage %d\n", codepage); |
| return NULL; |
| } |
| |
| if(getWcharCp() == NULL) |
| return NULL; |
| |
| sprintf(dosCp, "CP%d", codepage); |
| from = iconv_open(wcharCp, dosCp); |
| if(from == (iconv_t)-1) { |
| fprintf(stderr, "Error converting to codepage %d %s\n", |
| codepage, strerror(errno)); |
| return NULL; |
| } |
| |
| sprintf(dosCp, "CP%d//TRANSLIT", codepage); |
| to = iconv_open(dosCp, wcharCp); |
| if(to == (iconv_t)-1) { |
| /* Transliteration not supported? */ |
| sprintf(dosCp, "CP%d", codepage); |
| to = iconv_open(dosCp, wcharCp); |
| } |
| if(to == (iconv_t)-1) { |
| iconv_close(from); |
| fprintf(stderr, "Error converting to codepage %d %s\n", |
| codepage, strerror(errno)); |
| return NULL; |
| } |
| |
| ret = New(doscp_t); |
| if(ret == NULL) |
| return ret; |
| ret->from = from; |
| ret->to = to; |
| return ret; |
| } |
| |
| void cp_close(doscp_t *cp) |
| { |
| iconv_close(cp->to); |
| iconv_close(cp->from); |
| free(cp); |
| } |
| |
| size_t dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len) |
| { |
| size_t r; |
| size_t in_len=len; |
| size_t out_len=len*sizeof(wchar_t); |
| wchar_t *dptr=wchar; |
| char *dos2 = (char *) dos; /* Magic to be able to call iconv with its |
| buggy prototype */ |
| r=iconv(cp->from, &dos2, &in_len, (char **)&dptr, &out_len); |
| if(r == (size_t) -1) |
| return r; |
| *dptr = L'\0'; |
| return (size_t) (dptr-wchar); |
| } |
| |
| /** |
| * Converts len wide character to destination. Caller's responsibility to |
| * ensure that dest is large enough. |
| * mangled will be set if there has been an untranslatable character. |
| */ |
| static size_t safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest, |
| size_t in_len, size_t out_len, int *mangled) |
| { |
| size_t r; |
| unsigned int i; |
| char *dptr = dest; |
| size_t len; |
| |
| in_len=in_len*sizeof(wchar_t); |
| |
| while(in_len > 0 && out_len > 0) { |
| r=iconv(conv, (char**)&wchar, &in_len, &dptr, &out_len); |
| if(r == (size_t) -1 || errno != EILSEQ) { |
| /* everything transformed, or error that is _not_ a bad |
| * character */ |
| break; |
| } |
| *mangled |= 1; |
| |
| if(out_len <= 0) |
| break; |
| if(dptr) |
| *dptr++ = '_'; |
| in_len -= sizeof(wchar_t); |
| |
| wchar++; |
| out_len--; |
| } |
| |
| len = (size_t) (dptr-dest); /* how many dest characters have there been |
| generated */ |
| |
| /* eliminate question marks which might have been formed by |
| untransliterable characters */ |
| for(i=0; i<len; i++) { |
| if(dest[i] == '?') { |
| dest[i] = '_'; |
| *mangled |= 1; |
| } |
| } |
| return len; |
| } |
| |
| void wchar_to_dos(doscp_t *cp, |
| wchar_t *wchar, char *dos, size_t len, int *mangled) |
| { |
| safe_iconv(cp->to, wchar, dos, len, len, mangled); |
| } |
| |
| #else |
| |
| #include "codepage.h" |
| |
| struct doscp_t { |
| unsigned char *from_dos; |
| unsigned char to_dos[0x80]; |
| }; |
| |
| doscp_t *cp_open(unsigned int codepage) |
| { |
| doscp_t *ret; |
| int i; |
| Codepage_t *cp; |
| |
| if(codepage == 0) |
| codepage = 850; |
| |
| ret = New(doscp_t); |
| if(ret == NULL) |
| return ret; |
| |
| for(cp=codepages; cp->nr ; cp++) |
| if(cp->nr == codepage) { |
| ret->from_dos = cp->tounix; |
| break; |
| } |
| |
| if(ret->from_dos == NULL) { |
| fprintf(stderr, "Bad codepage %d\n", codepage); |
| free(ret); |
| return NULL; |
| } |
| |
| for(i=0; i<0x80; i++) { |
| char native = ret->from_dos[i]; |
| if(! (native & 0x80)) |
| continue; |
| ret->to_dos[native & 0x7f] = 0x80 | i; |
| } |
| return ret; |
| } |
| |
| void cp_close(doscp_t *cp) |
| { |
| free(cp); |
| } |
| |
| size_t dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len) |
| { |
| int i; |
| |
| for(i=0; i<len && dos[i]; i++) { |
| char c = dos[i]; |
| if(c >= ' ' && c <= '~') |
| wchar[i] = c; |
| else { |
| wchar[i] = cp->from_dos[c & 0x7f]; |
| } |
| } |
| wchar[i] = '\0'; |
| return i; |
| } |
| |
| |
| void wchar_to_dos(doscp_t *cp, |
| wchar_t *wchar, char *dos, size_t len, int *mangled) |
| { |
| int i; |
| for(i=0; i<len && wchar[i]; i++) { |
| char c = wchar[i]; |
| if(c >= ' ' && c <= '~') |
| dos[i] = c; |
| else { |
| dos[i] = cp->to_dos[c & 0x7f]; |
| if(dos[i] == '\0') { |
| dos[i]='_'; |
| *mangled=1; |
| } |
| } |
| } |
| } |
| |
| #endif |
| |
| |
| #ifndef HAVE_WCHAR_H |
| |
| typedef int mbstate_t; |
| |
| static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps) |
| { |
| *s = wc; |
| return 1; |
| } |
| |
| static inline size_t mbrtowc(wchar_t *pwc, const char *s, |
| size_t n, mbstate_t *ps) |
| { |
| *pwc = *s; |
| return 1; |
| } |
| |
| #endif |
| |
| #ifdef HAVE_ICONV_H |
| |
| #include <langinfo.h> |
| |
| static iconv_t to_native = NULL; |
| |
| static void initialize_to_native(void) |
| { |
| char *li, *cp; |
| size_t len; |
| if(to_native != NULL) |
| return; |
| li = nl_langinfo(CODESET); |
| len = strlen(li) + 11; |
| if(getWcharCp() == NULL) |
| exit(1); |
| cp = safe_malloc(len); |
| strcpy(cp, li); |
| strcat(cp, "//TRANSLIT"); |
| to_native = iconv_open(cp, wcharCp); |
| if(to_native == (iconv_t) -1) |
| to_native = iconv_open(li, wcharCp); |
| if(to_native == (iconv_t) -1) |
| fprintf(stderr, "Could not allocate iconv for %s\n", cp); |
| free(cp); |
| if(to_native == (iconv_t) -1) |
| exit(1); |
| } |
| |
| |
| |
| #endif |
| |
| |
| /** |
| * Convert wchar string to native, converting at most len wchar characters |
| * Returns number of generated native characters |
| */ |
| size_t wchar_to_native(const wchar_t *wchar, char *native, size_t len, |
| size_t out_len) |
| { |
| #ifdef HAVE_ICONV_H |
| int mangled; |
| size_t r; |
| initialize_to_native(); |
| len = wcsnlen(wchar,len); |
| r=safe_iconv(to_native, wchar, native, len, out_len, &mangled); |
| native[r]='\0'; |
| return r; |
| #else |
| int i; |
| char *dptr = native; |
| mbstate_t ps; |
| memset(&ps, 0, sizeof(ps)); |
| for(i=0; i<len && wchar[i] != 0; i++) { |
| size_t r = wcrtomb(dptr, wchar[i], &ps); |
| if(r == (size_t) -1 && errno == EILSEQ) { |
| r=1; |
| *dptr='_'; |
| } |
| dptr+=r; |
| } |
| *dptr='\0'; |
| return dptr-native; |
| #endif |
| } |
| |
| /** |
| * Convert native string to wchar string, generating at most len wchar |
| * characters. If end is supplied, stop conversion when source pointer |
| * exceeds end. Returns number of generated wchars |
| */ |
| size_t native_to_wchar(const char *native, wchar_t *wchar, size_t len, |
| const char *end, int *mangled) |
| { |
| mbstate_t ps; |
| unsigned int i; |
| memset(&ps, 0, sizeof(ps)); |
| |
| for(i=0; i<len && (native < end || !end); i++) { |
| size_t r = mbrtowc(wchar+i, native, len, &ps); |
| if(r == (size_t) -1) { |
| /* Unconvertible character. Just pretend it's Latin1 |
| encoded (if valid Latin1 character) or substitute |
| with an underscore if not |
| */ |
| char c = *native; |
| if(c >= '\xa0' && c < '\xff') |
| wchar[i] = c & 0xff; |
| else |
| wchar[i] = '_'; |
| memset(&ps, 0, sizeof(ps)); |
| r=1; |
| } |
| if(r == 0) |
| break; |
| native += r; |
| } |
| if(mangled && ((end && native < end) || (!end && *native && i == len))) |
| *mangled |= 3; |
| wchar[i]='\0'; |
| return i; |
| } |
| |