/* encdec - encode and decode integers, times, and
 * internationalized strings to and from popular binary formats
 * Copyright (c) 2002 Michael B. Allen
 *
 * The GNU Library General Public License
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA 02111-1307, USA
 */

/* Modified iconv.c to support encdec multi-byte string
 * functions. This module can only be built directly
 * against libiconv-1.7/lib sources.
 */

/* NOTE(review): the five system header names below are missing from this
 * copy of the file (only the bare directives survive) -- restore them from
 * the upstream source before building.
 */
#include
#include
#include
#include
#include
#include "config.h"
#include "libcharset.h"
#include "encdec.h"
#include "defines.h"

/* Larger of two values.  Each argument may be evaluated more than once,
 * so do not pass expressions with side effects.
 */
#define MAX(a, b) ((a) > (b) ? (a) : (b))
/* Byte size used for the scratch buffers declared in this file. */
#define BUFSIZ 512

/* Use the wcwidth() declared here when HAVE_WCWIDTH > 0; otherwise route
 * every wcwidth() call in this file to mk_wcwidth().
 */
#if HAVE_WCWIDTH > 0
int wcwidth(wchar_t ucs);
#else
#define wcwidth mk_wcwidth
int mk_wcwidth(wchar_t ucs);
#endif

/*
 * Consider those system dependent encodings that are needed for the
 * current system.
 */
#ifdef _AIX
#define USE_AIX
#endif
#ifdef __osf__
#define USE_OSF1
#endif
#ifdef __DJGPP__
#define USE_DOS
#endif

/*
 * Data type for general conversion loop.
 *
 * loop_convertn: converts from *inbuf to *outbuf; besides the usual
 *                iconv-style buffer/length pairs it takes a counter `n`
 *                and a flag `zt` (see the loop implementations for their
 *                exact meaning).
 * loop_reset:    takes only the output buffer/length pair.
 */
struct loop_funcs {
  size_t (*loop_convertn) (iconv_t icd,
                           const char* * inbuf, size_t *inbytesleft,
                           char* * outbuf, size_t *outbytesleft,
                           int *n, int zt);
  size_t (*loop_reset) (iconv_t icd,
                        char* * outbuf, size_t *outbytesleft);
};

/*
 * Converters.
 */
#include "converters.h"

/*
 * Transliteration tables.
 */
#include "cjk_variants.h"
#include "translit.h"

/*
 * Table of all supported encodings.
 */
struct encoding {
  struct mbtowc_funcs ifuncs; /* conversion multibyte -> unicode */
  struct wctomb_funcs ofuncs; /* conversion unicode -> multibyte */
  int oflags;                 /* flags for unicode -> multibyte conversion */
};

/* One ei_<name> enumerator per DEFENCODING entry.  The .def files are
 * included in the same order as in all_encodings[] below, so an ei_<name>
 * value is usable as an index into that table.
 */
enum {
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  ei_##xxx ,
#include "encodings.def"
#ifdef USE_AIX
#include "encodings_aix.def"
#endif
#ifdef USE_OSF1
#include "encodings_osf1.def"
#endif
#ifdef USE_DOS
#include "encodings_dos.def"
#endif
#include "encodings_local.def"
#undef DEFENCODING
ei_for_broken_compilers_that_dont_like_trailing_commas
};
#include "flags.h"

/* Second expansion of the same .def files: one table row per encoding,
 * holding its decode functions, encode functions and ei_<name>_oflags.
 */
static struct encoding const all_encodings[] = {
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, ei_##xxx##_oflags },
#include "encodings.def"
#ifdef USE_AIX
#include "encodings_aix.def"
#endif
#ifdef USE_OSF1
#include "encodings_osf1.def"
#endif
#ifdef USE_DOS
#include "encodings_dos.def"
#endif
#undef DEFENCODING
/* Entries from encodings_local.def get an oflags value of 0. */
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, 0 },
#include "encodings_local.def"
#undef DEFENCODING
};

/*
 * Conversion loops.
*/ #include "loops.h" /* Heavily modified libiconv-1.7/lib/loop_unicode.h */ static size_t unicode_loop_convertn(iconv_t icd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int *n, int zt) { conv_t cd = (conv_t)icd; size_t result = 0; const unsigned char *inptr = (const unsigned char *)*inbuf; size_t inleft = *inbytesleft; unsigned char *outptr = (unsigned char *)*outbuf; size_t outleft = *outbytesleft; static unsigned char buf[BUFSIZ]; ucs4_t wc = 1; int incount, outcount; int count = *n, w; if (!count) { return 0; } while (wc && inleft && outleft > zt && count >= 0) { incount = cd->ifuncs.xxx_mbtowc(cd, &wc, inptr, inleft); if (incount < 0) { if (incount == RET_ILSEQ) { errno = EILSEQ; return -1; } if (incount == RET_TOOFEW(0)) { break; } incount = -2 - incount; } else { outcount = cd->ofuncs.xxx_wctomb(cd, buf, wc, MAX(outleft, BUFSIZ)); if (outcount != RET_ILUNI) { goto outcount_ok; } if ((wc >> 7) == (0xe0000 >> 7)) { goto outcount_zero; } result++; if (cd->transliterate) { outcount = unicode_transliterate(cd, wc, buf, MAX(outleft, BUFSIZ)); if (outcount != RET_ILUNI) { goto outcount_ok; } } outcount = cd->ofuncs.xxx_wctomb(cd, buf, EINSEQ_SUBS, MAX(outleft, BUFSIZ)); if (outcount != RET_ILUNI) { goto outcount_ok; } errno = EILSEQ; return -1; outcount_ok: if (outcount < 0 || outcount > outleft || (zt && wc && outcount == outleft)) { break; } if (wc == 0 || (w = wcwidth(wc)) == -1) { w = 1; } if (w > count) { /* this allows count to be 0 while the loop * continues so we pick up trailing combining chars */ break; } count -= w; if (*outbuf) { memmove(outptr, buf, outcount); } outptr += outcount; outleft -= outcount; outcount_zero: ; } if (!(incount <= inleft)) abort(); inptr += incount; inleft -= incount; } if (*outbuf) { if (zt && wc) { cd->ofuncs.xxx_wctomb(cd, outptr, 0, 1); } *inbuf = (const char *)inptr; } else if (wc || count == 0) { /* if dst is NULL add a 1 to the return * value but only if wc wasn't zero already in which 
case 1 will * already have been added OR if count is 0 because we could have read * a trailing '\0' acedentally which would make wc 0. */ outptr++; } *inbytesleft = inleft; *outbuf = outptr; *outbytesleft = outleft; *n = count; return result; } /* Copied from libiconv-1.7/lib/loop_wchar.h */ #if HAVE_WCRTOMB /* From wchar_t to anything else. */ static size_t wchar_from_loop_convertn (iconv_t icd, const char* * inbuf, size_t *inbytesleft, char* * outbuf, size_t *outbytesleft, int *n, int zt) { struct wchar_conv_struct * wcd = (struct wchar_conv_struct *) icd; size_t result = 0; while (*inbytesleft >= sizeof(wchar_t)) { const wchar_t * inptr = (const wchar_t *) *inbuf; size_t inleft = *inbytesleft; char buf[BUFSIZ]; #if HAVE_MBSTATE > 0 mbstate_t state = wcd->state; #endif size_t bufcount = 0; while (inleft >= sizeof(wchar_t)) { /* Convert one wchar_t to multibyte representation. */ #if HAVE_MBSTATE > 0 size_t count = wcrtomb(buf+bufcount,*inptr,&state); #else size_t count = wctomb(buf+bufcount,*inptr); #endif if (count == (size_t)(-1)) { /* Invalid input. */ errno = EILSEQ; return -1; } inptr++; inleft -= sizeof(wchar_t); bufcount += count; if (count == 0) { /* Continue, append next wchar_t. */ } else { /* Attempt to convert the accumulated multibyte representations to the target encoding. */ const char* bufptr = buf; size_t bufleft = bufcount; char* outptr = *outbuf; size_t outleft = *outbytesleft; int cnt = 1; size_t res = unicode_loop_convertn(&wcd->parent, &bufptr,&bufleft, &outptr,&outleft,&cnt,zt); if (res == (size_t)(-1)) { if (errno == EILSEQ) /* Invalid input. */ return -1; else if (errno == E2BIG) /* Output buffer too small. */ return -1; else if (errno == EINVAL) { /* Continue, append next wchar_t, but avoid buffer overrun. */ if (bufcount + MB_CUR_MAX > BUFSIZ) abort(); } else abort(); } else { /* Successful conversion. 
*/ #if HAVE_MBSTATE > 0 wcd->state = state; #endif *inbuf = (const char *) inptr; *inbytesleft = inleft; *outbuf = outptr; *outbytesleft = outleft; result += res; *n -= 1; break; } } } } return result; } #endif #if HAVE_MBRTOWC /* From anything else to wchar_t. */ static size_t wchar_to_loop_convertn (iconv_t icd, const char* * inbuf, size_t *inbytesleft, char* * outbuf, size_t *outbytesleft, int *n, int zt) { struct wchar_conv_struct * wcd = (struct wchar_conv_struct *) icd; size_t result = 0; while (*inbytesleft > 0 && *n > 0) { size_t try; for (try = 1; try <= *inbytesleft; try++) { char buf[BUFSIZ]; const char* inptr = *inbuf; size_t inleft = *inbytesleft; char* bufptr = buf; size_t bufleft = BUFSIZ; int cnt = 1; size_t res = unicode_loop_convertn(&wcd->parent, &inptr,&inleft, &bufptr,&bufleft,&cnt, zt); if (res == (size_t)(-1)) { if (errno == EILSEQ) /* Invalid input. */ return -1; else if (errno == EINVAL) { /* Incomplete input. Next try with one more input byte. */ } else /* E2BIG shouldn't occur. */ abort(); } else { /* Successful conversion. */ size_t bufcount = bufptr-buf; /* = BUFSIZ-bufleft */ wchar_t wc; #if defined(HAVE_MBSTATE) mbstate_t state = wcd->state; res = mbrtowc(&wc,buf,bufcount,&state); #else res = mbtowc(&wc,buf,bufcount); #endif if (res == (size_t)(-2)) { /* Next try with one more input byte. */ } else if (res == (size_t)(-1)) { /* Invalid input. */ return -1; } else { if (*outbytesleft < sizeof(wchar_t)) { errno = E2BIG; return -1; } *(wchar_t*) *outbuf = wc; *outbuf += sizeof(wchar_t); *outbytesleft -= sizeof(wchar_t); *inbuf += try; *inbytesleft -= try; result += res; *n -= 1; break; } } } } return result; } #endif /* * Alias lookup function. * Defines * struct alias { const char* name; unsigned int encoding_index; }; * const struct alias * aliases_lookup (const char *str, unsigned int len); * #define MAX_WORD_LENGTH ... */ #include "aliases.h" /* * System dependent alias lookup function. 
* Defines * const struct alias * aliases2_lookup (const char *str); */ #if defined(USE_AIX) || defined(USE_OSF1) || defined(USE_DOS) /* || ... */ static struct alias sysdep_aliases[] = { #ifdef USE_AIX #include "aliases_aix.h" #endif #ifdef USE_OSF1 #include "aliases_osf1.h" #endif #ifdef USE_DOS #include "aliases_dos.h" #endif }; #ifdef __GNUC__ __inline #endif const struct alias * aliases2_lookup (register const char *str) { struct alias * ptr; unsigned int count; for (ptr = sysdep_aliases, count = sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0]); count > 0; ptr++, count--) if (!strcmp(str,ptr->name)) return ptr; return NULL; } #else #define aliases2_lookup(str) NULL #endif #if 0 /* Like !strcasecmp, except that the both strings can be assumed to be ASCII and the first string can be assumed to be in uppercase. */ static int strequal (const char* str1, const char* str2) { unsigned char c1; unsigned char c2; for (;;) { c1 = * (unsigned char *) str1++; c2 = * (unsigned char *) str2++; if (c1 == 0) break; if (c2 >= 'a' && c2 <= 'z') c2 -= 'a'-'A'; if (c1 != c2) break; } return (c1 == c2); } #endif int enc_mbscpy(const char *inbuf, char **outbuf, const char *tocode) { return enc_mbsncpy(inbuf, INT_MAX, outbuf, INT_MAX, INT_MAX, tocode); } int enc_mbsncpy(const char *inbuf, size_t inbytesleft, char **outbuf, size_t outbytesleft, int n, const char *tocode) { struct conv_struct * cd; char buf[MAX_WORD_LENGTH+10+1]; const char* cp, *fromcode; char* bp; const struct alias * ap; unsigned int count; unsigned int from_index; int from_wchar; unsigned int to_index; int to_wchar; int transliterate = 0; int start; if (inbytesleft > INT_MAX) { inbytesleft = INT_MAX; } if (outbytesleft > INT_MAX) { outbytesleft = INT_MAX; } if (n < 0) { n = INT_MAX; } start = n; /* Before calling aliases_lookup, convert the input string to upper case, * and check whether it's entirely ASCII (we call gperf with option "-7" * to achieve a smaller table) and non-empty. 
If it's not entirely ASCII, * or if it's too long, it is not a valid encoding name. */ for (to_wchar = 0;;) { /* Search tocode in the table. */ for (cp = tocode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) { unsigned char c = * (unsigned char *) cp; if (c >= 0x80) goto invalid; if (c >= 'a' && c <= 'z') c -= 'a'-'A'; *bp = c; if (c == '\0') break; if (--count == 0) goto invalid; } if (bp-buf > 10 && memcmp(bp-10,"//TRANSLIT",10)==0) { bp -= 10; *bp = '\0'; transliterate = 1; } ap = aliases_lookup(buf,bp-buf); if (ap == NULL) { ap = aliases2_lookup(buf); if (ap == NULL) goto invalid; } if (ap->encoding_index == ei_local_char) { tocode = locale_charset(); continue; } if (ap->encoding_index == ei_local_wchar_t) { #if __STDC_ISO_10646__ if (sizeof(wchar_t) == 4) { to_index = ei_ucs4internal; break; } if (sizeof(wchar_t) == 2) { to_index = ei_ucs2internal; break; } if (sizeof(wchar_t) == 1) { to_index = ei_iso8859_1; break; } #endif #if HAVE_MBRTOWC to_wchar = 1; tocode = locale_charset(); continue; #endif goto invalid; } to_index = ap->encoding_index; break; } fromcode = locale_charset(); ap = aliases_lookup(fromcode, strlen(fromcode)); if (ap == NULL) { ap = aliases2_lookup(fromcode); if (ap == NULL) goto invalid; } from_wchar = 0; from_index = ap->encoding_index; cd = (struct conv_struct *) malloc(from_wchar != to_wchar ? sizeof(struct wchar_conv_struct) : sizeof(struct conv_struct)); if (cd == NULL) { errno = ENOMEM; return -1; } cd->iindex = from_index; cd->ifuncs = all_encodings[from_index].ifuncs; cd->oindex = to_index; cd->ofuncs = all_encodings[to_index].ofuncs; cd->oflags = all_encodings[to_index].oflags; /* Initialize the loop functions. */ #if HAVE_MBRTOWC if (to_wchar) { cd->lfuncs.loop_convertn = wchar_to_loop_convertn; cd->lfuncs.loop_reset = wchar_to_loop_reset; } else #endif { cd->lfuncs.loop_convertn = unicode_loop_convertn; cd->lfuncs.loop_reset = unicode_loop_reset; } /* Initialize the states. 
*/ memset(&cd->istate,'\0',sizeof(state_t)); memset(&cd->ostate,'\0',sizeof(state_t)); /* Initialize the operation flags. */ cd->transliterate = transliterate; /* Initialize additional fields. */ if (from_wchar != to_wchar) { struct wchar_conv_struct * wcd = (struct wchar_conv_struct *) cd; #if HAVE_MBSTATE > 0 memset(&wcd->state,'\0',sizeof(mbstate_t)); #endif } if (inbuf == NULL) goto invalid; cd->lfuncs.loop_convertn(cd, (const char **)&inbuf, &inbytesleft, outbuf, &outbytesleft, &n, 0); free(cd); /* Done. */ return start - n; invalid: errno = EINVAL; return -1; } char * dec_mbsdup(char **src, const char *fromcode) { return dec_mbsndup(src, -1, -1, -1, fromcode); } char * dec_mbsndup(char **src, size_t sn, size_t dn, int wn, const char *fromcode) { size_t n; char *dst; if ((n = dec_mbsncpy(src, sn, NULL, dn, wn, fromcode)) == (size_t)-1) { return NULL; } if ((dst = malloc(n)) == NULL) { return NULL; } if (dec_mbsncpy(src, sn, dst, dn, wn, fromcode) == (size_t)-1) { free(dst); return NULL; } return dst; } size_t dec_mbscpy(char **inbuf, char *outbuf, const char *fromcode) { return dec_mbsncpy(inbuf, INT_MAX, outbuf, INT_MAX, INT_MAX, fromcode); } size_t dec_mbsncpy(char **inbuf, size_t inbytesleft, char *outbuf, size_t outbytesleft, int n, const char *fromcode) { struct conv_struct * cd; char buf[MAX_WORD_LENGTH+10+1]; const char* cp, *tocode; char* bp, *start; const struct alias * ap; unsigned int count; unsigned int from_index; int from_wchar; unsigned int to_index; int to_wchar; int transliterate = 0; size_t r; if (inbytesleft > INT_MAX) { inbytesleft = INT_MAX; } if (outbytesleft > INT_MAX) { outbytesleft = INT_MAX; } if (n < 0) { n = INT_MAX; } /* Before calling aliases_lookup, convert the input string to upper case, * and check whether it's entirely ASCII (we call gperf with option "-7" * to achieve a smaller table) and non-empty. If it's not entirely ASCII, * or if it's too long, it is not a valid encoding name. 
*/ for (from_wchar = 0;;) { /* Search fromcode in the table. */ for (cp = fromcode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) { unsigned char c = * (unsigned char *) cp; if (c >= 0x80) goto invalid; if (c >= 'a' && c <= 'z') c -= 'a'-'A'; *bp = c; if (c == '\0') break; if (--count == 0) goto invalid; } if (bp-buf > 10 && memcmp(bp-10,"//TRANSLIT",10)==0) { bp -= 10; *bp = '\0'; } ap = aliases_lookup(buf,bp-buf); if (ap == NULL) { ap = aliases2_lookup(buf); if (ap == NULL) goto invalid; } if (ap->encoding_index == ei_local_char) { fromcode = locale_charset(); continue; } if (ap->encoding_index == ei_local_wchar_t) { #if __STDC_ISO_10646__ if (sizeof(wchar_t) == 4) { from_index = ei_ucs4internal; break; } if (sizeof(wchar_t) == 2) { from_index = ei_ucs2internal; break; } if (sizeof(wchar_t) == 1) { from_index = ei_iso8859_1; break; } #endif #if HAVE_WCRTOMB from_wchar = 1; fromcode = locale_charset(); continue; #endif goto invalid; } from_index = ap->encoding_index; break; } tocode = locale_charset(); ap = aliases_lookup(tocode, strlen(tocode)); if (ap == NULL) { ap = aliases2_lookup(tocode); if (ap == NULL) { goto invalid; } } to_wchar = 0; to_index = ap->encoding_index; cd = (struct conv_struct *) malloc(from_wchar != to_wchar ? sizeof(struct wchar_conv_struct) : sizeof(struct conv_struct)); if (cd == NULL) { errno = ENOMEM; return -1; } cd->iindex = from_index; cd->ifuncs = all_encodings[from_index].ifuncs; cd->oindex = to_index; cd->ofuncs = all_encodings[to_index].ofuncs; cd->oflags = all_encodings[to_index].oflags; /* Initialize the loop functions. */ #if HAVE_WCRTOMB if (from_wchar) { cd->lfuncs.loop_convertn = wchar_from_loop_convertn; cd->lfuncs.loop_reset = wchar_from_loop_reset; } else #endif { cd->lfuncs.loop_convertn = unicode_loop_convertn; cd->lfuncs.loop_reset = unicode_loop_reset; } /* Initialize the states. */ memset(&cd->istate,'\0',sizeof(state_t)); memset(&cd->ostate,'\0',sizeof(state_t)); /* Initialize the operation flags. 
*/ cd->transliterate = transliterate; /* Initialize additional fields. */ if (from_wchar != to_wchar) { struct wchar_conv_struct * wcd = (struct wchar_conv_struct *) cd; #if HAVE_MBSTATE > 0 memset(&wcd->state,'\0',sizeof(mbstate_t)); #endif } start = outbuf; r = cd->lfuncs.loop_convertn(cd, (const char **)inbuf, &inbytesleft, &outbuf, &outbytesleft, &n, 1); free(cd); /* Done. */ return r == (size_t)-1 ? -1 : outbuf - start; invalid: errno = EINVAL; return -1; }