| /* |
| * encoding.c : implements the encoding conversion functions needed for XML |
| * |
| * Related specs: |
| * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies |
| * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau |
| * [ISO-10646] UTF-8 and UTF-16 in Annexes |
| * [ISO-8859-1] ISO Latin-1 characters codes. |
| * [UNICODE] The Unicode Consortium, "The Unicode Standard -- |
| * Worldwide Character Encoding -- Version 1.0", Addison- |
| * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is |
| * described in Unicode Technical Report #4. |
| * [US-ASCII] Coded Character Set--7-bit American Standard Code for |
| * Information Interchange, ANSI X3.4-1986. |
| * |
| * See Copyright for the status of this software. |
| * |
| * daniel@veillard.com |
| * |
| * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org> |
| */ |
| |
| #define IN_LIBXML |
| #include "libxml.h" |
| |
| #include <string.h> |
| #include <limits.h> |
| #include <ctype.h> |
| #include <stdlib.h> |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| #include <iconv.h> |
| #include <errno.h> |
| #endif |
| |
| #include <libxml/encoding.h> |
| #include <libxml/xmlmemory.h> |
| #include <libxml/parser.h> |
| #ifdef LIBXML_HTML_ENABLED |
| #include <libxml/HTMLparser.h> |
| #endif |
| #include <libxml/xmlerror.h> |
| |
| #include "private/buf.h" |
| #include "private/enc.h" |
| #include "private/entities.h" |
| #include "private/error.h" |
| |
| #ifdef LIBXML_ICU_ENABLED |
| #include <unicode/ucnv.h> |
| #endif |
| |
/*
 * Flag value stored in the "flags" member of a handler. In this file it
 * is set on every defaultHandlers entry and on handlers created by
 * xmlNewCharEncodingHandler.
 */
#define XML_HANDLER_STATIC 1

/*
 * One registered alias. The alias string is stored upper-cased by
 * xmlAddEncodingAlias; both strings are owned by the table.
 */
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
struct _xmlCharEncodingAlias {
    const char *name;   /* encoding name the alias resolves to */
    const char *alias;  /* the alias itself */
};

/* Alias table: storage, number of entries in use, allocated capacity. */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;

/* Non-zero on little-endian hosts; set by xmlInitEncodingInternal. */
static int xmlLittleEndian = 1;
| |
| typedef struct { |
| const char *name; |
| xmlCharEncoding enc; |
| } xmlEncTableEntry; |
| |
| static const xmlEncTableEntry xmlEncTable[] = { |
| { "ASCII", XML_CHAR_ENCODING_ASCII }, |
| { "EUC-JP", XML_CHAR_ENCODING_EUC_JP }, |
| { "HTML", XML_CHAR_ENCODING_HTML }, |
| { "ISO LATIN 1", XML_CHAR_ENCODING_8859_1 }, |
| { "ISO LATIN 2", XML_CHAR_ENCODING_8859_2 }, |
| { "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 }, |
| { "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE }, |
| { "ISO-2022-JP", XML_CHAR_ENCODING_2022_JP }, |
| { "ISO-8859-1", XML_CHAR_ENCODING_8859_1 }, |
| { "ISO-8859-10", XML_CHAR_ENCODING_8859_10 }, |
| { "ISO-8859-11", XML_CHAR_ENCODING_8859_11 }, |
| { "ISO-8859-13", XML_CHAR_ENCODING_8859_13 }, |
| { "ISO-8859-14", XML_CHAR_ENCODING_8859_14 }, |
| { "ISO-8859-15", XML_CHAR_ENCODING_8859_15 }, |
| { "ISO-8859-16", XML_CHAR_ENCODING_8859_16 }, |
| { "ISO-8859-2", XML_CHAR_ENCODING_8859_2 }, |
| { "ISO-8859-3", XML_CHAR_ENCODING_8859_3 }, |
| { "ISO-8859-4", XML_CHAR_ENCODING_8859_4 }, |
| { "ISO-8859-5", XML_CHAR_ENCODING_8859_5 }, |
| { "ISO-8859-6", XML_CHAR_ENCODING_8859_6 }, |
| { "ISO-8859-7", XML_CHAR_ENCODING_8859_7 }, |
| { "ISO-8859-8", XML_CHAR_ENCODING_8859_8 }, |
| { "ISO-8859-9", XML_CHAR_ENCODING_8859_9 }, |
| { "ISO-LATIN-1", XML_CHAR_ENCODING_8859_1 }, |
| { "ISO-LATIN-2", XML_CHAR_ENCODING_8859_2 }, |
| { "SHIFT_JIS", XML_CHAR_ENCODING_SHIFT_JIS }, |
| { "UCS-2", XML_CHAR_ENCODING_UCS2 }, |
| { "UCS-4", XML_CHAR_ENCODING_UCS4LE }, |
| { "UCS2", XML_CHAR_ENCODING_UCS2 }, |
| { "UCS4", XML_CHAR_ENCODING_UCS4LE }, |
| { "US-ASCII", XML_CHAR_ENCODING_ASCII }, |
| { "UTF-16", XML_CHAR_ENCODING_UTF16 }, |
| { "UTF-16BE", XML_CHAR_ENCODING_UTF16BE }, |
| { "UTF-16LE", XML_CHAR_ENCODING_UTF16LE }, |
| { "UTF-8", XML_CHAR_ENCODING_UTF8 }, |
| { "UTF16", XML_CHAR_ENCODING_UTF16 }, |
| { "UTF8", XML_CHAR_ENCODING_UTF8 } |
| }; |
| |
/*
 * Forward declarations of the built-in converters referenced by the
 * defaultHandlers table. All share the same signature: output buffer and
 * its length, input buffer and its length, and an opaque context.
 */
static int asciiToAscii(unsigned char *out, int *outlen,
                        const unsigned char *in, int *inlen, void *vctxt);
static int UTF8ToUTF8(unsigned char *out, int *outlen,
                      const unsigned char *inb, int *inlenb, void *vctxt);
static int latin1ToUTF8(unsigned char *out, int *outlen,
                        const unsigned char *in, int *inlen, void *vctxt);
static int UTF16LEToUTF8(unsigned char *out, int *outlen,
                         const unsigned char *inb, int *inlenb, void *vctxt);
static int UTF16BEToUTF8(unsigned char *out, int *outlen,
                         const unsigned char *inb, int *inlenb, void *vctxt);

#ifdef LIBXML_OUTPUT_ENABLED

/* Converters from UTF-8, only built when output support is enabled. */
static int UTF8ToLatin1(unsigned char *outb, int *outlen,
                        const unsigned char *in, int *inlen, void *vctxt);
static int UTF8ToUTF16(unsigned char *outb, int *outlen,
                       const unsigned char *in, int *inlen, void *vctxt);
static int UTF8ToUTF16LE(unsigned char *outb, int *outlen,
                         const unsigned char *in, int *inlen, void *vctxt);
static int UTF8ToUTF16BE(unsigned char *outb, int *outlen,
                         const unsigned char *in, int *inlen, void *vctxt);

#else /* LIBXML_OUTPUT_ENABLED */

/* Without output support the table slots are simply left empty. */
#define UTF8ToLatin1 NULL
#define UTF8ToUTF16 NULL
#define UTF8ToUTF16LE NULL
#define UTF8ToUTF16BE NULL

#endif /* LIBXML_OUTPUT_ENABLED */

#if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED)
static int UTF8ToHtmlWrapper(unsigned char *out, int *outlen,
                             const unsigned char *in, int *inlen,
                             void *vctxt);
#else
#define UTF8ToHtmlWrapper NULL
#endif
| |
/*
 * Initializers for the two iconv descriptor members of a handler. They
 * expand to nothing when iconv support is not compiled in. Note the
 * leading comma: the macro is appended directly after the output member.
 */
#ifdef LIBXML_ICONV_ENABLED
#define EMPTY_ICONV , (iconv_t) 0, (iconv_t) 0
#else
#define EMPTY_ICONV
#endif

#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \
    defined(LIBXML_ISO8859X_ENABLED)

#include "iso8859x.inc"

static int
ISO8859xToUTF8(unsigned char* out, int *outlen,
               const unsigned char* in, int *inlen, void *vctxt);
static int
UTF8ToISO8859x(unsigned char *out, int *outlen,
               const unsigned char *in, int *inlen, void *vctxt);

/*
 * Built-in ISO-8859-n handler: the generic table-driven converters are
 * used and the per-charset tables from iso8859x.inc are stored in the
 * two members following the iconv slots.
 *
 * The output converter is cast to xmlCharEncodingOutputFunc, the same
 * way MAKE_HANDLER does it.
 */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) ISO8859xToUTF8, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) UTF8ToISO8859x \
      EMPTY_ICONV, \
      (void *) xmlunicodetable_ISO8859_##n, \
      (void *) xmltranscodetable_ISO8859_##n, \
      NULL, XML_HANDLER_STATIC }

#else /* LIBXML_ISO8859X_ENABLED */

/*
 * No built-in ISO-8859-n support (or iconv/ICU is available): the entry
 * only carries the name and has no converters.
 */
#define MAKE_ISO_HANDLER(name, n) \
    { (char *) name, NULL, NULL EMPTY_ICONV, NULL, NULL, NULL, \
      XML_HANDLER_STATIC }

#endif /* LIBXML_ISO8859X_ENABLED */

/*
 * Generic static handler entry with the given input and output
 * converters (either may be NULL). The double cast through
 * void (*)(void) converts between function pointer types.
 */
#define MAKE_HANDLER(name, in, out) \
    { (char *) name, \
      (xmlCharEncodingInputFunc) (void (*)(void)) in, \
      (xmlCharEncodingOutputFunc) (void (*)(void)) out \
      EMPTY_ICONV, NULL, NULL, NULL, XML_HANDLER_STATIC }
| |
| /* |
| * The layout must match enum xmlCharEncoding. |
| * |
| * Names should match the IANA registry if possible: |
| * https://www.iana.org/assignments/character-sets/character-sets.xhtml |
| */ |
| static const xmlCharEncodingHandler defaultHandlers[31] = { |
| MAKE_HANDLER(NULL, NULL, NULL), /* NONE */ |
| MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8), |
| MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE), |
| MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE), |
| MAKE_HANDLER("UCS-4LE", NULL, NULL), |
| MAKE_HANDLER("UCS-4BE", NULL, NULL), |
| MAKE_HANDLER("IBM037", NULL, NULL), |
| MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */ |
| MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */ |
| MAKE_HANDLER("ISO-10646-UCS-2", NULL, NULL), |
| MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1), |
| MAKE_ISO_HANDLER("ISO-8859-2", 2), |
| MAKE_ISO_HANDLER("ISO-8859-3", 3), |
| MAKE_ISO_HANDLER("ISO-8859-4", 4), |
| MAKE_ISO_HANDLER("ISO-8859-5", 5), |
| MAKE_ISO_HANDLER("ISO-8859-6", 6), |
| MAKE_ISO_HANDLER("ISO-8859-7", 7), |
| MAKE_ISO_HANDLER("ISO-8859-8", 8), |
| MAKE_ISO_HANDLER("ISO-8859-9", 9), |
| MAKE_HANDLER("ISO-2022-JP", NULL, NULL), |
| MAKE_HANDLER("Shift_JIS", NULL, NULL), |
| MAKE_HANDLER("EUC-JP", NULL, NULL), |
| MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii), |
| MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16), |
| MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper), |
| MAKE_ISO_HANDLER("ISO-8859-10", 10), |
| MAKE_ISO_HANDLER("ISO-8859-11", 11), |
| MAKE_ISO_HANDLER("ISO-8859-13", 13), |
| MAKE_ISO_HANDLER("ISO-8859-14", 14), |
| MAKE_ISO_HANDLER("ISO-8859-15", 15), |
| MAKE_ISO_HANDLER("ISO-8859-16", 16), |
| }; |
| |
| #define NUM_DEFAULT_HANDLERS \ |
| (sizeof(defaultHandlers) / sizeof(defaultHandlers[0])) |
| |
| /* the size should be growable, but it's not a big deal ... */ |
| #define MAX_ENCODING_HANDLERS 50 |
| static xmlCharEncodingHandlerPtr *globalHandlers = NULL; |
| static int nbCharEncodingHandler = 0; |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| static int |
| xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv); |
| #endif |
| |
| #ifdef LIBXML_ICU_ENABLED |
| static int |
| xmlCharEncUconv(void *vctxt, const char *name, xmlCharEncConverter *conv); |
| #endif |
| |
| /************************************************************************ |
| * * |
| * Generic encoding handling routines * |
| * * |
| ************************************************************************/ |
| |
| /** |
| * xmlDetectCharEncoding: |
| * @in: a pointer to the first bytes of the XML entity, must be at least |
| * 2 bytes long (at least 4 if encoding is UTF4 variant). |
| * @len: pointer to the length of the buffer |
| * |
| * Guess the encoding of the entity using the first bytes of the entity content |
| * according to the non-normative appendix F of the XML-1.0 recommendation. |
| * |
| * Returns one of the XML_CHAR_ENCODING_... values. |
| */ |
| xmlCharEncoding |
| xmlDetectCharEncoding(const unsigned char* in, int len) |
| { |
| if (in == NULL) |
| return(XML_CHAR_ENCODING_NONE); |
| if (len >= 4) { |
| if ((in[0] == 0x00) && (in[1] == 0x00) && |
| (in[2] == 0x00) && (in[3] == 0x3C)) |
| return(XML_CHAR_ENCODING_UCS4BE); |
| if ((in[0] == 0x3C) && (in[1] == 0x00) && |
| (in[2] == 0x00) && (in[3] == 0x00)) |
| return(XML_CHAR_ENCODING_UCS4LE); |
| if ((in[0] == 0x00) && (in[1] == 0x00) && |
| (in[2] == 0x3C) && (in[3] == 0x00)) |
| return(XML_CHAR_ENCODING_UCS4_2143); |
| if ((in[0] == 0x00) && (in[1] == 0x3C) && |
| (in[2] == 0x00) && (in[3] == 0x00)) |
| return(XML_CHAR_ENCODING_UCS4_3412); |
| if ((in[0] == 0x4C) && (in[1] == 0x6F) && |
| (in[2] == 0xA7) && (in[3] == 0x94)) |
| return(XML_CHAR_ENCODING_EBCDIC); |
| if ((in[0] == 0x3C) && (in[1] == 0x3F) && |
| (in[2] == 0x78) && (in[3] == 0x6D)) |
| return(XML_CHAR_ENCODING_UTF8); |
| /* |
| * Although not part of the recommendation, we also |
| * attempt an "auto-recognition" of UTF-16LE and |
| * UTF-16BE encodings. |
| */ |
| if ((in[0] == 0x3C) && (in[1] == 0x00) && |
| (in[2] == 0x3F) && (in[3] == 0x00)) |
| return(XML_CHAR_ENCODING_UTF16LE); |
| if ((in[0] == 0x00) && (in[1] == 0x3C) && |
| (in[2] == 0x00) && (in[3] == 0x3F)) |
| return(XML_CHAR_ENCODING_UTF16BE); |
| } |
| if (len >= 3) { |
| /* |
| * Errata on XML-1.0 June 20 2001 |
| * We now allow an UTF8 encoded BOM |
| */ |
| if ((in[0] == 0xEF) && (in[1] == 0xBB) && |
| (in[2] == 0xBF)) |
| return(XML_CHAR_ENCODING_UTF8); |
| } |
| /* For UTF-16 we can recognize by the BOM */ |
| if (len >= 2) { |
| if ((in[0] == 0xFE) && (in[1] == 0xFF)) |
| return(XML_CHAR_ENCODING_UTF16BE); |
| if ((in[0] == 0xFF) && (in[1] == 0xFE)) |
| return(XML_CHAR_ENCODING_UTF16LE); |
| } |
| return(XML_CHAR_ENCODING_NONE); |
| } |
| |
| /** |
| * xmlCleanupEncodingAliases: |
| * |
| * DEPRECATED: This function modifies global state and is not |
| * thread-safe. |
| * |
| * Unregisters all aliases |
| */ |
| void |
| xmlCleanupEncodingAliases(void) { |
| int i; |
| |
| if (xmlCharEncodingAliases == NULL) |
| return; |
| |
| for (i = 0;i < xmlCharEncodingAliasesNb;i++) { |
| if (xmlCharEncodingAliases[i].name != NULL) |
| xmlFree((char *) xmlCharEncodingAliases[i].name); |
| if (xmlCharEncodingAliases[i].alias != NULL) |
| xmlFree((char *) xmlCharEncodingAliases[i].alias); |
| } |
| xmlCharEncodingAliasesNb = 0; |
| xmlCharEncodingAliasesMax = 0; |
| xmlFree(xmlCharEncodingAliases); |
| xmlCharEncodingAliases = NULL; |
| } |
| |
| /** |
| * xmlGetEncodingAlias: |
| * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) |
| * |
| * DEPRECATED: This function is not thread-safe. |
| * |
| * Lookup an encoding name for the given alias. |
| * |
| * Returns NULL if not found, otherwise the original name |
| */ |
| const char * |
| xmlGetEncodingAlias(const char *alias) { |
| int i; |
| char upper[100]; |
| |
| if (alias == NULL) |
| return(NULL); |
| |
| if (xmlCharEncodingAliases == NULL) |
| return(NULL); |
| |
| for (i = 0;i < 99;i++) { |
| upper[i] = (char) toupper((unsigned char) alias[i]); |
| if (upper[i] == 0) break; |
| } |
| upper[i] = 0; |
| |
| /* |
| * Walk down the list looking for a definition of the alias |
| */ |
| for (i = 0;i < xmlCharEncodingAliasesNb;i++) { |
| if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) { |
| return(xmlCharEncodingAliases[i].name); |
| } |
| } |
| return(NULL); |
| } |
| |
| /** |
| * xmlAddEncodingAlias: |
| * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) |
| * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) |
| * |
| * DEPRECATED: This function modifies global state and is not |
| * thread-safe. |
| * |
| * Registers an alias @alias for an encoding named @name. Existing alias |
| * will be overwritten. |
| * |
| * Returns 0 in case of success, -1 in case of error |
| */ |
| int |
| xmlAddEncodingAlias(const char *name, const char *alias) { |
| int i; |
| char upper[100]; |
| char *nameCopy, *aliasCopy; |
| |
| if ((name == NULL) || (alias == NULL)) |
| return(-1); |
| |
| for (i = 0;i < 99;i++) { |
| upper[i] = (char) toupper((unsigned char) alias[i]); |
| if (upper[i] == 0) break; |
| } |
| upper[i] = 0; |
| |
| if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) { |
| xmlCharEncodingAliasPtr tmp; |
| size_t newSize = xmlCharEncodingAliasesMax ? |
| xmlCharEncodingAliasesMax * 2 : |
| 20; |
| |
| tmp = (xmlCharEncodingAliasPtr) |
| xmlRealloc(xmlCharEncodingAliases, |
| newSize * sizeof(xmlCharEncodingAlias)); |
| if (tmp == NULL) |
| return(-1); |
| xmlCharEncodingAliases = tmp; |
| xmlCharEncodingAliasesMax = newSize; |
| } |
| |
| /* |
| * Walk down the list looking for a definition of the alias |
| */ |
| for (i = 0;i < xmlCharEncodingAliasesNb;i++) { |
| if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) { |
| /* |
| * Replace the definition. |
| */ |
| nameCopy = xmlMemStrdup(name); |
| if (nameCopy == NULL) |
| return(-1); |
| xmlFree((char *) xmlCharEncodingAliases[i].name); |
| xmlCharEncodingAliases[i].name = nameCopy; |
| return(0); |
| } |
| } |
| /* |
| * Add the definition |
| */ |
| nameCopy = xmlMemStrdup(name); |
| if (nameCopy == NULL) |
| return(-1); |
| aliasCopy = xmlMemStrdup(upper); |
| if (aliasCopy == NULL) { |
| xmlFree(nameCopy); |
| return(-1); |
| } |
| xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy; |
| xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy; |
| xmlCharEncodingAliasesNb++; |
| return(0); |
| } |
| |
| /** |
| * xmlDelEncodingAlias: |
| * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) |
| * |
| * DEPRECATED: This function modifies global state and is not |
| * thread-safe. |
| * |
| * Unregisters an encoding alias @alias |
| * |
| * Returns 0 in case of success, -1 in case of error |
| */ |
| int |
| xmlDelEncodingAlias(const char *alias) { |
| int i; |
| |
| if (alias == NULL) |
| return(-1); |
| |
| if (xmlCharEncodingAliases == NULL) |
| return(-1); |
| /* |
| * Walk down the list looking for a definition of the alias |
| */ |
| for (i = 0;i < xmlCharEncodingAliasesNb;i++) { |
| if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) { |
| xmlFree((char *) xmlCharEncodingAliases[i].name); |
| xmlFree((char *) xmlCharEncodingAliases[i].alias); |
| xmlCharEncodingAliasesNb--; |
| memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1], |
| sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i)); |
| return(0); |
| } |
| } |
| return(-1); |
| } |
| |
| static int |
| xmlCompareEncTableEntries(const void *vkey, const void *ventry) { |
| const char *key = vkey; |
| const xmlEncTableEntry *entry = ventry; |
| |
| return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name)); |
| } |
| |
| static xmlCharEncoding |
| xmlParseCharEncodingInternal(const char *name) |
| { |
| const xmlEncTableEntry *entry; |
| |
| if (name == NULL) |
| return(XML_CHAR_ENCODING_NONE); |
| |
| entry = bsearch(name, xmlEncTable, |
| sizeof(xmlEncTable) / sizeof(xmlEncTable[0]), |
| sizeof(xmlEncTable[0]), xmlCompareEncTableEntries); |
| if (entry != NULL) |
| return(entry->enc); |
| |
| return(XML_CHAR_ENCODING_ERROR); |
| } |
| |
| /** |
| * xmlParseCharEncoding: |
| * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) |
| * |
| * Compare the string to the encoding schemes already known. Note |
| * that the comparison is case insensitive accordingly to the section |
| * [XML] 4.3.3 Character Encoding in Entities. |
| * |
| * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE |
| * if not recognized. |
| */ |
| xmlCharEncoding |
| xmlParseCharEncoding(const char *name) |
| { |
| xmlCharEncoding enc = xmlParseCharEncodingInternal(name); |
| |
| /* Backward compatibility */ |
| if (enc == XML_CHAR_ENCODING_UTF16) |
| enc = XML_CHAR_ENCODING_UTF16LE; |
| |
| return(enc); |
| } |
| |
| /** |
| * xmlGetCharEncodingName: |
| * @enc: the encoding |
| * |
| * The "canonical" name for XML encoding. |
| * C.f. http://www.w3.org/TR/REC-xml#charencoding |
| * Section 4.3.3 Character Encoding in Entities |
| * |
| * Returns the canonical name for the given encoding |
| */ |
| const char* |
| xmlGetCharEncodingName(xmlCharEncoding enc) { |
| switch (enc) { |
| case XML_CHAR_ENCODING_UTF16LE: |
| return("UTF-16"); |
| case XML_CHAR_ENCODING_UTF16BE: |
| return("UTF-16"); |
| case XML_CHAR_ENCODING_UCS4LE: |
| return("ISO-10646-UCS-4"); |
| case XML_CHAR_ENCODING_UCS4BE: |
| return("ISO-10646-UCS-4"); |
| default: |
| break; |
| } |
| |
| if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS)) |
| return(NULL); |
| |
| return(defaultHandlers[enc].name); |
| } |
| |
| /************************************************************************ |
| * * |
| * Char encoding handlers * |
| * * |
| ************************************************************************/ |
| |
| /** |
| * xmlNewCharEncodingHandler: |
| * @name: the encoding name, in UTF-8 format (ASCII actually) |
| * @input: the xmlCharEncodingInputFunc to read that encoding |
| * @output: the xmlCharEncodingOutputFunc to write that encoding |
| * |
| * DEPRECATED: This function modifies global state and is not |
| * thread-safe. |
| * |
| * Create and registers an xmlCharEncodingHandler. |
| * |
| * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error). |
| */ |
| xmlCharEncodingHandlerPtr |
| xmlNewCharEncodingHandler(const char *name, |
| xmlCharEncodingInputFunc input, |
| xmlCharEncodingOutputFunc output) { |
| xmlCharEncodingHandlerPtr handler; |
| const char *alias; |
| char upper[500]; |
| int i; |
| char *up = NULL; |
| |
| /* |
| * Do the alias resolution |
| */ |
| alias = xmlGetEncodingAlias(name); |
| if (alias != NULL) |
| name = alias; |
| |
| /* |
| * Keep only the uppercase version of the encoding. |
| */ |
| if (name == NULL) |
| return(NULL); |
| for (i = 0;i < 499;i++) { |
| upper[i] = (char) toupper((unsigned char) name[i]); |
| if (upper[i] == 0) break; |
| } |
| upper[i] = 0; |
| up = xmlMemStrdup(upper); |
| if (up == NULL) |
| return(NULL); |
| |
| /* |
| * allocate and fill-up an handler block. |
| */ |
| handler = (xmlCharEncodingHandlerPtr) |
| xmlMalloc(sizeof(xmlCharEncodingHandler)); |
| if (handler == NULL) { |
| xmlFree(up); |
| return(NULL); |
| } |
| memset(handler, 0, sizeof(xmlCharEncodingHandler)); |
| handler->input = input; |
| handler->output = output; |
| handler->name = up; |
| handler->flags = XML_HANDLER_STATIC; |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| handler->iconv_in = NULL; |
| handler->iconv_out = NULL; |
| #endif |
| |
| /* |
| * registers and returns the handler. |
| */ |
| xmlRegisterCharEncodingHandler(handler); |
| return(handler); |
| } |
| |
/**
 * xmlInitCharEncodingHandlers:
 *
 * DEPRECATED: Alias for xmlInitParser.
 *
 * Kept for compatibility, simply forwards to xmlInitParser.
 */
void
xmlInitCharEncodingHandlers(void)
{
    xmlInitParser();
}
| |
| /** |
| * xmlInitEncodingInternal: |
| * |
| * Initialize the char encoding support. |
| */ |
| void |
| xmlInitEncodingInternal(void) { |
| unsigned short int tst = 0x1234; |
| unsigned char *ptr = (unsigned char *) &tst; |
| |
| if (*ptr == 0x12) xmlLittleEndian = 0; |
| else xmlLittleEndian = 1; |
| } |
| |
| /** |
| * xmlCleanupCharEncodingHandlers: |
| * |
| * DEPRECATED: This function will be made private. Call xmlCleanupParser |
| * to free global state but see the warnings there. xmlCleanupParser |
| * should be only called once at program exit. In most cases, you don't |
| * have call cleanup functions at all. |
| * |
| * Cleanup the memory allocated for the char encoding support, it |
| * unregisters all the encoding handlers and the aliases. |
| */ |
| void |
| xmlCleanupCharEncodingHandlers(void) { |
| xmlCleanupEncodingAliases(); |
| |
| if (globalHandlers == NULL) return; |
| |
| for (;nbCharEncodingHandler > 0;) { |
| xmlCharEncodingHandler *handler; |
| |
| nbCharEncodingHandler--; |
| handler = globalHandlers[nbCharEncodingHandler]; |
| if (handler != NULL) { |
| if (handler->name != NULL) |
| xmlFree(handler->name); |
| xmlFree(handler); |
| } |
| } |
| xmlFree(globalHandlers); |
| globalHandlers = NULL; |
| nbCharEncodingHandler = 0; |
| } |
| |
| /** |
| * xmlRegisterCharEncodingHandler: |
| * @handler: the xmlCharEncodingHandlerPtr handler block |
| * |
| * DEPRECATED: This function modifies global state and is not |
| * thread-safe. |
| * |
| * Register the char encoding handler. |
| */ |
| void |
| xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { |
| if (handler == NULL) |
| return; |
| if (globalHandlers == NULL) { |
| globalHandlers = xmlMalloc( |
| MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0])); |
| if (globalHandlers == NULL) |
| goto free_handler; |
| } |
| |
| if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) |
| goto free_handler; |
| globalHandlers[nbCharEncodingHandler++] = handler; |
| return; |
| |
| free_handler: |
| if (handler != NULL) { |
| if (handler->name != NULL) { |
| xmlFree(handler->name); |
| } |
| xmlFree(handler); |
| } |
| } |
| |
| static int |
| xmlInvokeConvImpl(xmlCharEncConvImpl impl, void *implCtxt, |
| const char *name, xmlCharEncodingHandler *handler) { |
| xmlCharEncConverter conv = { NULL, NULL, NULL, NULL, NULL }; |
| int ret; |
| |
| ret = impl(implCtxt, name, &conv); |
| |
| if (ret == XML_ERR_OK) { |
| handler->input = |
| (xmlCharEncodingInputFunc) (void (*)(void)) conv.input; |
| handler->output = |
| (xmlCharEncodingOutputFunc) (void (*)(void)) conv.output; |
| handler->ctxtDtor = conv.ctxtDtor; |
| handler->inputCtxt = conv.inputCtxt; |
| handler->outputCtxt = conv.outputCtxt; |
| } |
| |
| return(ret); |
| } |
| |
| /** |
| * xmlFindExtraHandler: |
| * @norig: name of the char encoding |
| * @name: potentially aliased name of the encoding |
| * @output: boolean, use handler for output |
| * @impl: a conversion implementation (optional) |
| * @implCtxt: user data for conversion implementation (optional) |
| * @out: pointer to resulting handler |
| * |
| * Search the non-default handlers for an exact match. |
| * |
| * Returns an xmlParserErrors error code. |
| */ |
| static int |
| xmlFindExtraHandler(const char *norig, const char *name, int output, |
| xmlCharEncConvImpl impl, void *implCtxt, |
| xmlCharEncodingHandler **out) { |
| xmlCharEncodingHandler *handler; |
| int ret; |
| int i; |
| |
| handler = xmlMalloc(sizeof(*handler)); |
| if (handler == NULL) |
| return(XML_ERR_NO_MEMORY); |
| memset(handler, 0, sizeof(*handler)); |
| |
| handler->name = xmlMemStrdup(name); |
| if (handler->name == NULL) { |
| ret = XML_ERR_NO_MEMORY; |
| goto done; |
| } |
| |
| /* |
| * Try custom implementation before deprecated global handlers. |
| * |
| * Note that we pass the original name without deprecated |
| * alias resolution. |
| */ |
| if (impl != NULL) { |
| ret = xmlInvokeConvImpl(impl, implCtxt, norig, handler); |
| if (ret != XML_ERR_OK) |
| goto done; |
| |
| *out = handler; |
| return(XML_ERR_OK); |
| } |
| |
| /* |
| * Deprecated |
| */ |
| if (globalHandlers != NULL) { |
| for (i = 0; i < nbCharEncodingHandler; i++) { |
| xmlCharEncodingHandler *h = globalHandlers[i]; |
| |
| if (!xmlStrcasecmp((const xmlChar *) name, |
| (const xmlChar *) h->name)) { |
| if ((output ? h->output : h->input) != NULL) { |
| *out = h; |
| ret = XML_ERR_OK; |
| goto done; |
| } |
| } |
| } |
| } |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| ret = xmlInvokeConvImpl(xmlCharEncIconv, handler, name, handler); |
| if (ret == XML_ERR_OK) { |
| *out = handler; |
| return(XML_ERR_OK); |
| } |
| if (ret != XML_ERR_UNSUPPORTED_ENCODING) |
| goto done; |
| #endif /* LIBXML_ICONV_ENABLED */ |
| |
| #ifdef LIBXML_ICU_ENABLED |
| ret = xmlInvokeConvImpl(xmlCharEncUconv, handler, name, handler); |
| if (ret == XML_ERR_OK) { |
| *out = handler; |
| return(XML_ERR_OK); |
| } |
| if (ret != XML_ERR_UNSUPPORTED_ENCODING) |
| goto done; |
| #endif /* LIBXML_ICU_ENABLED */ |
| |
| ret = XML_ERR_UNSUPPORTED_ENCODING; |
| |
| done: |
| if (handler != NULL) { |
| xmlFree(handler->name); |
| xmlFree(handler); |
| } |
| |
| return(ret); |
| } |
| |
| /** |
| * xmlLookupCharEncodingHandler: |
| * @enc: an xmlCharEncoding value. |
| * @out: pointer to result |
| * |
| * Find or create a handler matching the encoding. The following |
| * converters are looked up in order: |
| * |
| * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) |
| * - User-registered global handler (deprecated) |
| * - iconv if enabled |
| * - ICU if enabled |
| * |
| * The handler must be closed with xmlCharEncCloseFunc. |
| * |
| * If the encoding is UTF-8, a NULL handler and no error code will |
| * be returned. |
| * |
| * Available since 2.13.0. |
| * |
| * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another |
| * xmlParserErrors error code. |
| */ |
| int |
| xmlLookupCharEncodingHandler(xmlCharEncoding enc, |
| xmlCharEncodingHandler **out) { |
| const xmlCharEncodingHandler *handler; |
| |
| if (out == NULL) |
| return(XML_ERR_ARGUMENT); |
| *out = NULL; |
| |
| if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS)) |
| return(XML_ERR_UNSUPPORTED_ENCODING); |
| |
| /* Return NULL handler for UTF-8 */ |
| if ((enc == XML_CHAR_ENCODING_UTF8) || |
| (enc == XML_CHAR_ENCODING_NONE)) |
| return(XML_ERR_OK); |
| |
| handler = &defaultHandlers[enc]; |
| if ((handler->input != NULL) || (handler->output != NULL)) { |
| *out = (xmlCharEncodingHandler *) handler; |
| return(XML_ERR_OK); |
| } |
| |
| if (handler->name != NULL) |
| return(xmlFindExtraHandler(handler->name, handler->name, 0, |
| NULL, NULL, out)); |
| |
| return(XML_ERR_UNSUPPORTED_ENCODING); |
| } |
| |
| /** |
| * xmlGetCharEncodingHandler: |
| * @enc: an xmlCharEncoding value. |
| * |
| * DEPRECATED: Use xmlLookupCharEncodingHandler which has better error |
| * reporting. |
| * |
| * Returns the handler or NULL if no handler was found or an error |
| * occurred. |
| */ |
| xmlCharEncodingHandlerPtr |
| xmlGetCharEncodingHandler(xmlCharEncoding enc) { |
| xmlCharEncodingHandler *ret; |
| |
| xmlLookupCharEncodingHandler(enc, &ret); |
| return(ret); |
| } |
| |
| /** |
| * xmlCreateCharEncodingHandler: |
| * @name: a string describing the char encoding. |
| * @output: boolean, use handler for output |
| * @impl: a conversion implementation (optional) |
| * @implCtxt: user data for conversion implementation (optional) |
| * @out: pointer to result |
| * |
| * Find or create a handler matching the encoding. The following |
| * converters are looked up in order: |
| * |
| * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) |
| * - Custom implementation if provided |
| * - User-registered global handler (deprecated) |
| * - iconv if enabled |
| * - ICU if enabled |
| * |
| * The handler must be closed with xmlCharEncCloseFunc. |
| * |
| * If the encoding is UTF-8, a NULL handler and no error code will |
| * be returned. |
| * |
| * Available since 2.14.0. |
| * |
| * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another |
| * xmlParserErrors error code. |
| */ |
| int |
| xmlCreateCharEncodingHandler(const char *name, int output, |
| xmlCharEncConvImpl impl, void *implCtxt, |
| xmlCharEncodingHandler **out) { |
| const xmlCharEncodingHandler *handler; |
| const char *norig, *nalias; |
| xmlCharEncoding enc; |
| |
| if (out == NULL) |
| return(XML_ERR_ARGUMENT); |
| *out = NULL; |
| |
| if (name == NULL) |
| return(XML_ERR_ARGUMENT); |
| |
| norig = name; |
| nalias = xmlGetEncodingAlias(name); |
| if (nalias != NULL) |
| name = nalias; |
| |
| enc = xmlParseCharEncodingInternal(name); |
| |
| /* Return NULL handler for UTF-8 */ |
| if (enc == XML_CHAR_ENCODING_UTF8) |
| return(XML_ERR_OK); |
| |
| if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) { |
| handler = &defaultHandlers[enc]; |
| if ((output ? handler->output : handler->input) != NULL) { |
| *out = (xmlCharEncodingHandler *) handler; |
| return(XML_ERR_OK); |
| } |
| } |
| |
| return(xmlFindExtraHandler(norig, name, output, impl, implCtxt, out)); |
| } |
| |
| /** |
| * xmlOpenCharEncodingHandler: |
| * @name: a string describing the char encoding. |
| * @output: boolean, use handler for output |
| * @out: pointer to result |
| * |
| * Find or create a handler matching the encoding. The following |
| * converters are looked up in order: |
| * |
| * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) |
| * - User-registered global handler (deprecated) |
| * - iconv if enabled |
| * - ICU if enabled |
| * |
| * The handler must be closed with xmlCharEncCloseFunc. |
| * |
| * If the encoding is UTF-8, a NULL handler and no error code will |
| * be returned. |
| * |
| * Available since 2.13.0. |
| * |
| * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another |
| * xmlParserErrors error code. |
| */ |
| int |
| xmlOpenCharEncodingHandler(const char *name, int output, |
| xmlCharEncodingHandler **out) { |
| return(xmlCreateCharEncodingHandler(name, output, NULL, NULL, out)); |
| } |
| |
| /** |
| * xmlFindCharEncodingHandler: |
| * @name: a string describing the char encoding. |
| * |
| * DEPRECATED: Use xmlOpenCharEncodingHandler which has better error |
| * reporting. |
| * |
| * If the encoding is UTF-8, this will return a no-op handler that |
| * shouldn't be used. |
| * |
| * Returns the handler or NULL if no handler was found or an error |
| * occurred. |
| */ |
| xmlCharEncodingHandlerPtr |
| xmlFindCharEncodingHandler(const char *name) { |
| xmlCharEncodingHandler *ret; |
| |
| /* |
| * This handler shouldn't be used, but we must return a non-NULL |
| * handler. |
| */ |
| if ((xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF-8") == 0) || |
| (xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF8") == 0)) |
| return((xmlCharEncodingHandlerPtr) |
| &defaultHandlers[XML_CHAR_ENCODING_UTF8]); |
| |
| xmlOpenCharEncodingHandler(name, 0, &ret); |
| return(ret); |
| } |
| |
| /************************************************************************ |
| * * |
| * ICONV based generic conversion functions * |
| * * |
| ************************************************************************/ |
| |
| #ifdef LIBXML_ICONV_ENABLED |
/* Conversion context of the iconv backend: one iconv descriptor. */
typedef struct {
    iconv_t cd; /* descriptor passed to iconv() and iconv_close() */
} xmlIconvCtxt;
| |
| /** |
| * xmlIconvConvert: |
| * @vctxt: conversion context |
| * @out: a pointer to an array of bytes to store the result |
| * @outlen: the length of @out |
| * @in: a pointer to an array of input bytes |
| * @inlen: the length of @in |
| * |
| * Returns an XML_ENC_ERR code. |
| * |
| * The value of @inlen after return is the number of octets consumed |
| * as the return value is positive, else unpredictable. |
| * The value of @outlen after return is the number of octets produced. |
| */ |
| static int |
| xmlIconvConvert(unsigned char *out, int *outlen, |
| const unsigned char *in, int *inlen, void *vctxt) { |
| xmlIconvCtxt *ctxt = vctxt; |
| size_t icv_inlen, icv_outlen; |
| const char *icv_in = (const char *) in; |
| char *icv_out = (char *) out; |
| size_t ret; |
| |
| if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { |
| if (outlen != NULL) *outlen = 0; |
| return(XML_ENC_ERR_INTERNAL); |
| } |
| icv_inlen = *inlen; |
| icv_outlen = *outlen; |
| /* |
| * Some versions take const, other versions take non-const input. |
| */ |
| ret = iconv(ctxt->cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen); |
| *inlen -= icv_inlen; |
| *outlen -= icv_outlen; |
| if (ret == (size_t) -1) { |
| if (errno == EILSEQ) |
| return(XML_ENC_ERR_INPUT); |
| if (errno == E2BIG) |
| return(XML_ENC_ERR_SPACE); |
| /* |
| * EINVAL means a truncated multi-byte sequence at the end |
| * of the input buffer. We treat this as success. |
| */ |
| if (errno == EINVAL) |
| return(XML_ENC_ERR_SUCCESS); |
| return(XML_ENC_ERR_INTERNAL); |
| } |
| return(XML_ENC_ERR_SUCCESS); |
| } |
| |
| static void |
| xmlIconvFree(void *vctxt) { |
| xmlIconvCtxt *ctxt = vctxt; |
| |
| if (ctxt->cd != (iconv_t) -1) |
| iconv_close(ctxt->cd); |
| |
| xmlFree(ctxt); |
| } |
| |
| static int |
| xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv) { |
| xmlCharEncodingHandler *handler = vctxt; |
| xmlIconvCtxt *inputCtxt = NULL, *outputCtxt = NULL; |
| iconv_t icv_in; |
| iconv_t icv_out; |
| int ret; |
| |
| inputCtxt = xmlMalloc(sizeof(xmlIconvCtxt)); |
| if (inputCtxt == NULL) { |
| ret = XML_ERR_NO_MEMORY; |
| goto error; |
| } |
| inputCtxt->cd = (iconv_t) -1; |
| |
| icv_in = iconv_open("UTF-8", name); |
| if (icv_in == (iconv_t) -1) { |
| if (errno == EINVAL) |
| ret = XML_ERR_UNSUPPORTED_ENCODING; |
| else if (errno == ENOMEM) |
| ret = XML_ERR_NO_MEMORY; |
| else |
| ret = XML_ERR_SYSTEM; |
| goto error; |
| } |
| inputCtxt->cd = icv_in; |
| |
| outputCtxt = xmlMalloc(sizeof(xmlIconvCtxt)); |
| if (outputCtxt == NULL) { |
| ret = XML_ERR_NO_MEMORY; |
| goto error; |
| } |
| outputCtxt->cd = (iconv_t) -1; |
| |
| icv_out = iconv_open(name, "UTF-8"); |
| if (icv_out == (iconv_t) -1) { |
| if (errno == EINVAL) |
| ret = XML_ERR_UNSUPPORTED_ENCODING; |
| else if (errno == ENOMEM) |
| ret = XML_ERR_NO_MEMORY; |
| else |
| ret = XML_ERR_SYSTEM; |
| goto error; |
| } |
| outputCtxt->cd = icv_out; |
| |
| conv->input = xmlIconvConvert; |
| conv->output = xmlIconvConvert; |
| conv->ctxtDtor = xmlIconvFree; |
| conv->inputCtxt = inputCtxt; |
| conv->outputCtxt = outputCtxt; |
| |
| /* Backward compatibility */ |
| if (handler != NULL) { |
| handler->iconv_in = icv_in; |
| handler->iconv_out = icv_out; |
| } |
| |
| return(XML_ERR_OK); |
| |
| error: |
| if (inputCtxt != NULL) |
| xmlIconvFree(inputCtxt); |
| if (outputCtxt != NULL) |
| xmlIconvFree(outputCtxt); |
| return(ret); |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| |
| /************************************************************************ |
| * * |
| * ICU based generic conversion functions * |
| * * |
| ************************************************************************/ |
| |
| #ifdef LIBXML_ICU_ENABLED |
| /* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */ |
| #define ICU_PIVOT_BUF_SIZE 1024 |
| |
| typedef struct _uconv_t xmlUconvCtxt; |
| struct _uconv_t { |
| UConverter *uconv; /* for conversion between an encoding and UTF-16 */ |
| UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ |
| UChar *pivot_source; |
| UChar *pivot_target; |
| int isInput; |
| UChar pivot_buf[ICU_PIVOT_BUF_SIZE]; |
| }; |
| |
| /** |
| * xmlUconvConvert: |
| * @vctxt: converison context |
| * @out: a pointer to an array of bytes to store the result |
| * @outlen: the length of @out |
| * @in: a pointer to an array of input bytes |
| * @inlen: the length of @in |
| * |
| * Returns an XML_ENC_ERR code. |
| * |
| * The value of @inlen after return is the number of octets consumed |
| * as the return value is positive, else unpredictable. |
| * The value of @outlen after return is the number of octets produced. |
| */ |
| static int |
| xmlUconvConvert(unsigned char *out, int *outlen, |
| const unsigned char *in, int *inlen, void *vctxt) { |
| xmlUconvCtxt *cd = vctxt; |
| const char *ucv_in = (const char *) in; |
| char *ucv_out = (char *) out; |
| UConverter *target, *source; |
| UErrorCode err = U_ZERO_ERROR; |
| int ret; |
| |
| if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { |
| if (outlen != NULL) |
| *outlen = 0; |
| return(XML_ENC_ERR_INTERNAL); |
| } |
| |
| /* |
| * Note that the ICU API is stateful. It can always consume a certain |
| * amount of input even if the output buffer would overflow. The |
| * remaining input must be processed by calling ucnv_convertEx with a |
| * possibly empty input buffer. |
| * |
| * ucnv_convertEx is always called with reset and flush set to 0, |
| * so we don't mess up the state. This should never generate |
| * U_TRUNCATED_CHAR_FOUND errors. |
| */ |
| if (cd->isInput) { |
| source = cd->uconv; |
| target = cd->utf8; |
| } else { |
| source = cd->utf8; |
| target = cd->uconv; |
| } |
| |
| ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen, |
| &ucv_in, ucv_in + *inlen, cd->pivot_buf, |
| &cd->pivot_source, &cd->pivot_target, |
| cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err); |
| |
| *inlen = ucv_in - (const char*) in; |
| *outlen = ucv_out - (char *) out; |
| |
| if (U_SUCCESS(err)) { |
| ret = XML_ENC_ERR_SUCCESS; |
| } else { |
| switch (err) { |
| case U_TRUNCATED_CHAR_FOUND: |
| /* Shouldn't happen without flush */ |
| ret = XML_ENC_ERR_SUCCESS; |
| break; |
| |
| case U_BUFFER_OVERFLOW_ERROR: |
| ret = XML_ENC_ERR_SPACE; |
| break; |
| |
| case U_INVALID_CHAR_FOUND: |
| case U_ILLEGAL_CHAR_FOUND: |
| case U_ILLEGAL_ESCAPE_SEQUENCE: |
| case U_UNSUPPORTED_ESCAPE_SEQUENCE: |
| ret = XML_ENC_ERR_INPUT; |
| break; |
| |
| case U_MEMORY_ALLOCATION_ERROR: |
| ret = XML_ENC_ERR_MEMORY; |
| break; |
| |
| default: |
| ret = XML_ENC_ERR_INTERNAL; |
| break; |
| } |
| } |
| |
| return(ret); |
| } |
| |
| static int |
| openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out) |
| { |
| UErrorCode status; |
| xmlUconvCtxt *conv; |
| |
| *out = NULL; |
| |
| conv = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt)); |
| if (conv == NULL) |
| return(XML_ERR_NO_MEMORY); |
| |
| conv->isInput = isInput; |
| conv->pivot_source = conv->pivot_buf; |
| conv->pivot_target = conv->pivot_buf; |
| |
| status = U_ZERO_ERROR; |
| conv->uconv = ucnv_open(name, &status); |
| if (U_FAILURE(status)) |
| goto error; |
| |
| status = U_ZERO_ERROR; |
| if (isInput) { |
| ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, |
| NULL, NULL, NULL, &status); |
| } |
| else { |
| ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, |
| NULL, NULL, NULL, &status); |
| } |
| if (U_FAILURE(status)) |
| goto error; |
| |
| status = U_ZERO_ERROR; |
| conv->utf8 = ucnv_open("UTF-8", &status); |
| if (U_FAILURE(status)) |
| goto error; |
| |
| *out = conv; |
| return(0); |
| |
| error: |
| if (conv->uconv) |
| ucnv_close(conv->uconv); |
| xmlFree(conv); |
| |
| if (status == U_FILE_ACCESS_ERROR) |
| return(XML_ERR_UNSUPPORTED_ENCODING); |
| if (status == U_MEMORY_ALLOCATION_ERROR) |
| return(XML_ERR_NO_MEMORY); |
| return(XML_ERR_SYSTEM); |
| } |
| |
| static void |
| closeIcuConverter(xmlUconvCtxt *conv) |
| { |
| if (conv == NULL) |
| return; |
| ucnv_close(conv->uconv); |
| ucnv_close(conv->utf8); |
| xmlFree(conv); |
| } |
| |
| static void |
| xmlUconvFree(void *vctxt) { |
| closeIcuConverter(vctxt); |
| } |
| |
| static int |
| xmlCharEncUconv(void *vctxt ATTRIBUTE_UNUSED, const char *name, |
| xmlCharEncConverter *conv) { |
| xmlUconvCtxt *ucv_in = NULL; |
| xmlUconvCtxt *ucv_out = NULL; |
| int ret; |
| |
| ret = openIcuConverter(name, 1, &ucv_in); |
| if (ret != 0) |
| goto error; |
| ret = openIcuConverter(name, 0, &ucv_out); |
| if (ret != 0) |
| goto error; |
| |
| conv->input = xmlUconvConvert; |
| conv->output = xmlUconvConvert; |
| conv->ctxtDtor = xmlUconvFree; |
| conv->inputCtxt = ucv_in; |
| conv->outputCtxt = ucv_out; |
| |
| return(XML_ERR_OK); |
| |
| error: |
| if (ucv_in != NULL) |
| closeIcuConverter(ucv_in); |
| if (ucv_out != NULL) |
| closeIcuConverter(ucv_out); |
| return(ret); |
| } |
| #endif /* LIBXML_ICU_ENABLED */ |
| |
| /************************************************************************ |
| * * |
| * The real API used by libxml for on-the-fly conversion * |
| * * |
| ************************************************************************/ |
| |
| /** |
| * xmlEncConvertError: |
| * @code: XML_ENC_ERR code |
| * |
| * Convert XML_ENC_ERR to libxml2 error codes. |
| */ |
| static int |
| xmlEncConvertError(int code) { |
| int ret; |
| |
| switch (code) { |
| case XML_ENC_ERR_SUCCESS: |
| ret = XML_ERR_OK; |
| break; |
| case XML_ENC_ERR_INPUT: |
| ret = XML_ERR_INVALID_ENCODING; |
| break; |
| case XML_ENC_ERR_MEMORY: |
| ret = XML_ERR_NO_MEMORY; |
| break; |
| default: |
| ret = XML_ERR_INTERNAL_ERROR; |
| break; |
| } |
| |
| return(ret); |
| } |
| |
| /** |
| * xmlEncInputChunk: |
| * @handler: encoding handler |
| * @out: a pointer to an array of bytes to store the result |
| * @outlen: the length of @out |
| * @in: a pointer to an array of input bytes |
| * @inlen: the length of @in |
| * |
| * The value of @inlen after return is the number of octets consumed |
| * as the return value is 0, else unpredictable. |
| * The value of @outlen after return is the number of octets produced. |
| * |
| * Returns an XML_ENC_ERR code. |
| */ |
| int |
| xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, |
| int *outlen, const unsigned char *in, int *inlen) { |
| int ret; |
| |
| if (handler->input != NULL) { |
| xmlCharEncConvFunc conv = |
| (xmlCharEncConvFunc) (void (*)(void)) handler->input; |
| |
| ret = conv(out, outlen, in, inlen, handler->inputCtxt); |
| if (ret > 0) |
| ret = XML_ENC_ERR_SUCCESS; |
| } |
| else { |
| *outlen = 0; |
| *inlen = 0; |
| ret = XML_ENC_ERR_INTERNAL; |
| } |
| |
| return(ret); |
| } |
| |
| /** |
| * xmlEncOutputChunk: |
| * @handler: encoding handler |
| * @out: a pointer to an array of bytes to store the result |
| * @outlen: the length of @out |
| * @in: a pointer to an array of input bytes |
| * @inlen: the length of @in |
| * |
| * Returns an XML_ENC_ERR code. |
| * |
| * The value of @inlen after return is the number of octets consumed |
| * as the return value is 0, else unpredictable. |
| * The value of @outlen after return is the number of octets produced. |
| */ |
| static int |
| xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out, |
| int *outlen, const unsigned char *in, int *inlen) { |
| int ret; |
| |
| if (handler->output != NULL) { |
| xmlCharEncConvFunc conv = |
| (xmlCharEncConvFunc) (void (*)(void)) handler->output; |
| |
| ret = conv(out, outlen, in, inlen, handler->outputCtxt); |
| if (ret > 0) |
| ret = XML_ENC_ERR_SUCCESS; |
| } |
| else { |
| *outlen = 0; |
| *inlen = 0; |
| ret = XML_ENC_ERR_INTERNAL; |
| } |
| |
| return(ret); |
| } |
| |
| /** |
| * xmlCharEncFirstLine: |
| * @handler: char encoding transformation data structure |
| * @out: an xmlBuffer for the output. |
| * @in: an xmlBuffer for the input |
| * |
| * DEPERECATED: Don't use. |
| * |
| * Returns the number of bytes written or an XML_ENC_ERR code. |
| */ |
| int |
| xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
| xmlBufferPtr in) { |
| return(xmlCharEncInFunc(handler, out, in)); |
| } |
| |
| /** |
| * xmlCharEncInput: |
| * @input: a parser input buffer |
| * @sizeOut: pointer to output size |
| * |
| * @sizeOut should be set to the maximum output size (or SIZE_MAX). |
| * After return, it is set to the number of bytes written. |
| * |
| * Generic front-end for the encoding handler on parser input |
| * |
| * Returns an XML_ENC_ERR code. |
| */ |
| int |
| xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut) |
| { |
| xmlBufPtr out, in; |
| const xmlChar *dataIn; |
| size_t availIn; |
| size_t maxOut; |
| size_t totalIn, totalOut; |
| int ret; |
| |
| out = input->buffer; |
| in = input->raw; |
| |
| maxOut = *sizeOut; |
| totalOut = 0; |
| |
| *sizeOut = 0; |
| |
| availIn = xmlBufUse(in); |
| if (availIn == 0) |
| return(0); |
| dataIn = xmlBufContent(in); |
| totalIn = 0; |
| |
| while (1) { |
| size_t availOut; |
| int completeOut, completeIn; |
| int c_out, c_in; |
| |
| availOut = xmlBufAvail(out); |
| if (availOut > INT_MAX / 2) |
| availOut = INT_MAX / 2; |
| |
| if (availOut < maxOut) { |
| c_out = availOut; |
| completeOut = 0; |
| } else { |
| c_out = maxOut; |
| completeOut = 1; |
| } |
| |
| if (availIn > INT_MAX / 2) { |
| c_in = INT_MAX / 2; |
| completeIn = 0; |
| } else { |
| c_in = availIn; |
| completeIn = 1; |
| } |
| |
| ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, |
| dataIn, &c_in); |
| |
| totalIn += c_in; |
| dataIn += c_in; |
| availIn -= c_in; |
| |
| totalOut += c_out; |
| maxOut -= c_out; |
| xmlBufAddLen(out, c_out); |
| |
| if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) { |
| input->error = xmlEncConvertError(ret); |
| return(ret); |
| } |
| |
| if ((completeOut) && (completeIn)) |
| break; |
| if ((completeOut) && (ret == XML_ENC_ERR_SPACE)) |
| break; |
| if ((completeIn) && (ret == XML_ENC_ERR_SUCCESS)) |
| break; |
| |
| if (ret == XML_ENC_ERR_SPACE) { |
| if (xmlBufGrow(out, 4096) < 0) { |
| input->error = XML_ERR_NO_MEMORY; |
| return(XML_ENC_ERR_MEMORY); |
| } |
| } |
| } |
| |
| xmlBufShrink(in, totalIn); |
| |
| if (input->rawconsumed > ULONG_MAX - (unsigned long) totalIn) |
| input->rawconsumed = ULONG_MAX; |
| else |
| input->rawconsumed += totalIn; |
| |
| *sizeOut = totalOut; |
| return(XML_ERR_OK); |
| } |
| |
| /** |
| * xmlCharEncInFunc: |
| * @handler: char encoding transformation data structure |
| * @out: an xmlBuffer for the output. |
| * @in: an xmlBuffer for the input |
| * |
| * Generic front-end for the encoding handler input function |
| * |
| * Returns the number of bytes written or an XML_ENC_ERR code. |
| */ |
| int |
| xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, |
| xmlBufferPtr in) |
| { |
| int ret; |
| int written; |
| int toconv; |
| |
| if (handler == NULL) |
| return(XML_ENC_ERR_INTERNAL); |
| if (out == NULL) |
| return(XML_ENC_ERR_INTERNAL); |
| if (in == NULL) |
| return(XML_ENC_ERR_INTERNAL); |
| |
| toconv = in->use; |
| if (toconv == 0) |
| return (0); |
| written = out->size - out->use -1; /* count '\0' */ |
| if (toconv * 2 >= written) { |
| xmlBufferGrow(out, out->size + toconv * 2); |
| written = out->size - out->use - 1; |
| } |
| ret = xmlEncInputChunk(handler, &out->content[out->use], &written, |
| in->content, &toconv); |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| out->content[out->use] = 0; |
| |
| return (written? written : ret); |
| } |
| |
| #ifdef LIBXML_OUTPUT_ENABLED |
| /** |
| * xmlCharEncOutput: |
| * @output: a parser output buffer |
| * @init: is this an initialization call without data |
| * |
| * Generic front-end for the encoding handler on parser output |
| * a first call with @init == 1 has to be made first to initiate the |
| * output in case of non-stateless encoding needing to initiate their |
| * state or the output (like the BOM in UTF16). |
| * In case of UTF8 sequence conversion errors for the given encoder, |
| * the content will be automatically remapped to a CharRef sequence. |
| * |
| * Returns the number of bytes written or an XML_ENC_ERR code. |
| */ |
| int |
| xmlCharEncOutput(xmlOutputBufferPtr output, int init) |
| { |
| int ret; |
| size_t written; |
| int writtentot = 0; |
| size_t toconv; |
| int c_in; |
| int c_out; |
| xmlBufPtr in; |
| xmlBufPtr out; |
| |
| if ((output == NULL) || (output->encoder == NULL) || |
| (output->buffer == NULL) || (output->conv == NULL)) |
| return(XML_ENC_ERR_INTERNAL); |
| out = output->conv; |
| in = output->buffer; |
| |
| retry: |
| |
| written = xmlBufAvail(out); |
| |
| /* |
| * First specific handling of the initialization call |
| */ |
| if (init) { |
| c_in = 0; |
| c_out = written; |
| /* TODO: Check return value. */ |
| xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out, |
| NULL, &c_in); |
| xmlBufAddLen(out, c_out); |
| return(c_out); |
| } |
| |
| /* |
| * Conversion itself. |
| */ |
| toconv = xmlBufUse(in); |
| if (toconv > 64 * 1024) |
| toconv = 64 * 1024; |
| if (toconv * 4 >= written) { |
| if (xmlBufGrow(out, toconv * 4) < 0) { |
| ret = XML_ENC_ERR_MEMORY; |
| goto error; |
| } |
| written = xmlBufAvail(out); |
| } |
| if (written > 256 * 1024) |
| written = 256 * 1024; |
| |
| c_in = toconv; |
| c_out = written; |
| ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out, |
| xmlBufContent(in), &c_in); |
| xmlBufShrink(in, c_in); |
| xmlBufAddLen(out, c_out); |
| writtentot += c_out; |
| |
| if (ret == XML_ENC_ERR_SPACE) |
| goto retry; |
| |
| /* |
| * Attempt to handle error cases |
| */ |
| if (ret == XML_ENC_ERR_INPUT) { |
| xmlChar charref[20]; |
| int len = xmlBufUse(in); |
| xmlChar *content = xmlBufContent(in); |
| int cur, charrefLen; |
| |
| cur = xmlGetUTF8Char(content, &len); |
| if (cur <= 0) |
| goto error; |
| |
| /* |
| * Removes the UTF8 sequence, and replace it by a charref |
| * and continue the transcoding phase, hoping the error |
| * did not mangle the encoder state. |
| */ |
| charrefLen = xmlSerializeDecCharRef((char *) charref, cur); |
| xmlBufGrow(out, charrefLen * 4); |
| c_out = xmlBufAvail(out); |
| c_in = charrefLen; |
| ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out, |
| charref, &c_in); |
| if ((ret < 0) || (c_in != charrefLen)) { |
| ret = XML_ENC_ERR_INTERNAL; |
| goto error; |
| } |
| |
| xmlBufShrink(in, len); |
| xmlBufAddLen(out, c_out); |
| writtentot += c_out; |
| goto retry; |
| } |
| |
| error: |
| if (((writtentot <= 0) && (ret != 0)) || |
| (ret == XML_ENC_ERR_MEMORY)) { |
| if (output->error == 0) |
| output->error = xmlEncConvertError(ret); |
| return(ret); |
| } |
| |
| return(writtentot); |
| } |
| #endif |
| |
| /** |
| * xmlCharEncOutFunc: |
| * @handler: char encoding transformation data structure |
| * @out: an xmlBuffer for the output. |
| * @in: an xmlBuffer for the input |
| * |
| * Generic front-end for the encoding handler output function |
| * a first call with @in == NULL has to be made firs to initiate the |
| * output in case of non-stateless encoding needing to initiate their |
| * state or the output (like the BOM in UTF16). |
| * In case of UTF8 sequence conversion errors for the given encoder, |
| * the content will be automatically remapped to a CharRef sequence. |
| * |
| * Returns the number of bytes written or an XML_ENC_ERR code. |
| */ |
| int |
| xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
| xmlBufferPtr in) { |
| int ret; |
| int written; |
| int writtentot = 0; |
| int toconv; |
| |
| if (handler == NULL) return(XML_ENC_ERR_INTERNAL); |
| if (out == NULL) return(XML_ENC_ERR_INTERNAL); |
| |
| retry: |
| |
| written = out->size - out->use; |
| |
| if (written > 0) |
| written--; /* Gennady: count '/0' */ |
| |
| /* |
| * First specific handling of in = NULL, i.e. the initialization call |
| */ |
| if (in == NULL) { |
| toconv = 0; |
| /* TODO: Check return value. */ |
| xmlEncOutputChunk(handler, &out->content[out->use], &written, |
| NULL, &toconv); |
| out->use += written; |
| out->content[out->use] = 0; |
| return(0); |
| } |
| |
| /* |
| * Conversion itself. |
| */ |
| toconv = in->use; |
| if (toconv * 4 >= written) { |
| xmlBufferGrow(out, toconv * 4); |
| written = out->size - out->use - 1; |
| } |
| ret = xmlEncOutputChunk(handler, &out->content[out->use], &written, |
| in->content, &toconv); |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| writtentot += written; |
| out->content[out->use] = 0; |
| |
| if (ret == XML_ENC_ERR_SPACE) |
| goto retry; |
| |
| /* |
| * Attempt to handle error cases |
| */ |
| if (ret == XML_ENC_ERR_INPUT) { |
| xmlChar charref[20]; |
| int len = in->use; |
| const xmlChar *utf = (const xmlChar *) in->content; |
| int cur, charrefLen; |
| |
| cur = xmlGetUTF8Char(utf, &len); |
| if (cur <= 0) |
| return(ret); |
| |
| /* |
| * Removes the UTF8 sequence, and replace it by a charref |
| * and continue the transcoding phase, hoping the error |
| * did not mangle the encoder state. |
| */ |
| charrefLen = xmlSerializeDecCharRef((char *) charref, cur); |
| xmlBufferShrink(in, len); |
| xmlBufferGrow(out, charrefLen * 4); |
| written = out->size - out->use - 1; |
| toconv = charrefLen; |
| ret = xmlEncOutputChunk(handler, &out->content[out->use], &written, |
| charref, &toconv); |
| if ((ret < 0) || (toconv != charrefLen)) |
| return(XML_ENC_ERR_INTERNAL); |
| |
| out->use += written; |
| writtentot += written; |
| out->content[out->use] = 0; |
| goto retry; |
| } |
| return(writtentot ? writtentot : ret); |
| } |
| |
| /** |
| * xmlCharEncCloseFunc: |
| * @handler: char encoding transformation data structure |
| * |
| * Releases an xmlCharEncodingHandler. Must be called after |
| * a handler is no longer in use. |
| * |
| * Returns 0. |
| */ |
| int |
| xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { |
| if (handler == NULL) |
| return(0); |
| |
| if (handler->flags & XML_HANDLER_STATIC) |
| return(0); |
| |
| xmlFree(handler->name); |
| if (handler->ctxtDtor != NULL) { |
| handler->ctxtDtor(handler->inputCtxt); |
| handler->ctxtDtor(handler->outputCtxt); |
| } |
| xmlFree(handler); |
| return(0); |
| } |
| |
| /** |
| * xmlByteConsumed: |
| * @ctxt: an XML parser context |
| * |
| * DEPRECATED: Don't use. |
| * |
| * This function provides the current index of the parser relative |
| * to the start of the current entity. This function is computed in |
| * bytes from the beginning starting at zero and finishing at the |
| * size in byte of the file if parsing a file. The function is |
| * of constant cost if the input is UTF-8 but can be costly if run |
| * on non-UTF-8 input. |
| * |
| * Returns the index in bytes from the beginning of the entity or -1 |
| * in case the index could not be computed. |
| */ |
| long |
| xmlByteConsumed(xmlParserCtxtPtr ctxt) { |
| xmlParserInputPtr in; |
| |
| if (ctxt == NULL) |
| return(-1); |
| in = ctxt->input; |
| if (in == NULL) |
| return(-1); |
| |
| if ((in->buf != NULL) && (in->buf->encoder != NULL)) { |
| int unused = 0; |
| xmlCharEncodingHandler * handler = in->buf->encoder; |
| |
| /* |
| * Encoding conversion, compute the number of unused original |
| * bytes from the input not consumed and subtract that from |
| * the raw consumed value, this is not a cheap operation |
| */ |
| if (in->end - in->cur > 0) { |
| unsigned char *convbuf; |
| const unsigned char *cur = (const unsigned char *)in->cur; |
| int toconv, ret; |
| |
| convbuf = xmlMalloc(32000); |
| if (convbuf == NULL) |
| return(-1); |
| |
| toconv = in->end - cur; |
| unused = 32000; |
| ret = xmlEncOutputChunk(handler, convbuf, &unused, cur, &toconv); |
| |
| xmlFree(convbuf); |
| |
| if (ret != XML_ENC_ERR_SUCCESS) |
| return(-1); |
| } |
| |
| if (in->buf->rawconsumed < (unsigned long) unused) |
| return(-1); |
| return(in->buf->rawconsumed - unused); |
| } |
| |
| return(in->consumed + (in->cur - in->base)); |
| } |
| |
| /************************************************************************ |
| * * |
| * Conversions To/From UTF8 encoding * |
| * * |
| ************************************************************************/ |
| |
| static int |
| asciiToAscii(unsigned char* out, int *poutlen, |
| const unsigned char* in, int *pinlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| const unsigned char *inend; |
| const unsigned char *instart = in; |
| int inlen, outlen, ret; |
| |
| if (in == NULL) { |
| *pinlen = 0; |
| *poutlen = 0; |
| return(XML_ENC_ERR_SUCCESS); |
| } |
| |
| inlen = *pinlen; |
| outlen = *poutlen; |
| |
| if (outlen < inlen) { |
| inlen = outlen; |
| ret = XML_ENC_ERR_SPACE; |
| } else { |
| ret = inlen; |
| } |
| |
| inend = in + inlen; |
| *poutlen = inlen; |
| *pinlen = inlen; |
| |
| while (in < inend) { |
| unsigned c = *in; |
| |
| if (c >= 0x80) { |
| *poutlen = in - instart; |
| *pinlen = in - instart; |
| return(XML_ENC_ERR_INPUT); |
| } |
| |
| in++; |
| *out++ = c; |
| } |
| |
| return(ret); |
| } |
| |
| static int |
| latin1ToUTF8(unsigned char* out, int *outlen, |
| const unsigned char* in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| unsigned char* outstart = out; |
| const unsigned char* instart = in; |
| unsigned char* outend; |
| const unsigned char* inend; |
| int ret = XML_ENC_ERR_SPACE; |
| |
| if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL)) |
| return(XML_ENC_ERR_INTERNAL); |
| |
| outend = out + *outlen; |
| inend = in + *inlen; |
| |
| while (in < inend) { |
| unsigned c = *in; |
| |
| if (c < 0x80) { |
| if (out >= outend) |
| goto done; |
| *out++ = c; |
| } else { |
| if (outend - out < 2) |
| goto done; |
| *out++ = (c >> 6) | 0xC0; |
| *out++ = (c & 0x3F) | 0x80; |
| } |
| |
| in++; |
| } |
| |
| ret = out - outstart; |
| |
| done: |
| *outlen = out - outstart; |
| *inlen = in - instart; |
| return(ret); |
| } |
| |
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Public entry point for latin1ToUTF8.
 *
 * Returns the number of bytes written or an XML_ENC_ERR code.
 *
 * The value of @inlen after return is the number of octets consumed
 * if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    int res;

    res = latin1ToUTF8(out, outlen, in, inlen, NULL);

    return(res);
}
| |
| static int |
| UTF8ToUTF8(unsigned char* out, int *outlen, |
| const unsigned char* in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| int len; |
| int ret; |
| |
| if (in == NULL) { |
| *inlen = 0; |
| *outlen = 0; |
| return(XML_ENC_ERR_SUCCESS); |
| } |
| |
| if (*outlen < *inlen) { |
| len = *outlen; |
| ret = XML_ENC_ERR_SPACE; |
| } else { |
| len = *inlen; |
| ret = len; |
| } |
| |
| memcpy(out, in, len); |
| |
| *outlen = len; |
| *inlen = len; |
| return(ret); |
| } |
| |
| |
| #ifdef LIBXML_OUTPUT_ENABLED |
| static int |
| UTF8ToLatin1(unsigned char* out, int *outlen, |
| const unsigned char* in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| const unsigned char* outend; |
| const unsigned char* outstart = out; |
| const unsigned char* instart = in; |
| const unsigned char* inend; |
| unsigned c; |
| int ret = XML_ENC_ERR_SPACE; |
| |
| if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) |
| return(XML_ENC_ERR_INTERNAL); |
| |
| if (in == NULL) { |
| *inlen = 0; |
| *outlen = 0; |
| return(XML_ENC_ERR_SUCCESS); |
| } |
| |
| inend = in + *inlen; |
| outend = out + *outlen; |
| while (in < inend) { |
| if (out >= outend) |
| goto done; |
| |
| c = *in; |
| |
| if (c < 0x80) { |
| *out++ = c; |
| } else if ((c >= 0xC2) && (c <= 0xC3)) { |
| if (inend - in < 2) |
| break; |
| in++; |
| *out++ = (unsigned char) ((c << 6) | (*in & 0x3F)); |
| } else { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| |
| in++; |
| } |
| |
| ret = out - outstart; |
| |
| done: |
| *outlen = out - outstart; |
| *inlen = in - instart; |
| return(ret); |
| } |
| |
| /** |
| * UTF8Toisolat1: |
| * @out: a pointer to an array of bytes to store the result |
| * @outlen: the length of @out |
| * @in: a pointer to an array of UTF-8 chars |
| * @inlen: the length of @in |
| * |
| * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 |
| * block of chars out. |
| * |
| * Returns the number of bytes written or an XML_ENC_ERR code. |
| * |
| * The value of @inlen after return is the number of octets consumed |
| * if the return value is positive, else unpredictable. |
| * The value of @outlen after return is the number of octets produced. |
| */ |
| int |
| UTF8Toisolat1(unsigned char* out, int *outlen, |
| const unsigned char* in, int *inlen) { |
| if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL)) |
| return(XML_ENC_ERR_INTERNAL); |
| |
| return(UTF8ToLatin1(out, outlen, in, inlen, NULL)); |
| } |
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| |
| static int |
| UTF16LEToUTF8(unsigned char *out, int *outlen, |
| const unsigned char *in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| const unsigned char *instart = in; |
| const unsigned char *inend = in + (*inlen & ~1); |
| unsigned char *outstart = out; |
| unsigned char *outend = out + *outlen; |
| unsigned c, d; |
| int ret = XML_ENC_ERR_SPACE; |
| |
| while (in < inend) { |
| c = in[0] | (in[1] << 8); |
| |
| if (c < 0x80) { |
| if (out >= outend) |
| goto done; |
| out[0] = c; |
| in += 2; |
| out += 1; |
| } else if (c < 0x800) { |
| if (outend - out < 2) |
| goto done; |
| out[0] = (c >> 6) | 0xC0; |
| out[1] = (c & 0x3F) | 0x80; |
| in += 2; |
| out += 2; |
| } else if ((c & 0xF800) != 0xD800) { |
| if (outend - out < 3) |
| goto done; |
| out[0] = (c >> 12) | 0xE0; |
| out[1] = ((c >> 6) & 0x3F) | 0x80; |
| out[2] = (c & 0x3F) | 0x80; |
| in += 2; |
| out += 3; |
| } else { |
| /* Surrogate pair */ |
| if ((c & 0xFC00) != 0xD800) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| if (inend - in < 4) |
| break; |
| d = in[2] | (in[3] << 8); |
| if ((d & 0xFC00) != 0xDC00) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| if (outend - out < 4) |
| goto done; |
| c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000); |
| out[0] = (c >> 18) | 0xF0; |
| out[1] = ((c >> 12) & 0x3F) | 0x80; |
| out[2] = ((c >> 6) & 0x3F) | 0x80; |
| out[3] = (c & 0x3F) | 0x80; |
| in += 4; |
| out += 4; |
| } |
| } |
| |
| ret = out - outstart; |
| |
| done: |
| *outlen = out - outstart; |
| *inlen = in - instart; |
| return(ret); |
| } |
| |
| #ifdef LIBXML_OUTPUT_ENABLED |
| static int |
| UTF8ToUTF16LE(unsigned char *out, int *outlen, |
| const unsigned char *in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| const unsigned char *instart = in; |
| const unsigned char *inend; |
| unsigned char *outstart = out; |
| unsigned char *outend; |
| unsigned c, d; |
| int ret = XML_ENC_ERR_SPACE; |
| |
| /* UTF16LE encoding has no BOM */ |
| if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) |
| return(XML_ENC_ERR_INTERNAL); |
| if (in == NULL) { |
| *outlen = 0; |
| *inlen = 0; |
| return(0); |
| } |
| inend = in + *inlen; |
| outend = out + (*outlen & ~1); |
| while (in < inend) { |
| c = in[0]; |
| |
| if (c < 0x80) { |
| if (out >= outend) |
| goto done; |
| out[0] = c; |
| out[1] = 0; |
| in += 1; |
| out += 2; |
| } else { |
| int i, len; |
| unsigned min; |
| |
| if (c < 0xE0) { |
| if (c < 0xC2) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| c &= 0x1F; |
| len = 2; |
| min = 0x80; |
| } else if (c < 0xF0) { |
| c &= 0x0F; |
| len = 3; |
| min = 0x800; |
| } else { |
| c &= 0x0F; |
| len = 4; |
| min = 0x10000; |
| } |
| |
| if (inend - in < len) |
| break; |
| |
| for (i = 1; i < len; i++) { |
| if ((in[i] & 0xC0) != 0x80) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| c = (c << 6) | (in[i] & 0x3F); |
| } |
| |
| if ((c < min) || |
| ((c >= 0xD800) && (c <= 0xDFFF)) || |
| (c > 0x10FFFF)) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| |
| if (c < 0x10000) { |
| if (out >= outend) |
| goto done; |
| out[0] = c & 0xFF; |
| out[1] = c >> 8; |
| out += 2; |
| } else { |
| if (outend - out < 4) |
| goto done; |
| c -= 0x10000; |
| d = (c & 0x03FF) | 0xDC00; |
| c = (c >> 10) | 0xD800; |
| out[0] = c & 0xFF; |
| out[1] = c >> 8; |
| out[2] = d & 0xFF; |
| out[3] = d >> 8; |
| out += 4; |
| } |
| |
| in += len; |
| } |
| } |
| |
| ret = out - outstart; |
| |
| done: |
| *outlen = out - outstart; |
| *inlen = in - instart; |
| return(ret); |
| } |
| |
| static int |
| UTF8ToUTF16(unsigned char* outb, int *outlen, |
| const unsigned char* in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| if (in == NULL) { |
| /* |
| * initialization, add the Byte Order Mark for UTF-16LE |
| */ |
| if (*outlen >= 2) { |
| outb[0] = 0xFF; |
| outb[1] = 0xFE; |
| *outlen = 2; |
| *inlen = 0; |
| return(2); |
| } |
| *outlen = 0; |
| *inlen = 0; |
| return(0); |
| } |
| return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL)); |
| } |
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| |
| static int |
| UTF16BEToUTF8(unsigned char *out, int *outlen, |
| const unsigned char *in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| const unsigned char *instart = in; |
| const unsigned char *inend = in + (*inlen & ~1); |
| unsigned char *outstart = out; |
| unsigned char *outend = out + *outlen; |
| unsigned c, d; |
| int ret = XML_ENC_ERR_SPACE; |
| |
| while (in < inend) { |
| c = (in[0] << 8) | in[1]; |
| |
| if (c < 0x80) { |
| if (out >= outend) |
| goto done; |
| out[0] = c; |
| in += 2; |
| out += 1; |
| } else if (c < 0x800) { |
| if (outend - out < 2) |
| goto done; |
| out[0] = (c >> 6) | 0xC0; |
| out[1] = (c & 0x3F) | 0x80; |
| in += 2; |
| out += 2; |
| } else if ((c & 0xF800) != 0xD800) { |
| if (outend - out < 3) |
| goto done; |
| out[0] = (c >> 12) | 0xE0; |
| out[1] = ((c >> 6) & 0x3F) | 0x80; |
| out[2] = (c & 0x3F) | 0x80; |
| in += 2; |
| out += 3; |
| } else { |
| /* Surrogate pair */ |
| if ((c & 0xFC00) != 0xD800) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| if (inend - in < 4) |
| break; |
| d = (in[2] << 8) | in[3]; |
| if ((d & 0xFC00) != 0xDC00) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| if (outend - out < 4) |
| goto done; |
| c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000); |
| out[0] = (c >> 18) | 0xF0; |
| out[1] = ((c >> 12) & 0x3F) | 0x80; |
| out[2] = ((c >> 6) & 0x3F) | 0x80; |
| out[3] = (c & 0x3F) | 0x80; |
| in += 4; |
| out += 4; |
| } |
| } |
| |
| ret = out - outstart; |
| |
| done: |
| *outlen = out - outstart; |
| *inlen = in - instart; |
| return(ret); |
| } |
| |
| #ifdef LIBXML_OUTPUT_ENABLED |
| static int |
| UTF8ToUTF16BE(unsigned char *out, int *outlen, |
| const unsigned char *in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| const unsigned char *instart = in; |
| const unsigned char *inend; |
| unsigned char *outstart = out; |
| unsigned char *outend; |
| unsigned c, d; |
| int ret = XML_ENC_ERR_SPACE; |
| |
| /* UTF-16BE has no BOM */ |
| if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); |
| if (in == NULL) { |
| *outlen = 0; |
| *inlen = 0; |
| return(0); |
| } |
| inend = in + *inlen; |
| outend = out + (*outlen & ~1); |
| while (in < inend) { |
| c = in[0]; |
| |
| if (c < 0x80) { |
| if (out >= outend) |
| goto done; |
| out[0] = 0; |
| out[1] = c; |
| in += 1; |
| out += 2; |
| } else { |
| int i, len; |
| unsigned min; |
| |
| if (c < 0xE0) { |
| if (c < 0xC2) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| c &= 0x1F; |
| len = 2; |
| min = 0x80; |
| } else if (c < 0xF0) { |
| c &= 0x0F; |
| len = 3; |
| min = 0x800; |
| } else { |
| c &= 0x0F; |
| len = 4; |
| min = 0x10000; |
| } |
| |
| if (inend - in < len) |
| break; |
| |
| for (i = 1; i < len; i++) { |
| if ((in[i] & 0xC0) != 0x80) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| c = (c << 6) | (in[i] & 0x3F); |
| } |
| |
| if ((c < min) || |
| ((c >= 0xD800) && (c <= 0xDFFF)) || |
| (c > 0x10FFFF)) { |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| |
| if (c < 0x10000) { |
| if (out >= outend) |
| goto done; |
| out[0] = c >> 8; |
| out[1] = c & 0xFF; |
| out += 2; |
| } else { |
| if (outend - out < 4) |
| goto done; |
| c -= 0x10000; |
| d = (c & 0x03FF) | 0xDC00; |
| c = (c >> 10) | 0xD800; |
| out[0] = c >> 8; |
| out[1] = c & 0xFF; |
| out[2] = d >> 8; |
| out[3] = d & 0xFF; |
| out += 4; |
| } |
| |
| in += len; |
| } |
| } |
| |
| ret = out - outstart; |
| |
| done: |
| *outlen = out - outstart; |
| *inlen = in - instart; |
| return(ret); |
| } |
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| |
| #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED) |
| static int |
| UTF8ToHtmlWrapper(unsigned char *out, int *outlen, |
| const unsigned char *in, int *inlen, |
| void *vctxt ATTRIBUTE_UNUSED) { |
| return(UTF8ToHtml(out, outlen, in, inlen)); |
| } |
| #endif |
| |
| #if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \ |
| defined(LIBXML_ISO8859X_ENABLED) |
| |
| static int |
| UTF8ToISO8859x(unsigned char *out, int *outlen, |
| const unsigned char *in, int *inlen, void *vctxt) { |
| const unsigned char *xlattable = vctxt; |
| const unsigned char *instart = in; |
| const unsigned char *inend; |
| unsigned char *outstart = out; |
| unsigned char *outend; |
| int ret = XML_ENC_ERR_SPACE; |
| |
| if (in == NULL) { |
| /* |
| * initialization nothing to do |
| */ |
| *outlen = 0; |
| *inlen = 0; |
| return(XML_ENC_ERR_SUCCESS); |
| } |
| |
| inend = in + *inlen; |
| outend = out + *outlen; |
| while (in < inend) { |
| unsigned d = *in; |
| |
| if (d < 0x80) { |
| if (out >= outend) |
| goto done; |
| in += 1; |
| } else if (d < 0xE0) { |
| unsigned c; |
| |
| if (inend - in < 2) |
| break; |
| c = in[1] & 0x3F; |
| d = d & 0x1F; |
| d = xlattable [48 + c + xlattable [d] * 64]; |
| if (d == 0) { |
| /* not in character set */ |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| if (out >= outend) |
| goto done; |
| in += 2; |
| } else if (d < 0xF0) { |
| unsigned c1; |
| unsigned c2; |
| |
| if (inend - in < 3) |
| break; |
| c1 = in[1] & 0x3F; |
| c2 = in[2] & 0x3F; |
| d = d & 0x0F; |
| d = xlattable [48 + c2 + xlattable [48 + c1 + |
| xlattable [32 + d] * 64] * 64]; |
| if (d == 0) { |
| /* not in character set */ |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| if (out >= outend) |
| goto done; |
| in += 3; |
| } else { |
| /* cannot transcode >= U+010000 */ |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| |
| *out++ = d; |
| } |
| |
| ret = out - outstart; |
| |
| done: |
| *outlen = out - outstart; |
| *inlen = in - instart; |
| return(ret); |
| } |
| |
| static int |
| ISO8859xToUTF8(unsigned char* out, int *outlen, |
| const unsigned char* in, int *inlen, void *vctxt) { |
| unsigned short const *unicodetable = vctxt; |
| const unsigned char* instart = in; |
| const unsigned char* inend; |
| unsigned char* outstart = out; |
| unsigned char* outend; |
| int ret = XML_ENC_ERR_SPACE; |
| |
| outend = out + *outlen; |
| inend = in + *inlen; |
| |
| while (in < inend) { |
| unsigned c = *in; |
| |
| if (c < 0x80) { |
| if (out >= outend) |
| goto done; |
| *out++ = c; |
| } else { |
| c = unicodetable[c - 0x80]; |
| if (c == 0) { |
| /* undefined code point */ |
| ret = XML_ENC_ERR_INPUT; |
| goto done; |
| } |
| if (c < 0x800) { |
| if (outend - out < 2) |
| goto done; |
| *out++ = ((c >> 6) & 0x1F) | 0xC0; |
| *out++ = (c & 0x3F) | 0x80; |
| } else { |
| if (outend - out < 3) |
| goto done; |
| *out++ = ((c >> 12) & 0x0F) | 0xE0; |
| *out++ = ((c >> 6) & 0x3F) | 0x80; |
| *out++ = (c & 0x3F) | 0x80; |
| } |
| } |
| |
| in += 1; |
| } |
| |
| ret = out - outstart; |
| |
| done: |
| *outlen = out - outstart; |
| *inlen = in - instart; |
| return(ret); |
| } |
| |
| #endif |
| |