Index: encoding.c =================================================================== --- encoding.c (revision 3733) +++ encoding.c (working copy) @@ -58,7 +58,7 @@ static int xmlCharEncodingAliasesNb = 0; static int xmlCharEncodingAliasesMax = 0; -#ifdef LIBXML_ICONV_ENABLED +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) #if 0 #define DEBUG_ENCODING /* Define this to get encoding traces */ #endif @@ -97,6 +97,54 @@ NULL, 0, val, NULL, NULL, 0, 0, msg, val); } +#ifdef LIBXML_ICU_ENABLED +static uconv_t* +openIcuConverter(const char* name, int toUnicode) +{ + UErrorCode status = U_ZERO_ERROR; + uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t)); + if (conv == NULL) + return NULL; + + conv->uconv = ucnv_open(name, &status); + if (U_FAILURE(status)) + goto error; + + status = U_ZERO_ERROR; + if (toUnicode) { + ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, + NULL, NULL, NULL, &status); + } + else { + ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, + NULL, NULL, NULL, &status); + } + if (U_FAILURE(status)) + goto error; + + status = U_ZERO_ERROR; + conv->utf8 = ucnv_open("UTF-8", &status); + if (U_SUCCESS(status)) + return conv; + +error: + if (conv->uconv) + ucnv_close(conv->uconv); + xmlFree(conv); + return NULL; +} + +static void +closeIcuConverter(uconv_t *conv) +{ + if (conv != NULL) { + ucnv_close(conv->uconv); + ucnv_close(conv->utf8); + xmlFree(conv); + } +} +#endif /* LIBXML_ICU_ENABLED */ + /************************************************************************ * * * Conversions To/From UTF8 encoding * @@ -1306,7 +1354,11 @@ #ifdef LIBXML_ICONV_ENABLED handler->iconv_in = NULL; handler->iconv_out = NULL; -#endif /* LIBXML_ICONV_ENABLED */ +#endif +#ifdef LIBXML_ICU_ENABLED + handler->uconv_in = NULL; + handler->uconv_out = NULL; +#endif /* * registers and returns the handler. @@ -1371,7 +1423,7 @@ xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); #endif /* LIBXML_OUTPUT_ENABLED */ -#ifndef LIBXML_ICONV_ENABLED +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) #ifdef LIBXML_ISO8859X_ENABLED xmlRegisterCharEncodingHandlersISO8859x (); #endif @@ -1576,6 +1628,10 @@ xmlCharEncodingHandlerPtr enc; iconv_t icv_in, icv_out; #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + xmlCharEncodingHandlerPtr enc; + uconv_t *ucv_in, *ucv_out; +#endif /* LIBXML_ICU_ENABLED */ char upper[100]; int i; @@ -1642,6 +1698,35 @@ "iconv : problems with filters for '%s'\n", name); } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + /* check whether icu can handle this */ + ucv_in = openIcuConverter(name, 1); + ucv_out = openIcuConverter(name, 0); + if (ucv_in != NULL && ucv_out != NULL) { + enc = (xmlCharEncodingHandlerPtr) + xmlMalloc(sizeof(xmlCharEncodingHandler)); + if (enc == NULL) { + closeIcuConverter(ucv_in); + closeIcuConverter(ucv_out); + return(NULL); + } + enc->name = xmlMemStrdup(name); + enc->input = NULL; + enc->output = NULL; + enc->uconv_in = ucv_in; + enc->uconv_out = ucv_out; +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "Found ICU converter handler for encoding %s\n", name); +#endif + return enc; + } else if (ucv_in != NULL || ucv_out != NULL) { + closeIcuConverter(ucv_in); + closeIcuConverter(ucv_out); + xmlEncodingErr(XML_ERR_INTERNAL_ERROR, + "ICU converter : problems with filters for '%s'\n", name); + } +#endif /* LIBXML_ICU_ENABLED */ #ifdef DEBUG_ENCODING xmlGenericError(xmlGenericErrorContext, @@ -1732,6 +1817,75 @@ /************************************************************************ * * + * ICU based generic conversion functions * + * * + ************************************************************************/ + +#ifdef LIBXML_ICU_ENABLED +/** + * xmlUconvWrapper: + * @cd: ICU uconverter data structure + * @toUnicode : non-zero if toUnicode. 0 otherwise. + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ISO Latin 1 chars + * @inlen: the length of @in + * + * Returns 0 if success, or + * -1 by lack of space, or + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + * -3 if there the last byte can't form a single output char. + * + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of ocetes consumed. + */ +static int +xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, + const unsigned char *in, int *inlen) { + const char *ucv_in = (const char *) in; + char *ucv_out = (char *) out; + UErrorCode err = U_ZERO_ERROR; + + if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { + if (outlen != NULL) *outlen = 0; + return(-1); + } + + /* + * TODO(jungshik) + * 1. is ucnv_convert(To|From)Algorithmic better? + * 2. had we better use an explicit pivot buffer? + * 3. error returned comes from 'fromUnicode' only even + * when toUnicode is true ! + */ + if (toUnicode) { + /* encoding => UTF-16 => UTF-8 */ + ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, + 0, TRUE, &err); + } else { + /* UTF-8 => UTF-16 => encoding */ + ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, + 0, TRUE, &err); + } + *inlen = ucv_in - (const char*) in; + *outlen = ucv_out - (char *) out; + if (U_SUCCESS(err)) + return 0; + if (err == U_BUFFER_OVERFLOW_ERROR) + return -1; + if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND) + return -2; + /* if (err == U_TRUNCATED_CHAR_FOUND) */ + return -3; +} +#endif /* LIBXML_ICU_ENABLED */ + +/************************************************************************ + * * * The real API used by libxml for on-the-fly conversion * * * ************************************************************************/ @@ -1794,6 +1948,16 @@ if (ret == -1) ret = -3; } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_in != NULL) { + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) ret = -3; + } +#endif /* LIBXML_ICU_ENABLED */ #ifdef DEBUG_ENCODING switch (ret) { case 0: @@ -1879,6 +2043,17 @@ ret = -3; } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_in != NULL) { + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICU_ENABLED */ switch (ret) { case 0: #ifdef DEBUG_ENCODING @@ -1979,6 +2154,15 @@ out->content[out->use] = 0; } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_out != NULL) { + ret = xmlUconvWrapper(handler->uconv_out, 0, + &out->content[out->use], + &written, NULL, &toconv); + out->use += written; + out->content[out->use] = 0; + } +#endif /* LIBXML_ICU_ENABLED */ #ifdef DEBUG_ENCODING xmlGenericError(xmlGenericErrorContext, "initialized encoder\n"); @@ -2023,6 +2207,26 @@ } } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (handler->uconv_out != NULL) { + ret = xmlUconvWrapper(handler->uconv_out, 0, + &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + writtentot += written; + out->content[out->use] = 0; + if (ret == -1) { + if (written > 0) { + /* + * Can be a limitation of iconv + */ + goto retry; + } + ret = -3; + } + } +#endif /* LIBXML_ICU_ENABLED */ else { xmlEncodingErr(XML_I18N_NO_OUTPUT, "xmlCharEncOutFunc: no output function !\n", NULL); @@ -2135,6 +2339,22 @@ xmlFree(handler); } #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) { + if (handler->name != NULL) + xmlFree(handler->name); + handler->name = NULL; + if (handler->uconv_out != NULL) { + closeIcuConverter(handler->uconv_out); + handler->uconv_out = NULL; + } + if (handler->uconv_in != NULL) { + closeIcuConverter(handler->uconv_in); + handler->uconv_in = NULL; + } + xmlFree(handler); + } +#endif #ifdef DEBUG_ENCODING if (ret) xmlGenericError(xmlGenericErrorContext, @@ -2210,6 +2430,22 @@ cur += toconv; } while (ret == -2); #endif +#ifdef LIBXML_ICU_ENABLED + } else if (handler->uconv_out != NULL) { + do { + toconv = in->end - cur; + written = 32000; + ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0], + &written, cur, &toconv); + if (ret < 0) { + if (written > 0) + ret = -2; + else + return(-1); + } + unused += written; + cur += toconv; + } while (ret == -2); } else { /* could not find a converter */ return(-1); @@ -2221,8 +2457,9 @@ } return(in->consumed + (in->cur - in->base)); } +#endif -#ifndef LIBXML_ICONV_ENABLED +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) #ifdef LIBXML_ISO8859X_ENABLED /** @@ -3294,4 +3531,3 @@ #define bottom_encoding #include "elfgcchack.h" - Index: include/libxml/encoding.h =================================================================== --- include/libxml/encoding.h (revision 3733) +++ include/libxml/encoding.h (working copy) @@ -26,7 +26,25 @@ #ifdef LIBXML_ICONV_ENABLED #include +#else +#ifdef LIBXML_ICU_ENABLED +#include +#if 0 +/* Forward-declare UConverter here rather than pulling in + * to prevent unwanted ICU symbols being exposed to users of libxml2. + * One particular case is Qt4 conflicting on UChar32. + */ +#include +struct UConverter; +typedef struct UConverter UConverter; +#ifdef _MSC_VER +typedef wchar_t UChar; +#else +typedef uint16_t UChar; #endif +#endif +#endif +#endif #ifdef __cplusplus extern "C" { #endif @@ -125,6 +143,13 @@ * Block defining the handlers for non UTF-8 encodings. * If iconv is supported, there are two extra fields. */ +#ifdef LIBXML_ICU_ENABLED +struct _uconv_t { + UConverter *uconv; /* for conversion between an encoding and UTF-16 */ + UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ +}; +typedef struct _uconv_t uconv_t; +#endif typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; @@ -136,6 +161,10 @@ iconv_t iconv_in; iconv_t iconv_out; #endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + uconv_t *uconv_in; + uconv_t *uconv_out; +#endif /* LIBXML_ICU_ENABLED */ }; #ifdef __cplusplus Index: include/libxml/parser.h =================================================================== --- include/libxml/parser.h (revision 3733) +++ include/libxml/parser.h (working copy) @@ -276,6 +276,7 @@ int nsNr; /* the number of inherited namespaces */ int nsMax; /* the size of the arrays */ const xmlChar * *nsTab; /* the array of prefix/namespace name */ + struct _xmlParserCtxt *nsParent; /* parent context to inherit namespaces from */ int *attallocs; /* which attribute were allocated */ void * *pushTab; /* array of data for push */ xmlHashTablePtr attsDefault; /* defaulted attributes if any */ @@ -1207,6 +1208,7 @@ XML_WITH_DEBUG_MEM = 29, XML_WITH_DEBUG_RUN = 30, XML_WITH_ZLIB = 31, + XML_WITH_ICU = 32, XML_WITH_NONE = 99999 /* just to be sure of allocation size */ } xmlFeature; @@ -1217,4 +1219,3 @@ } #endif #endif /* __XML_PARSER_H__ */ - Index: include/libxml/xmlversion.h.in =================================================================== --- include/libxml/xmlversion.h.in (revision 3733) +++ include/libxml/xmlversion.h.in (working copy) @@ -269,6 +269,15 @@ #endif /** + * LIBXML_ICU_ENABLED: + * + * Whether icu support is available + */ +#if @WITH_ICU@ +#define LIBXML_ICU_ENABLED +#endif + +/** * LIBXML_ISO8859X_ENABLED: * * Whether ISO-8859-* support is made available in case iconv is not Index: parser.c =================================================================== --- parser.c (revision 3733) +++ parser.c (working copy) @@ -816,6 +816,12 @@ #else return(0); #endif + case XML_WITH_ICU: +#ifdef LIBXML_ICU_ENABLED + return(1); +#else + return(0); +#endif default: break; } @@ -7705,6 +7711,7 @@ return(NULL); return(ctxt->nsTab[i + 1]); } + if (ctxt->nsParent) return xmlGetNamespace(ctxt->nsParent, prefix); return(NULL); } @@ -11948,6 +11955,8 @@ ctxt->str_xmlns = xmlDictLookup(ctxt->dict, BAD_CAST "xmlns", 5); ctxt->str_xml_ns = xmlDictLookup(ctxt->dict, XML_XML_NAMESPACE, 36); + ctxt->nsParent = oldctxt; + oldsax = ctxt->sax; ctxt->sax = oldctxt->sax; xmlDetectSAX2(ctxt); Index: win32/Makefile.msvc =================================================================== --- win32/Makefile.msvc (revision 3733) +++ win32/Makefile.msvc (working copy) @@ -71,6 +71,9 @@ !if "$(WITH_ICONV)" == "1" LIBS = $(LIBS) iconv.lib !endif ++!if "$(WITH_ICU)" == "1" ++LIBS = $(LIBS) icuuc.lib ++!endif !if "$(WITH_ZLIB)" == "1" LIBS = $(LIBS) zdll.lib !endif Index: win32/configure.js =================================================================== --- win32/configure.js (revision 3733) +++ win32/configure.js (working copy) @@ -40,6 +40,7 @@ var withXptr = true; var withXinclude = true; var withIconv = true; +var withIcu = false; var withIso8859x = false; var withZlib = false; var withDebug = true; @@ -124,6 +125,7 @@ txt += " xptr: Enable XPointer support (" + (withXptr? "yes" : "no") + ")\n"; txt += " xinclude: Enable XInclude support (" + (withXinclude? "yes" : "no") + ")\n"; txt += " iconv: Enable iconv support (" + (withIconv? "yes" : "no") + ")\n"; + txt += " icu: Enable icu support (" + (withIcu? "yes" : "no") + ")\n"; txt += " iso8859x: Enable ISO8859X support (" + (withIso8859x? "yes" : "no") + ")\n"; txt += " zlib: Enable zlib support (" + (withZlib? "yes" : "no") + ")\n"; txt += " xml_debug: Enable XML debbugging module (" + (withDebug? "yes" : "no") + ")\n"; @@ -233,6 +235,7 @@ vf.WriteLine("WITH_XPTR=" + (withXptr? "1" : "0")); vf.WriteLine("WITH_XINCLUDE=" + (withXinclude? "1" : "0")); vf.WriteLine("WITH_ICONV=" + (withIconv? "1" : "0")); + vf.WriteLine("WITH_ICU=" + (withIcu? "1" : "0")); vf.WriteLine("WITH_ISO8859X=" + (withIso8859x? "1" : "0")); vf.WriteLine("WITH_ZLIB=" + (withZlib? "1" : "0")); vf.WriteLine("WITH_DEBUG=" + (withDebug? "1" : "0")); @@ -319,6 +322,8 @@ of.WriteLine(s.replace(/\@WITH_XINCLUDE\@/, withXinclude? "1" : "0")); } else if (s.search(/\@WITH_ICONV\@/) != -1) { of.WriteLine(s.replace(/\@WITH_ICONV\@/, withIconv? "1" : "0")); + } else if (s.search(/\@WITH_ICU\@/) != -1) { + of.WriteLine(s.replace(/\@WITH_ICU\@/, withIcu? "1" : "0")); } else if (s.search(/\@WITH_ISO8859X\@/) != -1) { of.WriteLine(s.replace(/\@WITH_ISO8859X\@/, withIso8859x? "1" : "0")); } else if (s.search(/\@WITH_ZLIB\@/) != -1) { @@ -462,6 +467,8 @@ withXinclude = strToBool(arg.substring(opt.length + 1, arg.length)); else if (opt == "iconv") withIconv = strToBool(arg.substring(opt.length + 1, arg.length)); + else if (opt == "icu") + withIcu = strToBool(arg.substring(opt.length + 1, arg.length)); else if (opt == "iso8859x") withIso8859x = strToBool(arg.substring(opt.length + 1, arg.length)); else if (opt == "zlib") @@ -611,7 +618,13 @@ makefile = ".\\Makefile.mingw"; else if (compiler == "bcb") makefile = ".\\Makefile.bcb"; -fso.CopyFile(makefile, ".\\Makefile", true); +var new_makefile = ".\\Makefile"; +var f = fso.FileExists(new_makefile); +if (f) { + var t = fso.GetFile(new_makefile); + t.Attributes =0; +} +fso.CopyFile(makefile, new_makefile, true); WScript.Echo("Created Makefile."); // Create the config.h. var confighsrc = "..\\include\\win32config.h"; @@ -640,6 +653,7 @@ txtOut += " XPointer support: " + boolToStr(withXptr) + "\n"; txtOut += " XInclude support: " + boolToStr(withXinclude) + "\n"; txtOut += " iconv support: " + boolToStr(withIconv) + "\n"; +txtOut += " icu support: " + boolToStr(withIcu) + "\n"; txtOut += " iso8859x support: " + boolToStr(withIso8859x) + "\n"; txtOut += " zlib support: " + boolToStr(withZlib) + "\n"; txtOut += " Debugging module: " + boolToStr(withDebug) + "\n"; Index: xmlmodule.c =================================================================== --- xmlmodule.c (revision 3733) +++ xmlmodule.c (working copy) @@ -300,7 +300,7 @@ static void * xmlModulePlatformOpen(const char *name) { - return LoadLibrary(name); + return LoadLibraryA(name); } /* Index: xmlregexp.c =================================================================== --- xmlregexp.c (revision 3733) +++ xmlregexp.c (working copy) @@ -6464,7 +6464,7 @@ if (name != NULL) { value += 30 * (*name); while ((ch = *name++) != 0) { - value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); + value = value ^ ((value << 5) + (value >> 3) + (unsigned short)ch); } } return (value);