Closed michaelrsweet closed 13 years ago
Original reporter:
mxml-native-encoding.patch contains only the native-encoding changes. It works in an EBCDIC environment, which is important because libxml2 does not and cannot.
It just uses native encoding, no translation, so it actually works.
The patch does not yet add a --with-native-encoding option to autoconf to set ENABLE_NATIVE_ENCODING in config.h.
Original reporter:
I posted the latest diff against trunk.
Builds on Linux-2.6 (Intel, ppc), Linux-2.4, Windows(MSVC+mingw), Solaris-8, Solaris-10 (intel, sparc), AIX-5.3, AIX-6.1, OS/390.
It supports cross-compiling, "make dist", "make check" and more.
It uses libtool and automake; any other approach would make this much harder.
Original reporter: Michael Sweet
Moving to Future release; after looking at the changes it is unlikely I'll be adding them to the regular release anytime soon, sorry...
Original reporter:
What will you add? Is the native support acceptable?
Original reporter: Michael Sweet
At this point even the native support is not acceptable due to the changes involved.
"mxml-native-encoding.patch":
Index: mxml-file.c
===================================================================
--- mxml-file.c (revision 406)
+++ mxml-file.c (working copy)
@@ -68,13 +71,18 @@
#define ENCODE_UTF8 0 /* UTF-8 */
#define ENCODE_UTF16BE 1 /* UTF-16 Big-Endian */
#define ENCODE_UTF16LE 2 /* UTF-16 Little-Endian */
+#define ENCODE_NATIVE 3 /* Native encoding, no conversion */
/*
* Macro to test for a bad XML character...
*/
+#if ' ' == 0x20 /* ASCII */
#define mxml_bad_char(ch) ((ch) < ' ' && (ch) != '\n' && (ch) != '\r' && (ch) != '\t')
+#else
+#define mxml_bad_char(ch) (!isprint(ch) && (ch) != '\n' && (ch) != '\r' && (ch) != '\t')
+#endif
/*
@@ -98,7 +106,7 @@
*/
static int mxml_add_char(int ch, char **ptr, char **buffer,
- int *bufsize);
+ int *bufsize, int *encoding);
static int mxml_fd_getc(void *p, int *encoding);
static int mxml_fd_putc(int ch, void *p);
static int mxml_fd_read(_mxml_fdbuf_t *buf);
@@ -628,7 +636,8 @@
mxml_add_char(int ch, /* I - Character to add */
char **bufptr, /* IO - Current position in buffer */
char **buffer, /* IO - Current buffer */
- int *bufsize) /* IO - Current buffer size */
+ int *bufsize, /* IO - Current buffer size */
+ int *encoding) /* I - Encoding */
{
char *newbuffer; /* New buffer value */
@@ -657,7 +666,7 @@
*buffer = newbuffer;
}
- if (ch < 0x80)
+ if (ch < 0x80 || *encoding == ENCODE_NATIVE)
{
/*
* Single byte ASCII...
@@ -727,12 +736,13 @@
switch (*encoding)
{
+ case ENCODE_NATIVE :
case ENCODE_UTF8 :
/*
* Got a UTF-8 character; convert UTF-8 to Unicode and return...
*/
- if (!(ch & 0x80))
+ if (!(ch & 0x80) || *encoding == ENCODE_NATIVE)
{
#if DEBUG > 1
printf("mxml_fd_getc: %c (0x%04x)\n", ch < ' ' ? '.' : ch, ch);
@@ -1145,12 +1155,13 @@
switch (*encoding)
{
+ case ENCODE_NATIVE :
case ENCODE_UTF8 :
/*
* Got a UTF-8 character; convert UTF-8 to Unicode and return...
*/
- if (!(ch & 0x80))
+ if (!(ch & 0x80) || *encoding == ENCODE_NATIVE)
{
if (mxml_bad_char(ch))
{
@@ -1366,7 +1377,7 @@
entptr = entity;
while ((ch = (*getc_cb)(p, encoding)) != EOF)
- if (ch > 126 || (!isalnum(ch) && ch != '#'))
+ if ((*encoding != ENCODE_NATIVE && ch > 126) || (!isalnum(ch) && ch != '#'))
break;
else if (entptr < (entity + sizeof(entity) - 1))
*entptr++ = ch;
@@ -1459,7 +1470,11 @@
parent = top;
first = NULL;
whitespace = 0;
+#ifdef ENABLE_NATIVE_ENCODING
+ encoding = ENCODE_NATIVE;
+#else
encoding = ENCODE_UTF8;
+#endif
if (cb && parent)
type = (*cb)(parent);
@@ -1604,10 +1619,10 @@
if ((ch = mxml_get_entity(parent, p, &encoding, getc_cb)) == EOF)
goto error;
- if (mxml_add_char(ch, &bufptr, &buffer, &bufsize))
+ if (mxml_add_char(ch, &bufptr, &buffer, &bufsize, &encoding))
goto error;
}
- else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize))
+ else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize, &encoding))
goto error;
else if (((bufptr - buffer) == 1 && buffer[0] == '?') ||
((bufptr - buffer) == 3 && !strncmp(buffer, "!--", 3)) ||
@@ -1627,7 +1642,7 @@
if (ch == '>' && bufptr > (buffer + 4) &&
bufptr[-3] != '-' && bufptr[-2] == '-' && bufptr[-1] == '-')
break;
- else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize))
+ else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize, &encoding))
goto error;
}
@@ -1684,7 +1699,7 @@
{
if (ch == '>' && !strncmp(bufptr - 2, "]]", 2))
break;
- else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize))
+ else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize, &encoding))
goto error;
}
@@ -1741,7 +1756,7 @@
{
if (ch == '>' && bufptr > buffer && bufptr[-1] == '?')
break;
- else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize))
+ else if (mxml_add_char(ch, &bufptr, &buffer, &bufsize, &encoding))
goto error;
}
@@ -1814,7 +1829,7 @@
if ((ch = mxml_get_entity(parent, p, &encoding, getc_cb)) == EOF)
goto error;
- if (mxml_add_char(ch, &bufptr, &buffer, &bufsize))
+ if (mxml_add_char(ch, &bufptr, &buffer, &bufsize, &encoding))
goto error;
}
}
@@ -1989,7 +2004,7 @@
if ((ch = mxml_get_entity(parent, p, &encoding, getc_cb)) == EOF)
goto error;
- if (mxml_add_char(ch, &bufptr, &buffer, &bufsize))
+ if (mxml_add_char(ch, &bufptr, &buffer, &bufsize, &encoding))
goto error;
}
else if (type == MXML_OPAQUE || type == MXML_CUSTOM || !mxml_isspace(ch))
@@ -1998,7 +2013,7 @@
* Add character to current buffer...
*/
- if (mxml_add_char(ch, &bufptr, &buffer, &bufsize))
+ if (mxml_add_char(ch, &bufptr, &buffer, &bufsize, &encoding))
goto error;
}
}
@@ -2159,7 +2174,7 @@
if ((ch = mxml_get_entity(node, p, encoding, getc_cb)) == EOF)
goto error;
- if (mxml_add_char(ch, &ptr, &name, &namesize))
+ if (mxml_add_char(ch, &ptr, &name, &namesize, encoding))
goto error;
if (ch == quote)
@@ -2182,7 +2197,7 @@
if ((ch = mxml_get_entity(node, p, encoding, getc_cb)) == EOF)
goto error;
- if (mxml_add_char(ch, &ptr, &name, &namesize))
+ if (mxml_add_char(ch, &ptr, &name, &namesize, encoding))
goto error;
}
}
@@ -2228,7 +2243,7 @@
if ((ch = mxml_get_entity(node, p, encoding, getc_cb)) == EOF)
goto error;
- if (mxml_add_char(ch, &ptr, &value, &valsize))
+ if (mxml_add_char(ch, &ptr, &value, &valsize, encoding))
goto error;
}
@@ -2252,7 +2267,7 @@
if ((ch = mxml_get_entity(node, p, encoding, getc_cb)) == EOF)
goto error;
- if (mxml_add_char(ch, &ptr, &value, &valsize))
+ if (mxml_add_char(ch, &ptr, &value, &valsize, encoding))
goto error;
}
@@ -2343,8 +2358,9 @@
switch (*encoding)
{
+ case ENCODE_NATIVE :
case ENCODE_UTF8 :
- if (!(ch & 0x80))
+ if (!(ch & 0x80) || *encoding == ENCODE_NATIVE)
{
#if DEBUG > 1
printf("mxml_string_getc: %c (0x%04x)\n", ch < ' ' ? '.' : ch, ch);
Version: -feature Original reporter:
Hello,
I've just found out that libxml2 cannot work in an EBCDIC environment. The reason is that the library assumes its source is encoded in ASCII, so that it can use '<' == UTF_CHAR, while in that environment '<' is in fact encoded in EBCDIC.
The same goes for mxml; however, it is a much simpler library, so I added an ENCODE_NATIVE encoding, which does not perform any conversion. And it works now.
In order to support cross-compiling, Windows mingw, proper "make check", "make dist" and "make distcheck", and a separate builddir, I also converted the proprietary Makefile to automake.
I also used libtool to create the libraries, which is much simpler and more generic than the manual attempt.
Then I reorganized the autoconf setup to finish the rewrite.
I also removed the generated files, there is no reason to store them in subversion.
The remaining work is to move the library files into src/lib and the tools into src/tools to make it even more organized.
Please review, I will be happy to work with you in order to merge this work.
I hope I got this right.