diff options
Diffstat (limited to 'xml_parser.c')
-rw-r--r-- | xml_parser.c | 1183 |
1 files changed, 1183 insertions, 0 deletions
diff --git a/xml_parser.c b/xml_parser.c new file mode 100644 index 00000000..dfec5a78 --- /dev/null +++ b/xml_parser.c @@ -0,0 +1,1183 @@ +/* + * parser.c : an XML 1.0 non-verifying parser + * + * See Copyright for the status of this software. + * + * $Id$ + */ + +#include <config.h> +#include <stdio.h> +#include <ctype.h> +#include <string.h> /* for memset() only */ +#include <malloc.h> +#include <sys/stat.h> +#ifdef HAVE_FCNTL_H +#include <fcntl.h> +#endif +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif +#ifdef HAVE_ZLIB_H +#include <zlib.h> +#endif + +#include "xml_tree.h" +#include "xml_parser.h" +#include "xml_entities.h" + +/* + * A few macros needed to help building the parser. + */ + +#ifdef UNICODE +/* + * UNICODE version of the macros. Incomplete now TODO !!!! + */ +#define IS_CHAR(c) \ + (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \ + (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) + +#define SKIP_BLANKS(p) \ + while ((*(p) == 0x20) || (*(p) == 0x09) || (*(p) == 0xa) || \ + (*(p) == 0x3000)) (p)++; + +/* I'm too lazy to complete this one TODO !!!! */ +#define IS_BASECHAR(c) \ + ((((c) >= 0x41) && ((c) <= 0x5a)) || \ + (((c) >= 0x61) && ((c) <= 0x7a)) || \ + (((c) >= 0xaa) && ((c) <= 0x5b)) || \ + (((c) >= 0xc0) && ((c) <= 0xd6)) || \ + (((c) >= 0xd8) && ((c) <= 0xf6)) || \ + (((c) >= 0xf8) && ((c) <= 0xff)) || \ + ((c) == 0xba)) + +/* I'm too lazy to complete this one TODO !!!! */ +#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39)) + +/* I'm too lazy to complete this one TODO !!!! */ +#define IS_COMBINING(c) 0 + +#define IS_IGNORABLE(c) \ + ((((c) >= 0x200c) && ((c) <= 0x200f)) || \ + (((c) >= 0x202a) && ((c) <= 0x202e)) || \ + (((c) >= 0x206a) && ((c) <= 0x206f)) || \ + ((c) == 0xfeff)) + +#define IS_EXTENDER(c) \ + (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \ + ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \ + ((c) == 0xec6) || ((c) == 0x3005) \ + (((c) >= 0x3031) && ((c) <= 0x3035)) || \ + (((c) >= 0x309b) && ((c) <= 0x309e)) || \ + (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \ + (((c) >= 0xff70) && ((c) <= 0xff9e)) || \ + ((c) == 0xff9f)) + +#define IS_IDEOGRAPHIC(c) \ + ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \ + (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \ + (((c) >= 0x3021) && ((c) <= 0x3029)) || \ + ((c) == 0x3007)) + +#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c)) + +/* I'm too lazy to complete this one ! */ +#define IS_BLANK(c) (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa)) +#else +/* + * 8bits / ASCII version of the macros. + */ +#define IS_CHAR(c) \ + (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20)) + +#define IS_BASECHAR(c) \ + ((((c) >= 0x41) && ((c) <= 0x5a)) || \ + (((c) >= 0x61) && ((c) <= 0x7a)) || \ + (((c) >= 0xaa) && ((c) <= 0x5b)) || \ + (((c) >= 0xc0) && ((c) <= 0xd6)) || \ + (((c) >= 0xd8) && ((c) <= 0xf6)) || \ + (((c) >= 0xf8) && ((c) <= 0xff)) || \ + ((c) == 0xba)) + +#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39)) + +#define IS_LETTER(c) IS_BASECHAR(c) + +#define IS_COMBINING(c) 0 + +#define IS_IGNORABLE(c) 0 + +#define IS_EXTENDER(c) ((c) == 0xb7) + +#define IS_BLANK(c) (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa)) +#endif + + +#define SKIP_EOL(p) \ + if (*(p) == 0x13) { p++ ; if (*(p) == 0x10) p++; } \ + if (*(p) == 0x10) { p++ ; if (*(p) == 0x13) p++; } + +#define SKIP_BLANKS(p) \ + while (IS_BLANK(*(p))) (p)++; + +#define MOVETO_ENDTAG(p) \ + while (IS_CHAR(*p) && (*(p) != '>')) (p)++; + +#define MOVETO_STARTTAG(p) \ + while (IS_CHAR(*p) && (*(p) != '<')) (p)++; + +/* + * Forward definition for recusive behaviour. + */ +xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt); + +/* + * xmlHandleData : this routine represent's the specific application + * behaviour when reading a piece of text. + * + * For example in WebDav, any piece made only of blanks is eliminated + */ + +CHAR *xmlHandleData(CHAR *in) { + CHAR *cur; + + if (in == NULL) return(NULL); + cur = in; + while (IS_CHAR(*cur)) { + if (!IS_BLANK(*cur)) goto not_blank; + cur++; + } + free(in); + return(NULL); + +not_blank: + return(in); +} + +/* + * xmlStrndup : a strdup for array of CHAR's + */ + +CHAR *xmlStrndup(const CHAR *cur, int len) { + CHAR *ret = malloc((len + 1) * sizeof(CHAR)); + + if (ret == NULL) { + fprintf(stderr, "malloc of %d byte failed\n", + (len + 1) * sizeof(CHAR)); + return(NULL); + } + memcpy(ret, cur, len * sizeof(CHAR)); + ret[len] = 0; + return(ret); +} + +/* + * xmlStrdup : a strdup for CHAR's + */ + +CHAR *xmlStrdup(const CHAR *cur) { + const CHAR *p = cur; + + while (IS_CHAR(*p)) p++; + return(xmlStrndup(cur, p - cur)); +} + +/* + * xmlStrcmp : a strcmp for CHAR's + */ + +int xmlStrcmp(const CHAR *str1, const CHAR *str2) { + register int tmp; + + do { + tmp = *str1++ - *str2++; + if (tmp != 0) return(tmp); + } while ((*str1 != 0) && (*str2 != 0)); + return (*str1 - *str2); +} + +/* + * xmlStrncmp : a strncmp for CHAR's + */ + +int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) { + register int tmp; + + if (len <= 0) return(0); + do { + tmp = *str1++ - *str2++; + if (tmp != 0) return(tmp); + len--; + if (len <= 0) return(0); + } while ((*str1 != 0) && (*str2 != 0)); + return (*str1 - *str2); +} + +/* + * xmlStrchr : a strchr for CHAR's + */ + +CHAR *xmlStrchr(const CHAR *str, CHAR val) { + while (*str != 0) { + if (*str == val) return((CHAR *) str); + str++; + } + return(NULL); +} + +/* + * xmlParseName : parse an XML name. + */ + +CHAR *xmlParseName(xmlParserCtxtPtr ctxt) { + const CHAR *q; + CHAR *ret = NULL; + + /* + * Name ::= (Letter | '_') (NameChar)* + */ + if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL); + q = ctxt->cur++; + while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) || + (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || (ctxt->cur[0] == '_') || + (ctxt->cur[0] == ':') || + (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) || + (IS_EXTENDER(ctxt->cur[0]))) + ctxt->cur++; + + ret = xmlStrndup(q, ctxt->cur - q); + + return(ret); +} + +/* + * Parse and return a string between quotes or doublequotes + */ +CHAR *xmlParseQuotedString(xmlParserCtxtPtr ctxt) { + CHAR *ret = NULL; + const CHAR *q; + + if (ctxt->cur[0] == '"') { + ctxt->cur++; + q = ctxt->cur; + while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '"')) ctxt->cur++; + if (ctxt->cur[0] != '"') + fprintf(stderr, "String not closed \"%.50s\n", q); + else { + ret = xmlStrndup(q, ctxt->cur - q); + ctxt->cur++; + } + } else if (ctxt->cur[0] == '\''){ + ctxt->cur++; + q = ctxt->cur; + while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '\'')) ctxt->cur++; + if (ctxt->cur[0] != '\'') + fprintf(stderr, "String not closed '%.50s\n", q); + else { + ret = xmlStrndup(q, ctxt->cur - q); + ctxt->cur++; + } + } + return(ret); +} + +/* + * Skip an XML (SGML) comment <!-- .... --> + * + * TODO !!!! Save the comment in the tree !!! + */ +void xmlParserSkipComment(xmlParserCtxtPtr ctxt) { + const CHAR *q, *start; + const CHAR *r; + + /* + * An extra check may avoid errors and isn't that costly ! + */ + if ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '!') || + (ctxt->cur[2] != '-') || (ctxt->cur[3] != '-')) return; + + ctxt->cur += 4; + start = q = ctxt->cur; + ctxt->cur++; + r = ctxt->cur; + ctxt->cur++; + while (IS_CHAR(ctxt->cur[0]) && + ((ctxt->cur[0] == ':') || (ctxt->cur[0] != '>') || + (*r != '-') || (*q != '-'))) { + ctxt->cur++;r++;q++; + } + if (!IS_CHAR(ctxt->cur[0])) { + fprintf(stderr, "Comment not terminated <!--%.50s\n", start); + ctxt->cur = start; /* !!! We shouldn't really try to recover !!! */ + } else { + ctxt->cur++; + } +} + +/* + * xmlParseNamespace: parse specific '<?namespace ...' constructs. + */ + +void xmlParseNamespace(xmlParserCtxtPtr ctxt) { + CHAR *href = NULL; + CHAR *AS = NULL; + int garbage = 0; + + /* + * We just skipped "namespace" or "xml:namespace" + */ + SKIP_BLANKS(ctxt->cur); + + while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '>')) { + /* + * We can have "ns" or "prefix" attributes + * Old encoding as 'href' or 'AS' attributes is still supported + */ + if ((ctxt->cur[0] == 'n') && (ctxt->cur[1] == 's')) { + garbage = 0; + ctxt->cur += 2; + SKIP_BLANKS(ctxt->cur); + + if (ctxt->cur[0] != '=') continue; + ctxt->cur++; + SKIP_BLANKS(ctxt->cur); + + href = xmlParseQuotedString(ctxt); + SKIP_BLANKS(ctxt->cur); + } else if ((ctxt->cur[0] == 'h') && (ctxt->cur[1] == 'r') && + (ctxt->cur[2] == 'e') && (ctxt->cur[3] == 'f')) { + garbage = 0; + ctxt->cur += 4; + SKIP_BLANKS(ctxt->cur); + + if (ctxt->cur[0] != '=') continue; + ctxt->cur++; + SKIP_BLANKS(ctxt->cur); + + href = xmlParseQuotedString(ctxt); + SKIP_BLANKS(ctxt->cur); + } else if ((ctxt->cur[0] == 'p') && (ctxt->cur[1] == 'r') && + (ctxt->cur[2] == 'e') && (ctxt->cur[3] == 'f') && + (ctxt->cur[4] == 'i') && (ctxt->cur[5] == 'x')) { + garbage = 0; + ctxt->cur += 6; + SKIP_BLANKS(ctxt->cur); + + if (ctxt->cur[0] != '=') continue; + ctxt->cur++; + SKIP_BLANKS(ctxt->cur); + + AS = xmlParseQuotedString(ctxt); + SKIP_BLANKS(ctxt->cur); + } else if ((ctxt->cur[0] == 'A') && (ctxt->cur[1] == 'S')) { + garbage = 0; + ctxt->cur += 2; + SKIP_BLANKS(ctxt->cur); + + if (ctxt->cur[0] != '=') continue; + ctxt->cur++; + SKIP_BLANKS(ctxt->cur); + + AS = xmlParseQuotedString(ctxt); + SKIP_BLANKS(ctxt->cur); + } else if ((ctxt->cur[0] == '?') && (ctxt->cur[1] == '>')) { + garbage = 0; + ctxt->cur ++; + } else { + /* + * Found garbage when parsing the namespace + */ + if (!garbage) fprintf(stderr, + "\nxmlParseNamespace found garbage: "); + fprintf(stderr, "%c", ctxt->cur[0]); + ctxt->cur++; + } + } + + MOVETO_ENDTAG(ctxt->cur); + ctxt->cur++; + + /* + * Register the DTD. + */ + if (href != NULL) + xmlNewDtd(ctxt->doc, href, AS); + + if (AS != NULL) free(AS); + if (href != NULL) free(href); +} + +/* + * xmlParsePI: parse an XML Processing Instruction. + */ + +void xmlParsePI(xmlParserCtxtPtr ctxt) { + if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) { + /* + * this is a Processing Instruction. + */ + ctxt->cur += 2; + + /* + * Special for WebDav, support for the Processing Instruction + * '<?namespace ...' contruct in the header of the XML document. + */ + if ((ctxt->cur[0] == 'n') && (ctxt->cur[1] == 'a') && + (ctxt->cur[2] == 'm') && (ctxt->cur[3] == 'e') && + (ctxt->cur[4] == 's') && (ctxt->cur[5] == 'p') && + (ctxt->cur[6] == 'a') && (ctxt->cur[7] == 'c') && + (ctxt->cur[8] == 'e')) { + ctxt->cur += 9; + xmlParseNamespace(ctxt); + } else if ((ctxt->cur[0] == 'x') && (ctxt->cur[1] == 'm') && + (ctxt->cur[2] == 'l') && (ctxt->cur[3] == ':') && + (ctxt->cur[4] == 'n') && (ctxt->cur[5] == 'a') && + (ctxt->cur[6] == 'm') && (ctxt->cur[7] == 'e') && + (ctxt->cur[8] == 's') && (ctxt->cur[9] == 'p') && + (ctxt->cur[10] == 'a') && (ctxt->cur[11] == 'c') && + (ctxt->cur[12] == 'e')) { + ctxt->cur += 13; + xmlParseNamespace(ctxt); + } else { + /* Unknown PI, ignore it ! */ + fprintf(stderr, "xmlParsePI : skipping unknown PI %30s\n", + ctxt->cur); + MOVETO_ENDTAG(ctxt->cur); + ctxt->cur++; + } + } +} + +/* + * xmlParseAttribute: parse a start of tag. + * + * Attribute ::= Name Eq AttValue + */ + +void xmlParseAttribute(xmlParserCtxtPtr ctxt, xmlNodePtr node) { + const CHAR *q; + CHAR *name, *value = NULL; + + if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) { + return; + } + q = ctxt->cur++; + while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) || + (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || + (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') || + (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) || + (IS_EXTENDER(ctxt->cur[0]))) + ctxt->cur++; + name = xmlStrndup(q, ctxt->cur - q); + + /* + * We should have the equal, we are laxist here and allow attributes + * without values and extra spaces. + */ + SKIP_BLANKS(ctxt->cur); + if (ctxt->cur[0] == '=') { + ctxt->cur++; + SKIP_BLANKS(ctxt->cur); + if ((ctxt->cur[0] != '\'') && (ctxt->cur[0] != '"')) { + fprintf(stderr, "Quotes were expected for attribute value %.20s\n", + q); + } else + value = xmlParseQuotedString(ctxt); + } + + /* + * Add the attribute to the node. + */ + if (name != NULL) { + xmlNewProp(node, name, value); + free(name); + } + if ( value != NULL ) + free(value); +} + +/* + * xmlParseStartTag: parse a start of tag. + */ + +xmlNodePtr xmlParseStartTag(xmlParserCtxtPtr ctxt) { + const CHAR *q; + CHAR *ns, *name; + xmlDtdPtr dtd = NULL; + xmlNodePtr ret = NULL; + + /* + * Theorically one should just parse a Name, but with the addition + * of the namespace needed for WebDav, it's a bit more complicated + * since the element name may be prefixed by a namespace prefix. + * + * QName ::= (NSPart ':')? LocalPart + * NSPart ::= Name + * LocalPart ::= Name + * STag ::= '<' QName (S Attribute)* S? '>' + * + * instead of : + * + * STag ::= '<' QName (S Attribute)* S? '>' + */ + if (ctxt->cur[0] != '<') return(NULL); + ctxt->cur++; + + if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL); + q = ctxt->cur++; + while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) || + (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || + (ctxt->cur[0] == '_') || + (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) || + (IS_EXTENDER(ctxt->cur[0]))) + ctxt->cur++; + + if (ctxt->cur[0] == ':') { + ns = xmlStrndup(q, ctxt->cur - q); + + ctxt->cur++; /* skip the column */ + if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) { + fprintf(stderr, + "Start tag : no element name after namespace identifier %.20s\n", + q); + free(ns); + return(NULL); + } + q = ctxt->cur++; + while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) || + (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || + (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') || + (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) || + (IS_EXTENDER(ctxt->cur[0]))) + ctxt->cur++; + name = xmlStrndup(q, ctxt->cur - q); + + /* + * Search the DTD associated to ns. + */ + dtd = xmlSearchDtd(ctxt->doc, ns); + if (dtd == NULL) + fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns); + free(ns); + } else + name = xmlStrndup(q, ctxt->cur - q); + + ret = xmlNewNode(dtd, name, NULL); + + /* + * Now parse the attributes, it ends up with the ending + * + * (S Attribute)* S? + */ + SKIP_BLANKS(ctxt->cur); + while ((IS_CHAR(ctxt->cur[0])) && + (ctxt->cur[0] != '>') && + ((ctxt->cur[0] != '/') || (ctxt->cur[1] != '>'))) { + if (IS_LETTER(ctxt->cur[0]) || (ctxt->cur[0] == '_')) + xmlParseAttribute(ctxt, ret); + else { + /* We should warn TODO !!! */ + ctxt->cur++; + } + SKIP_BLANKS(ctxt->cur); + } + + return(ret); +} + +/* + * xmlParseEndTag: parse an end of tag, note that the '</' part has + * already been read. + */ + +void xmlParseEndTag(xmlParserCtxtPtr ctxt, xmlDtdPtr *dtdPtr, CHAR **tagPtr) { + const CHAR *q; + CHAR *ns, *name; + xmlDtdPtr dtd = NULL; + + *dtdPtr = NULL; + *tagPtr = NULL; + + /* + * Theorically one should just parse a Name, but with the addition + * of the namespace needed for WebDav, it's a bit more complicated + * since the element name may be prefixed by a namespace prefix. + * + * QName ::= (NSPart ':')? LocalPart + * NSPart ::= Name + * LocalPart ::= Name + * ETag ::= '</' QName S? '>' + * + * instead of : + * + * ETag ::= '</' Name S? '>' + */ + if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return; + q = ctxt->cur++; + while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) || + (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || + (ctxt->cur[0] == '_') || + (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) || + (IS_EXTENDER(ctxt->cur[0]))) + ctxt->cur++; + + if (ctxt->cur[0] == ':') { + ns = xmlStrndup(q, ctxt->cur - q); + + ctxt->cur++; /* skip the column */ + if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) { + fprintf(stderr, + "End tag : no element name after namespace identifier %.20s\n", + q); + free(ns); + return; + } + q = ctxt->cur++; + while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) || + (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || + (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') || + (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) || + (IS_EXTENDER(ctxt->cur[0]))) + ctxt->cur++; + name = xmlStrndup(q, ctxt->cur - q); + + /* + * Search the DTD associated to ns. + */ + dtd = xmlSearchDtd(ctxt->doc, ns); + if (dtd == NULL) + fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns); + free(ns); + } else + name = xmlStrndup(q, ctxt->cur - q); + + *dtdPtr = dtd; + *tagPtr = name; + + /* + * We should definitely be at the ending "S? '>'" part + */ + SKIP_BLANKS(ctxt->cur); + if ((!IS_CHAR(ctxt->cur[0])) || (ctxt->cur[0] != '>')) { + fprintf(stderr, "End tag : expected '>', got %.20s\n", ctxt->cur); + /* + * Note : skipping to the next '>' is probably otherkill, + * especially in case the '>' is hust missing. + * + * Otherwise add: + * MOVETO_ENDTAG(ctxt->cur); + */ + } else + ctxt->cur++; + + return; +} + +/* + * xmlParseCDSect: escaped pure raw content. + */ +CHAR *xmlParseCDSect(xmlParserCtxtPtr ctxt) { + const CHAR *r, *s, *base; + CHAR *ret; + + base = ctxt->cur; + if (!IS_CHAR(ctxt->cur[0])) { + fprintf(stderr, "CData section not finished : %.20s\n", base); + return(NULL); + } + r = ctxt->cur++; + if (!IS_CHAR(ctxt->cur[0])) { + fprintf(stderr, "CData section not finished : %.20s\n", base); + return(NULL); + } + s = ctxt->cur++; + while (IS_CHAR(ctxt->cur[0]) && + ((*r != ']') || (*s != ']') || (ctxt->cur[0] != '>'))) { + r++;s++;ctxt->cur++; + } + if (!IS_CHAR(ctxt->cur[0])) { + fprintf(stderr, "CData section not finished : %.20s\n", base); + return(NULL); + } + ret = xmlStrndup(base, ctxt->cur-base); + + return(ret); +} + +/* + * xmlParseContent: a content is + * (element | PCData | Reference | CDSect | PI | Comment) + * + * element : starts by '<' + * PCData : any CHAR but '&' or '<' + * Reference : starts by '&' + * CDSect : starts by '<![CDATA[' + * PI : starts by '<?' + */ + +xmlNodePtr xmlParseContent(xmlParserCtxtPtr ctxt, xmlNodePtr node) { + const CHAR *q; + CHAR *data = NULL; + xmlNodePtr ret = NULL; + + /* + * First case : a Processing Instruction. + */ + if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) { + xmlParsePI(ctxt); + } + /* + * Second case : a CDSection + */ + if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') && + (ctxt->cur[2] == '[') && (ctxt->cur[3] == 'C') && + (ctxt->cur[4] == 'D') && (ctxt->cur[5] == 'A') && + (ctxt->cur[6] == 'T') && (ctxt->cur[7] == 'A') && + (ctxt->cur[8] == '[')) { + ctxt->cur += 9; + data = xmlParseCDSect(ctxt); + } + /* + * Third case : a sub-element. + */ + else if (ctxt->cur[0] == '<') { + ret = xmlParseElement(ctxt); + } + /* + * Last case, text. Note that References are handled directly. + */ + else { + q = ctxt->cur; + while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '<')) ctxt->cur++; + + if (!IS_CHAR(ctxt->cur[0])) { + fprintf(stderr, "Truncated content : %.50s\n", q); + return(NULL); + } + + /* + * Do the Entities decoding... + */ + data = xmlStrdup(xmlDecodeEntities(ctxt->doc, q, ctxt->cur - q)); + } + + /* + * Handle the data if any. If there is no child + * add it as content, otherwise create a new node of type text. + */ + if (data != NULL) + data = xmlHandleData(data); + if (data != NULL) { + if (node->childs == NULL) + xmlNodeSetContent(node, data); + else + ret = xmlNewText(data); + free(data); + } + + return(ret); +} + +/* + * xmlParseElement: parse an XML element + */ + +xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt) { + xmlNodePtr ret, child; + const CHAR *openTag = ctxt->cur; + const CHAR *closeTag = ctxt->cur; + + ret = xmlParseStartTag(ctxt); + if (ret == NULL) { + return(NULL); + } + + /* + * Check for an Empty Element. + */ + if ((ctxt->cur[0] == '/') && (ctxt->cur[1] == '>')) { + ctxt->cur += 2; + return(ret); + } + if (ctxt->cur[0] == '>') ctxt->cur++; + else { + fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", openTag); + return(NULL); + } + + /* + * Parse the content of the element: + * (element | PCData | Reference | CDSect | PI | Comment) * + * + * element : starts by '<' + * PCData : any CHAR but '&' or '<' + * Reference : starts by '&' + * CDSect : starts by '<![CDATA[' + * PI : starts by '<?' + * + * The loop stops upon detection of an end of tag '</' + */ + while ((IS_CHAR(ctxt->cur[0])) && + ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '/'))) { + child = xmlParseContent(ctxt, ret); + if (child != NULL) + xmlAddChild(ret, child); + } + if (!IS_CHAR(ctxt->cur[0])) { + fprintf(stderr, "Premature end of data in tag %.30s\n", openTag); + return(NULL); + } + + /* + * parse the end of tag : '</' has been detected. + */ + ctxt->cur += 2; + if (ctxt->cur[0] == '>') ctxt->cur++; /* simplified closing </> */ + else { + CHAR *endTag; + xmlDtdPtr endDtd; + + xmlParseEndTag(ctxt, &endDtd, &endTag); + + /* + * Check that the Name in the ETag is the same as in the STag. + */ + if (endDtd != ret->dtd) { + fprintf(stderr, "Start and End tags don't use the same DTD:\n"); + fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag); + } + if (strcmp(ret->name, endTag)) { + fprintf(stderr, "Start and End tags don't use the same name:\n"); + fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag); + } + + if ( endTag != NULL ) + free(endTag); + } + + return(ret); +} + +/* + * xmlParseXMLDecl: parse an XML declaration header + */ + +void xmlParseXMLDecl(xmlParserCtxtPtr ctxt) { + CHAR *version; + + /* + * We know that '<?xml' is here. + */ + ctxt->cur += 5; + + /* + * Parse the version info + */ + SKIP_BLANKS(ctxt->cur); + + /* + * We should have 'version=' here ! + */ + if ((ctxt->cur[0] == 'v') && (ctxt->cur[1] == 'e') && + (ctxt->cur[2] == 'r') && (ctxt->cur[3] == 's') && + (ctxt->cur[4] == 'i') && (ctxt->cur[5] == 'o') && + (ctxt->cur[6] == 'n') && (ctxt->cur[7] == '=')) { + ctxt->cur += 8; + version = xmlParseQuotedString(ctxt); + if (version == NULL) + ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION); + else { + ctxt->doc = xmlNewDoc(version); + free(version); + } + } else { + ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION); + } + + /* + * We should check for Required Markup Declaration TODO !!!! + */ + MOVETO_ENDTAG(ctxt->cur); + ctxt->cur++; + +} + +/* + * xmlParseMisc: parse an XML Misc optionnal field. + * (Comment | PI | S)* + */ + +void xmlParseMisc(xmlParserCtxtPtr ctxt) { + while (((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) || + ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') && + (ctxt->cur[2] == '-') && (ctxt->cur[2] == '-')) || + IS_BLANK(ctxt->cur[0])) { + if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) { + xmlParsePI(ctxt); + } else if (IS_BLANK(ctxt->cur[0])) { + ctxt->cur++; + } else + xmlParserSkipComment(ctxt); + } +} + +/* + * xmlParseDocument : parse an XML document and build a tree. + */ + +int xmlParseDocument(xmlParserCtxtPtr ctxt) { + /* + * We should check for encoding here and plug-in some + * conversion code TODO !!!! + */ + + /* + * Wipe out everything which is before the first '<' + */ + SKIP_BLANKS(ctxt->cur); + + /* + * Check for the XMLDecl in the Prolog. + */ + if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?') && + (ctxt->cur[2] == 'x') && (ctxt->cur[3] == 'm') && + (ctxt->cur[4] == 'l')) { + xmlParseXMLDecl(ctxt); + /* SKIP_EOL(cur); */ + SKIP_BLANKS(ctxt->cur); + } else if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?') && + (ctxt->cur[2] == 'X') && (ctxt->cur[3] == 'M') && + (ctxt->cur[4] == 'L')) { + /* + * The first drafts were using <?XML and the final W3C REC + * now use <?xml ... + */ + xmlParseXMLDecl(ctxt); + /* SKIP_EOL(cur); */ + SKIP_BLANKS(ctxt->cur); + } else { + ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION); + } + + /* + * The Misc part of the Prolog + * (Comment | PI | S) * + */ + xmlParseMisc(ctxt); + + /* + * Time to start parsing + */ + ctxt->doc->root = xmlParseElement(ctxt); + + return(0); +} + +/* + * xmlParseDoc : parse an XML in-memory document and build a tree. + */ + +xmlDocPtr xmlParseDoc(CHAR *cur) { + xmlDocPtr ret; + xmlParserCtxtPtr ctxt; + + if (cur == NULL) return(NULL); + + ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt)); + if (ctxt == NULL) { + perror("malloc"); + return(NULL); + } + + xmlInitParserCtxt(ctxt); + ctxt->base = cur; + ctxt->cur = cur; + + xmlParseDocument(ctxt); + ret = ctxt->doc; + free(ctxt->nodes); + free(ctxt); + + return(ret); +} + +/* + * xmlParseFile : parse an XML file and build a tree. + */ + +xmlDocPtr xmlParseFile(const char *filename) { + xmlDocPtr ret; +#ifdef HAVE_ZLIB_H + gzFile input; +#else + int input; +#endif + int res; + struct stat buf; + char *buffer; + xmlParserCtxtPtr ctxt; + + res = stat(filename, &buf); + if (res < 0) return(NULL); + +#ifdef HAVE_ZLIB_H +retry_bigger: + buffer = malloc((buf.st_size * 20) + 100); +#else + buffer = malloc(buf.st_size + 100); +#endif + if (buffer == NULL) { + perror("malloc"); + return(NULL); + } + + memset(buffer, 0, sizeof(buffer)); +#ifdef HAVE_ZLIB_H + input = gzopen (filename, "r"); + if (input == NULL) { + fprintf (stderr, "Cannot read file %s :\n", filename); + perror ("gzopen failed"); + return(NULL); + } +#else + input = open (filename, O_RDONLY); + if (input < 0) { + fprintf (stderr, "Cannot read file %s :\n", filename); + perror ("open failed"); + return(NULL); + } +#endif +#ifdef HAVE_ZLIB_H + res = gzread(input, buffer, 20 * buf.st_size); +#else + res = read(input, buffer, buf.st_size); +#endif + if (res < 0) { + fprintf (stderr, "Cannot read file %s :\n", filename); +#ifdef HAVE_ZLIB_H + perror ("gzread failed"); +#else + perror ("read failed"); +#endif + return(NULL); + } +#ifdef HAVE_ZLIB_H + gzclose(input); + if (res >= 20 * buf.st_size) { + free(buffer); + buf.st_size *= 2; + goto retry_bigger; + } + buf.st_size = res; +#else + close(input); +#endif + + + ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt)); + if (ctxt == NULL) { + perror("malloc"); + return(NULL); + } + buffer[buf.st_size] = '\0'; + + xmlInitParserCtxt(ctxt); + ctxt->filename = filename; + ctxt->base = buffer; + ctxt->cur = buffer; + + xmlParseDocument(ctxt); + ret = ctxt->doc; + free(buffer); + free(ctxt->nodes); + free(ctxt); + + return(ret); +} + +/* + * xmlParseFile : parse an XML memory block and build a tree. + */ + +xmlDocPtr xmlParseMemory(char *buffer, int size) { + xmlDocPtr ret; + xmlParserCtxtPtr ctxt; + + ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt)); + if (ctxt == NULL) { + perror("malloc"); + return(NULL); + } + + buffer[size - 1] = '\0'; + + xmlInitParserCtxt(ctxt); + ctxt->base = buffer; + ctxt->cur = buffer; + + xmlParseDocument(ctxt); + ret = ctxt->doc; + free(ctxt->nodes); + free(ctxt); + + return(ret); +} + + + + +/* Initialize parser context */ +void xmlInitParserCtxt(xmlParserCtxtPtr ctxt) +{ + int i; + + ctxt->filename = NULL; + ctxt->base = NULL; + ctxt->cur = NULL; + ctxt->line = 1; + ctxt->col = 1; + ctxt->doc = NULL; + ctxt->depth = 0; + ctxt->max_depth = 10; + ctxt->nodes = (xmlNodePtr *) malloc(ctxt->max_depth * sizeof(xmlNodePtr)); + if (ctxt->nodes == NULL) { + fprintf(stderr, "malloc of %d byte failed\n", + ctxt->max_depth * sizeof(xmlNodePtr)); + ctxt->max_depth = 0; + } else { + for (i = 0;i < ctxt->max_depth;i++) + ctxt->nodes[i] = NULL; + } +} + + +/* + * Clear (release owned resources) and reinitialize context + */ +void xmlClearParserCtxt(xmlParserCtxtPtr ctx) +{ + xmlInitParserCtxt(ctx); +} + + +/* + * Setup the parser context to parse a new buffer; Clears any prior + * contents from the parser context. The buffer parameter must not be + * NULL, but the filename parameter can be + */ +void xmlSetupParserForBuffer(xmlParserCtxtPtr ctxt, const CHAR* buffer, + const char* filename) +{ + xmlClearParserCtxt(ctxt); + ctxt->base = buffer; + ctxt->cur = buffer; + ctxt->filename = filename; +} + + + +void xmlReportError(xmlParserCtxtPtr ctx, const CHAR* msg) +{ + fputs(msg, stderr); +} |