diff options
Diffstat (limited to 'libjava/gnu/xml/aelfred2/XmlParser.java')
-rw-r--r-- | libjava/gnu/xml/aelfred2/XmlParser.java | 5835 |
1 files changed, 0 insertions, 5835 deletions
diff --git a/libjava/gnu/xml/aelfred2/XmlParser.java b/libjava/gnu/xml/aelfred2/XmlParser.java deleted file mode 100644 index 15d730ff4a7..00000000000 --- a/libjava/gnu/xml/aelfred2/XmlParser.java +++ /dev/null @@ -1,5835 +0,0 @@ -/* XmlParser.java -- - Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. - -This file is part of GNU Classpath. - -GNU Classpath is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. - -GNU Classpath is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GNU Classpath; see the file COPYING. If not, write to the -Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -02110-1301 USA. - -Linking this library statically or dynamically with other modules is -making a combined work based on this library. Thus, the terms and -conditions of the GNU General Public License cover the whole -combination. - -As a special exception, the copyright holders of this library give you -permission to link this library with independent modules to produce an -executable, regardless of the license terms of these independent -modules, and to copy and distribute the resulting executable under -terms of your choice, provided that you also meet, for each linked -independent module, the terms and conditions of the license of that -module. An independent module is a module which is not derived from -or based on this library. If you modify this library, you may extend -this exception to your version of the library, but you are not -obligated to do so. If you do not wish to do so, delete this -exception statement from your version. - -Partly derived from code which carried the following notice: - - Copyright (c) 1997, 1998 by Microstar Software Ltd. - - AElfred is free for both commercial and non-commercial use and - redistribution, provided that Microstar's copyright and disclaimer are - retained intact. You are free to modify AElfred for your own use and - to redistribute AElfred with your modifications, provided that the - modifications are clearly documented. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - merchantability or fitness for a particular purpose. Please use it AT - YOUR OWN RISK. -*/ - -package gnu.xml.aelfred2; - -import gnu.java.security.action.GetPropertyAction; - -import java.io.BufferedInputStream; -import java.io.CharConversionException; -import java.io.EOFException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.IOException; -import java.io.Reader; -import java.io.UnsupportedEncodingException; -import java.net.URL; -import java.net.URLConnection; -import java.security.AccessController; - -import java.util.Iterator; -import java.util.HashMap; -import java.util.LinkedList; - -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; - - -/** - * Parse XML documents and return parse events through call-backs. - * Use the <code>SAXDriver</code> class as your entry point, as all - * internal parser interfaces are subject to change. - * - * @author Written by David Megginson <dmeggins@microstar.com> - * (version 1.2a with bugfixes) - * @author Updated by David Brownell <dbrownell@users.sourceforge.net> - * @see SAXDriver - */ -final class XmlParser -{ - - // avoid slow per-character readCh() - private final static boolean USE_CHEATS = true; - - //////////////////////////////////////////////////////////////////////// - // Constants. - //////////////////////////////////////////////////////////////////////// - - // - // Constants for element content type. - // - - /** - * Constant: an element has not been declared. - * @see #getElementContentType - */ - public final static int CONTENT_UNDECLARED = 0; - - /** - * Constant: the element has a content model of ANY. - * @see #getElementContentType - */ - public final static int CONTENT_ANY = 1; - - /** - * Constant: the element has declared content of EMPTY. - * @see #getElementContentType - */ - public final static int CONTENT_EMPTY = 2; - - /** - * Constant: the element has mixed content. - * @see #getElementContentType - */ - public final static int CONTENT_MIXED = 3; - - /** - * Constant: the element has element content. - * @see #getElementContentType - */ - public final static int CONTENT_ELEMENTS = 4; - - - // - // Constants for the entity type. - // - - /** - * Constant: the entity has not been declared. - * @see #getEntityType - */ - public final static int ENTITY_UNDECLARED = 0; - - /** - * Constant: the entity is internal. - * @see #getEntityType - */ - public final static int ENTITY_INTERNAL = 1; - - /** - * Constant: the entity is external, non-parsable data. - * @see #getEntityType - */ - public final static int ENTITY_NDATA = 2; - - /** - * Constant: the entity is external XML data. - * @see #getEntityType - */ - public final static int ENTITY_TEXT = 3; - - // - // Attribute type constants are interned literal strings. - // - - // - // Constants for supported encodings. "external" is just a flag. - // - private final static int ENCODING_EXTERNAL = 0; - private final static int ENCODING_UTF_8 = 1; - private final static int ENCODING_ISO_8859_1 = 2; - private final static int ENCODING_UCS_2_12 = 3; - private final static int ENCODING_UCS_2_21 = 4; - private final static int ENCODING_UCS_4_1234 = 5; - private final static int ENCODING_UCS_4_4321 = 6; - private final static int ENCODING_UCS_4_2143 = 7; - private final static int ENCODING_UCS_4_3412 = 8; - private final static int ENCODING_ASCII = 9; - - // - // Constants for attribute default value. - // - - /** - * Constant: the attribute is not declared. - * @see #getAttributeDefaultValueType - */ - public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30; - - /** - * Constant: the attribute has a literal default value specified. - * @see #getAttributeDefaultValueType - * @see #getAttributeDefaultValue - */ - public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31; - - /** - * Constant: the attribute was declared #IMPLIED. - * @see #getAttributeDefaultValueType - */ - public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32; - - /** - * Constant: the attribute was declared #REQUIRED. - * @see #getAttributeDefaultValueType - */ - public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33; - - /** - * Constant: the attribute was declared #FIXED. - * @see #getAttributeDefaultValueType - * @see #getAttributeDefaultValue - */ - public final static int ATTRIBUTE_DEFAULT_FIXED = 34; - - // - // Constants for input. - // - private final static int INPUT_NONE = 0; - private final static int INPUT_INTERNAL = 1; - private final static int INPUT_STREAM = 3; - private final static int INPUT_READER = 5; - - // - // Flags for reading literals. - // - // expand general entity refs (attribute values in dtd and content) - private final static int LIT_ENTITY_REF = 2; - // normalize this value (space chars) (attributes, public ids) - private final static int LIT_NORMALIZE = 4; - // literal is an attribute value - private final static int LIT_ATTRIBUTE = 8; - // don't expand parameter entities - private final static int LIT_DISABLE_PE = 16; - // don't expand [or parse] character refs - private final static int LIT_DISABLE_CREF = 32; - // don't parse general entity refs - private final static int LIT_DISABLE_EREF = 64; - // literal is a public ID value - private final static int LIT_PUBID = 256; - - // - // Flags affecting PE handling in DTDs (if expandPE is true). - // PEs expand with space padding, except inside literals. - // - private final static int CONTEXT_NORMAL = 0; - private final static int CONTEXT_LITERAL = 1; - - // Emit warnings for relative URIs with no base URI. - static boolean uriWarnings; - static - { - String key = "gnu.xml.aelfred2.XmlParser.uriWarnings"; - GetPropertyAction a = new GetPropertyAction(key); - uriWarnings = "true".equals(AccessController.doPrivileged(a)); - } - - // - // The current XML handler interface. - // - private SAXDriver handler; - - // - // I/O information. - // - private Reader reader; // current reader - private InputStream is; // current input stream - private int line; // current line number - private int column; // current column number - private int sourceType; // type of input source - private LinkedList inputStack; // stack of input soruces - private URLConnection externalEntity; // current external entity - private int encoding; // current character encoding - private int currentByteCount; // bytes read from current source - private InputSource scratch; // temporary - - // - // Buffers for decoded but unparsed character input. - // - private char[] readBuffer; - private int readBufferPos; - private int readBufferLength; - private int readBufferOverflow; // overflow from last data chunk. - - // - // Buffer for undecoded raw byte input. - // - private final static int READ_BUFFER_MAX = 16384; - private byte[] rawReadBuffer; - - - // - // Buffer for attribute values, char refs, DTD stuff. - // - private static int DATA_BUFFER_INITIAL = 4096; - private char[] dataBuffer; - private int dataBufferPos; - - // - // Buffer for parsed names. - // - private static int NAME_BUFFER_INITIAL = 1024; - private char[] nameBuffer; - private int nameBufferPos; - - // - // Save any standalone flag - // - private boolean docIsStandalone; - - // - // Hashtables for DTD information on elements, entities, and notations. - // Populated until we start ignoring decls (because of skipping a PE) - // - private HashMap elementInfo; - private HashMap entityInfo; - private HashMap notationInfo; - private boolean skippedPE; - - // - // Element type currently in force. - // - private String currentElement; - private int currentElementContent; - - // - // Stack of entity names, to detect recursion. - // - private LinkedList entityStack; - - // - // PE expansion is enabled in most chunks of the DTD, not all. - // When it's enabled, literals are treated differently. - // - private boolean inLiteral; - private boolean expandPE; - private boolean peIsError; - - // - // can't report entity expansion inside two constructs: - // - attribute expansions (internal entities only) - // - markup declarations (parameter entities only) - // - private boolean doReport; - - // - // Symbol table, for caching interned names. - // - // These show up wherever XML names or nmtokens are used: naming elements, - // attributes, PIs, notations, entities, and enumerated attribute values. - // - // NOTE: This hashtable doesn't grow. The default size is intended to be - // rather large for most documents. Example: one snapshot of the DocBook - // XML 4.1 DTD used only about 350 such names. As a rule, only pathological - // documents (ones that don't reuse names) should ever see much collision. - // - // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing. - // "2039" keeps the hash table size at about two memory pages on typical - // 32 bit hardware. - // - private final static int SYMBOL_TABLE_LENGTH = 2039; - - private Object[][] symbolTable; - - // - // Hash table of attributes found in current start tag. - // - private String[] tagAttributes; - private int tagAttributePos; - - // - // Utility flag: have we noticed a CR while reading the last - // data chunk? If so, we will have to go back and normalise - // CR or CR/LF line ends. - // - private boolean sawCR; - - // - // Utility flag: are we in CDATA? If so, whitespace isn't ignorable. - // - private boolean inCDATA; - - // - // Xml version. - // - private static final int XML_10 = 0; - private static final int XML_11 = 1; - private int xmlVersion = XML_10; - - ////////////////////////////////////////////////////////////////////// - // Constructors. - //////////////////////////////////////////////////////////////////////// - - /** - * Construct a new parser with no associated handler. - * @see #setHandler - * @see #parse - */ - // package private - XmlParser() - { - } - - /** - * Set the handler that will receive parsing events. - * @param handler The handler to receive callback events. - * @see #parse - */ - // package private - void setHandler(SAXDriver handler) - { - this.handler = handler; - } - - /** - * Parse an XML document from the character stream, byte stream, or URI - * that you provide (in that order of preference). Any URI that you - * supply will become the base URI for resolving relative URI, and may - * be used to acquire a reader or byte stream. - * - * <p> Only one thread at a time may use this parser; since it is - * private to this package, post-parse cleanup is done by the caller, - * which MUST NOT REUSE the parser (just null it). - * - * @param systemId Absolute URI of the document; should never be null, - * but may be so iff a reader <em>or</em> a stream is provided. - * @param publicId The public identifier of the document, or null. - * @param reader A character stream; must be null if stream isn't. - * @param stream A byte input stream; must be null if reader isn't. - * @param encoding The suggested encoding, or null if unknown. - * @exception java.lang.Exception Basically SAXException or IOException - */ - // package private - void doParse(String systemId, String publicId, Reader reader, - InputStream stream, String encoding) - throws Exception - { - if (handler == null) - { - throw new IllegalStateException("no callback handler"); - } - - initializeVariables(); - - // predeclare the built-in entities here (replacement texts) - // we don't need to intern(), since we're guaranteed literals - // are always (globally) interned. - setInternalEntity("amp", "&"); - setInternalEntity("lt", "<"); - setInternalEntity("gt", ">"); - setInternalEntity("apos", "'"); - setInternalEntity("quot", """); - - try - { - // pushURL first to ensure locator is correct in startDocument - // ... it might report an IO or encoding exception. - handler.startDocument(); - pushURL(false, "[document]", - // default baseURI: null - new ExternalIdentifiers(publicId, systemId, null), - reader, stream, encoding, false); - - parseDocument(); - } - catch (EOFException e) - { - //empty input - error("empty document, with no root element."); - } - finally - { - if (reader != null) - { - try - { - reader.close(); - } - catch (IOException e) - { - /* ignore */ - } - } - if (stream != null) - { - try - { - stream.close(); - } - catch (IOException e) - { - /* ignore */ - } - } - if (is != null) - { - try - { - is.close(); - } - catch (IOException e) - { - /* ignore */ - } - } - scratch = null; - } - } - - ////////////////////////////////////////////////////////////////////// - // Error reporting. - ////////////////////////////////////////////////////////////////////// - - /** - * Report an error. - * @param message The error message. - * @param textFound The text that caused the error (or null). - * @see SAXDriver#error - * @see #line - */ - private void error(String message, String textFound, String textExpected) - throws SAXException - { - if (textFound != null) - { - message = message + " (found \"" + textFound + "\")"; - } - if (textExpected != null) - { - message = message + " (expected \"" + textExpected + "\")"; - } - handler.fatal(message); - - // "can't happen" - throw new SAXException(message); - } - - /** - * Report a serious error. - * @param message The error message. - * @param textFound The text that caused the error (or null). - */ - private void error(String message, char textFound, String textExpected) - throws SAXException - { - error(message, new Character(textFound).toString(), textExpected); - } - - /** - * Report typical case fatal errors. - */ - private void error(String message) - throws SAXException - { - handler.fatal(message); - } - - ////////////////////////////////////////////////////////////////////// - // Major syntactic productions. - ////////////////////////////////////////////////////////////////////// - - /** - * Parse an XML document. - * <pre> - * [1] document ::= prolog element Misc* - * </pre> - * <p>This is the top-level parsing function for a single XML - * document. As a minimum, a well-formed document must have - * a document element, and a valid document must have a prolog - * (one with doctype) as well. - */ - private void parseDocument() - throws Exception - { - try - { // added by MHK - boolean sawDTD = parseProlog(); - require('<'); - parseElement(!sawDTD); - } - catch (EOFException ee) - { // added by MHK - error("premature end of file", "[EOF]", null); - } - - try - { - parseMisc(); //skip all white, PIs, and comments - char c = readCh(); //if this doesn't throw an exception... - error("unexpected characters after document end", c, null); - } - catch (EOFException e) - { - return; - } - } - - static final char[] startDelimComment = { '<', '!', '-', '-' }; - static final char[] endDelimComment = { '-', '-' }; - - /** - * Skip a comment. - * <pre> - * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" - * </pre> - * <p> (The <code><!--</code> has already been read.) - */ - private void parseComment() - throws Exception - { - char c; - boolean saved = expandPE; - - expandPE = false; - parseUntil(endDelimComment); - require('>'); - expandPE = saved; - handler.comment(dataBuffer, 0, dataBufferPos); - dataBufferPos = 0; - } - - static final char[] startDelimPI = { '<', '?' }; - static final char[] endDelimPI = { '?', '>' }; - - /** - * Parse a processing instruction and do a call-back. - * <pre> - * [16] PI ::= '<?' PITarget - * (S (Char* - (Char* '?>' Char*)))? - * '?>' - * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) - * </pre> - * <p> (The <code><?</code> has already been read.) - */ - private void parsePI() - throws SAXException, IOException - { - String name; - boolean saved = expandPE; - - expandPE = false; - name = readNmtoken(true); - //NE08 - if (name.indexOf(':') >= 0) - { - error("Illegal character(':') in processing instruction name ", - name, null); - } - if ("xml".equalsIgnoreCase(name)) - { - error("Illegal processing instruction target", name, null); - } - if (!tryRead(endDelimPI)) - { - requireWhitespace(); - parseUntil(endDelimPI); - } - expandPE = saved; - handler.processingInstruction(name, dataBufferToString()); - } - - static final char[] endDelimCDATA = { ']', ']', '>' }; - - private boolean isDirtyCurrentElement; - - /** - * Parse a CDATA section. - * <pre> - * [18] CDSect ::= CDStart CData CDEnd - * [19] CDStart ::= '<![CDATA[' - * [20] CData ::= (Char* - (Char* ']]>' Char*)) - * [21] CDEnd ::= ']]>' - * </pre> - * <p> (The '<![CDATA[' has already been read.) - */ - private void parseCDSect() - throws Exception - { - parseUntil(endDelimCDATA); - dataBufferFlush(); - } - - /** - * Parse the prolog of an XML document. - * <pre> - * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? - * </pre> - * <p>We do not look for the XML declaration here, because it was - * handled by pushURL (). - * @see pushURL - * @return true if a DTD was read. - */ - private boolean parseProlog() - throws Exception - { - parseMisc(); - - if (tryRead("<!DOCTYPE")) - { - parseDoctypedecl(); - parseMisc(); - return true; - } - return false; - } - - private void checkLegalVersion(String version) - throws SAXException - { - int len = version.length(); - for (int i = 0; i < len; i++) - { - char c = version.charAt(i); - if ('0' <= c && c <= '9') - { - continue; - } - if (c == '_' || c == '.' || c == ':' || c == '-') - { - continue; - } - if ('a' <= c && c <= 'z') - { - continue; - } - if ('A' <= c && c <= 'Z') - { - continue; - } - error ("illegal character in version", version, "1.0"); - } - } - - /** - * Parse the XML declaration. - * <pre> - * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' - * [24] VersionInfo ::= S 'version' Eq - * ("'" VersionNum "'" | '"' VersionNum '"' ) - * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* - * [32] SDDecl ::= S 'standalone' Eq - * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) - * [80] EncodingDecl ::= S 'encoding' Eq - * ( "'" EncName "'" | "'" EncName "'" ) - * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* - * </pre> - * <p> (The <code><?xml</code> and whitespace have already been read.) - * @return the encoding in the declaration, uppercased; or null - * @see #parseTextDecl - * @see #setupDecoding - */ - private String parseXMLDecl(boolean ignoreEncoding) - throws SAXException, IOException - { - String version; - String encodingName = null; - String standalone = null; - int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; - String inputEncoding = null; - - switch (this.encoding) - { - case ENCODING_EXTERNAL: - case ENCODING_UTF_8: - inputEncoding = "UTF-8"; - break; - case ENCODING_ISO_8859_1: - inputEncoding = "ISO-8859-1"; - break; - case ENCODING_UCS_2_12: - inputEncoding = "UTF-16BE"; - break; - case ENCODING_UCS_2_21: - inputEncoding = "UTF-16LE"; - break; - } - - // Read the version. - require("version"); - parseEq(); - checkLegalVersion(version = readLiteral(flags)); - if (!version.equals("1.0")) - { - if (version.equals("1.1")) - { - handler.warn("expected XML version 1.0, not: " + version); - xmlVersion = XML_11; - } - else - { - error("illegal XML version", version, "1.0 or 1.1"); - } - } - else - { - xmlVersion = XML_10; - } - // Try reading an encoding declaration. - boolean white = tryWhitespace(); - - if (tryRead("encoding")) - { - if (!white) - { - error("whitespace required before 'encoding='"); - } - parseEq(); - encodingName = readLiteral(flags); - if (!ignoreEncoding) - { - setupDecoding(encodingName); - } - } - - // Try reading a standalone declaration - if (encodingName != null) - { - white = tryWhitespace(); - } - if (tryRead("standalone")) - { - if (!white) - { - error("whitespace required before 'standalone='"); - } - parseEq(); - standalone = readLiteral(flags); - if ("yes".equals(standalone)) - { - docIsStandalone = true; - } - else if (!"no".equals(standalone)) - { - error("standalone flag must be 'yes' or 'no'"); - } - } - - skipWhitespace(); - require("?>"); - - if (inputEncoding == null) - { - inputEncoding = encodingName; - } - handler.xmlDecl(version, encodingName, docIsStandalone, - inputEncoding); - - return encodingName; - } - - /** - * Parse a text declaration. - * <pre> - * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' - * [80] EncodingDecl ::= S 'encoding' Eq - * ( '"' EncName '"' | "'" EncName "'" ) - * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* - * </pre> - * <p> (The <code><?xml</code>' and whitespace have already been read.) - * @return the encoding in the declaration, uppercased; or null - * @see #parseXMLDecl - * @see #setupDecoding - */ - private String parseTextDecl(boolean ignoreEncoding) - throws SAXException, IOException - { - String encodingName = null; - int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; - - // Read an optional version. - if (tryRead ("version")) - { - String version; - parseEq(); - checkLegalVersion(version = readLiteral(flags)); - - if (version.equals("1.1")) - { - if (xmlVersion == XML_10) - { - error("external subset has later version number.", "1.0", - version); - } - handler.warn("expected XML version 1.0, not: " + version); - xmlVersion = XML_11; - } - else if (!version.equals("1.0")) - { - error("illegal XML version", version, "1.0 or 1.1"); - } - requireWhitespace(); - } - - // Read the encoding. - require("encoding"); - parseEq(); - encodingName = readLiteral(flags); - if (!ignoreEncoding) - { - setupDecoding(encodingName); - } - skipWhitespace(); - require("?>"); - - return encodingName; - } - - /** - * Sets up internal state so that we can decode an entity using the - * specified encoding. This is used when we start to read an entity - * and we have been given knowledge of its encoding before we start to - * read any data (e.g. from a SAX input source or from a MIME type). - * - * <p> It is also used after autodetection, at which point only very - * limited adjustments to the encoding may be used (switching between - * related builtin decoders). - * - * @param encodingName The name of the encoding specified by the user. - * @exception IOException if the encoding isn't supported either - * internally to this parser, or by the hosting JVM. - * @see #parseXMLDecl - * @see #parseTextDecl - */ - private void setupDecoding(String encodingName) - throws SAXException, IOException - { - encodingName = encodingName.toUpperCase(); - - // ENCODING_EXTERNAL indicates an encoding that wasn't - // autodetected ... we can use builtin decoders, or - // ones from the JVM (InputStreamReader). - - // Otherwise we can only tweak what was autodetected, and - // only for single byte (ASCII derived) builtin encodings. - - // ASCII-derived encodings - if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) - { - if (encodingName.equals("ISO-8859-1") - || encodingName.equals("8859_1") - || encodingName.equals("ISO8859_1")) - { - encoding = ENCODING_ISO_8859_1; - return; - } - else if (encodingName.equals("US-ASCII") - || encodingName.equals("ASCII")) - { - encoding = ENCODING_ASCII; - return; - } - else if (encodingName.equals("UTF-8") - || encodingName.equals("UTF8")) - { - encoding = ENCODING_UTF_8; - return; - } - else if (encoding != ENCODING_EXTERNAL) - { - // used to start with a new reader ... - throw new UnsupportedEncodingException(encodingName); - } - // else fallthrough ... - // it's ASCII-ish and something other than a builtin - } - - // Unicode and such - if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) - { - if (!(encodingName.equals("ISO-10646-UCS-2") - || encodingName.equals("UTF-16") - || encodingName.equals("UTF-16BE") - || encodingName.equals("UTF-16LE"))) - { - error("unsupported Unicode encoding", encodingName, "UTF-16"); - } - return; - } - - // four byte encodings - if (encoding == ENCODING_UCS_4_1234 - || encoding == ENCODING_UCS_4_4321 - || encoding == ENCODING_UCS_4_2143 - || encoding == ENCODING_UCS_4_3412) - { - // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists - if (!encodingName.equals("ISO-10646-UCS-4")) - { - error("unsupported 32-bit encoding", encodingName, - "ISO-10646-UCS-4"); - } - return; - } - - // assert encoding == ENCODING_EXTERNAL - // if (encoding != ENCODING_EXTERNAL) - // throw new RuntimeException ("encoding = " + encoding); - - if (encodingName.equals("UTF-16BE")) - { - encoding = ENCODING_UCS_2_12; - return; - } - if (encodingName.equals("UTF-16LE")) - { - encoding = ENCODING_UCS_2_21; - return; - } - - // We couldn't use the builtin decoders at all. But we can try to - // create a reader, since we haven't messed up buffering. Tweak - // the encoding name if necessary. - - if (encodingName.equals("UTF-16") - || encodingName.equals("ISO-10646-UCS-2")) - { - encodingName = "Unicode"; - } - // Ignoring all the EBCDIC aliases here - - reader = new InputStreamReader(is, encodingName); - sourceType = INPUT_READER; - } - - /** - * Parse miscellaneous markup outside the document element and DOCTYPE - * declaration. - * <pre> - * [27] Misc ::= Comment | PI | S - * </pre> - */ - private void parseMisc() - throws Exception - { - while (true) - { - skipWhitespace(); - if (tryRead(startDelimPI)) - { - parsePI(); - } - else if (tryRead(startDelimComment)) - { - parseComment(); - } - else - { - return; - } - } - } - - /** - * Parse a document type declaration. - * <pre> - * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? - * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' - * </pre> - * <p> (The <code><!DOCTYPE</code> has already been read.) - */ - private void parseDoctypedecl() - throws Exception - { - String rootName; - ExternalIdentifiers ids; - - // Read the document type name. - requireWhitespace(); - rootName = readNmtoken(true); - - // Read the External subset's IDs - skipWhitespace(); - ids = readExternalIds(false, true); - - // report (a) declaration of name, (b) lexical info (ids) - handler.doctypeDecl(rootName, ids.publicId, ids.systemId); - - // Internal subset is parsed first, if present - skipWhitespace(); - if (tryRead('[')) - { - - // loop until the subset ends - while (true) - { - doReport = expandPE = true; - skipWhitespace(); - doReport = expandPE = false; - if (tryRead(']')) - { - break; // end of subset - } - else - { - // WFC, PEs in internal subset (only between decls) - peIsError = expandPE = true; - parseMarkupdecl(); - peIsError = expandPE = false; - } - } - } - skipWhitespace(); - require('>'); - - // Read the external subset, if any - InputSource subset; - - if (ids.systemId == null) - { - subset = handler.getExternalSubset(rootName, - handler.getSystemId()); - } - else - { - subset = null; - } - if (ids.systemId != null || subset != null) - { - pushString(null, ">"); - - // NOTE: [dtd] is so we say what SAX2 expects, - // though it's misleading (subset, not entire dtd) - if (ids.systemId != null) - { - pushURL(true, "[dtd]", ids, null, null, null, true); - } - else - { - handler.warn("modifying document by adding external subset"); - pushURL(true, "[dtd]", - new ExternalIdentifiers(subset.getPublicId(), - subset.getSystemId(), - null), - subset.getCharacterStream(), - subset.getByteStream(), - subset.getEncoding(), - false); - } - - // Loop until we end up back at '>' - while (true) - { - doReport = expandPE = true; - skipWhitespace(); - doReport = expandPE = false; - if (tryRead('>')) - { - break; - } - else - { - expandPE = true; - parseMarkupdecl(); - expandPE = false; - } - } - - // the ">" string isn't popped yet - if (inputStack.size() != 1) - { - error("external subset has unmatched '>'"); - } - } - - // done dtd - handler.endDoctype(); - expandPE = false; - doReport = true; - } - - /** - * Parse a markup declaration in the internal or external DTD subset. - * <pre> - * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl - * | NotationDecl | PI | Comment - * [30] extSubsetDecl ::= (markupdecl | conditionalSect - * | PEReference | S) * - * </pre> - * <p> Reading toplevel PE references is handled as a lexical issue - * by the caller, as is whitespace. - */ - private void parseMarkupdecl() - throws Exception - { - char[] saved = null; - boolean savedPE = expandPE; - - // prevent "<%foo;" and ensures saved entity is right - require('<'); - unread('<'); - expandPE = false; - - if (tryRead("<!ELEMENT")) - { - saved = readBuffer; - expandPE = savedPE; - parseElementDecl(); - } - else if (tryRead("<!ATTLIST")) - { - saved = readBuffer; - expandPE = savedPE; - parseAttlistDecl(); - } - else if (tryRead("<!ENTITY")) - { - saved = readBuffer; - expandPE = savedPE; - parseEntityDecl(); - } - else if (tryRead("<!NOTATION")) - { - saved = readBuffer; - expandPE = savedPE; - parseNotationDecl(); - } - else if (tryRead(startDelimPI)) - { - saved = readBuffer; - expandPE = savedPE; - parsePI(); - } - else if (tryRead(startDelimComment)) - { - saved = readBuffer; - expandPE = savedPE; - parseComment(); - } - else if (tryRead("<![")) - { - saved = readBuffer; - expandPE = savedPE; - if (inputStack.size() > 0) - { - parseConditionalSect(saved); - } - else - { - error("conditional sections illegal in internal subset"); - } - } - else - { - error("expected markup declaration"); - } - - // VC: Proper Decl/PE Nesting - if (readBuffer != saved) - { - handler.verror("Illegal Declaration/PE nesting"); - } - } - - /** - * Parse an element, with its tags. - * <pre> - * [39] element ::= EmptyElementTag | STag content ETag - * [40] STag ::= '<' Name (S Attribute)* S? '>' - * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' - * </pre> - * <p> (The '<' has already been read.) - * <p>NOTE: this method actually chains onto parseContent (), if necessary, - * and parseContent () will take care of calling parseETag (). - */ - private void parseElement(boolean maybeGetSubset) - throws Exception - { - String gi; - char c; - int oldElementContent = currentElementContent; - String oldElement = currentElement; - ElementDecl element; - - // This is the (global) counter for the - // array of specified attributes. - tagAttributePos = 0; - - // Read the element type name. - gi = readNmtoken(true); - - // If we saw no DTD, and this is the document root element, - // let the application modify the input stream by providing one. - if (maybeGetSubset) - { - InputSource subset = handler.getExternalSubset(gi, - handler.getSystemId()); - if (subset != null) - { - String publicId = subset.getPublicId(); - String systemId = subset.getSystemId(); - - handler.warn("modifying document by adding DTD"); - handler.doctypeDecl(gi, publicId, systemId); - pushString(null, ">"); - - // NOTE: [dtd] is so we say what SAX2 expects, - // though it's misleading (subset, not entire dtd) - pushURL(true, "[dtd]", - new ExternalIdentifiers(publicId, systemId, null), - subset.getCharacterStream(), - subset.getByteStream(), - subset.getEncoding(), - false); - - // Loop until we end up back at '>' - while (true) - { - doReport = expandPE = true; - skipWhitespace(); - doReport = expandPE = false; - if (tryRead('>')) - { - break; - } - else - { - expandPE = true; - parseMarkupdecl(); - expandPE = false; - } - } - - // the ">" string isn't popped yet - if (inputStack.size() != 1) - { - error("external subset has unmatched '>'"); - } - - handler.endDoctype(); - } - } - - // Determine the current content type. - currentElement = gi; - element = (ElementDecl) elementInfo.get(gi); - currentElementContent = getContentType(element, CONTENT_ANY); - - // Read the attributes, if any. - // After this loop, "c" is the closing delimiter. - boolean white = tryWhitespace(); - c = readCh(); - while (c != '/' && c != '>') - { - unread(c); - if (!white) - { - error("need whitespace between attributes"); - } - parseAttribute(gi); - white = tryWhitespace(); - c = readCh(); - } - - // Supply any defaulted attributes. - Iterator atts = declaredAttributes(element); - if (atts != null) - { - String aname; -loop: - while (atts.hasNext()) - { - aname = (String) atts.next(); - // See if it was specified. - for (int i = 0; i < tagAttributePos; i++) - { - if (tagAttributes[i] == aname) - { - continue loop; - } - } - // ... or has a default - String value = getAttributeDefaultValue(gi, aname); - - if (value == null) - { - continue; - } - handler.attribute(aname, value, false); - } - } - - // Figure out if this is a start tag - // or an empty element, and dispatch an - // event accordingly. - switch (c) - { - case '>': - handler.startElement(gi); - parseContent(); - break; - case '/': - require('>'); - handler.startElement(gi); - handler.endElement(gi); - break; - } - - // Restore the previous state. - currentElement = oldElement; - currentElementContent = oldElementContent; - } - - /** - * Parse an attribute assignment. - * <pre> - * [41] Attribute ::= Name Eq AttValue - * </pre> - * @param name The name of the attribute's element. - * @see SAXDriver#attribute - */ - private void parseAttribute(String name) - throws Exception - { - String aname; - String type; - String value; - int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; - - // Read the attribute name. - aname = readNmtoken(true); - type = getAttributeType(name, aname); - - // Parse '=' - parseEq(); - - // Read the value, normalizing whitespace - // unless it is CDATA. - if (handler.stringInterning) - { - if (type == "CDATA" || type == null) - { - value = readLiteral(flags); - } - else - { - value = readLiteral(flags | LIT_NORMALIZE); - } - } - else - { - if (type.equals("CDATA") || type == null) - { - value = readLiteral(flags); - } - else - { - value = readLiteral(flags | LIT_NORMALIZE); - } - } - - // WFC: no duplicate attributes - for (int i = 0; i < tagAttributePos; i++) - { - if (aname.equals(tagAttributes [i])) - { - error("duplicate attribute", aname, null); - } - } - - // Inform the handler about the - // attribute. - handler.attribute(aname, value, true); - dataBufferPos = 0; - - // Note that the attribute has been - // specified. - if (tagAttributePos == tagAttributes.length) - { - String newAttrib[] = new String[tagAttributes.length * 2]; - System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); - tagAttributes = newAttrib; - } - tagAttributes[tagAttributePos++] = aname; - } - - /** - * Parse an equals sign surrounded by optional whitespace. - * <pre> - * [25] Eq ::= S? '=' S? - * </pre> - */ - private void parseEq() - throws SAXException, IOException - { - skipWhitespace(); - require('='); - skipWhitespace(); - } - - /** - * Parse an end tag. - * <pre> - * [42] ETag ::= '</' Name S? '>' - * </pre> - * <p>NOTE: parseContent () chains to here, we already read the - * "</". - */ - private void parseETag() - throws Exception - { - require(currentElement); - skipWhitespace(); - require('>'); - handler.endElement(currentElement); - // not re-reporting any SAXException re bogus end tags, - // even though that diagnostic might be clearer ... - } - - /** - * Parse the content of an element. - * <pre> - * [43] content ::= (element | CharData | Reference - * | CDSect | PI | Comment)* - * [67] Reference ::= EntityRef | CharRef - * </pre> - * <p> NOTE: consumes ETtag. - */ - private void parseContent() - throws Exception - { - char c; - - while (true) - { - // consume characters (or ignorable whitspace) until delimiter - parseCharData(); - - // Handle delimiters - c = readCh(); - switch (c) - { - case '&': // Found "&" - c = readCh(); - if (c == '#') - { - parseCharRef(); - } - else - { - unread(c); - parseEntityRef(true); - } - isDirtyCurrentElement = true; - break; - - case '<': // Found "<" - dataBufferFlush(); - c = readCh(); - switch (c) - { - case '!': // Found "<!" - c = readCh(); - switch (c) - { - case '-': // Found "<!-" - require('-'); - isDirtyCurrentElement = false; - parseComment(); - break; - case '[': // Found "<![" - isDirtyCurrentElement = false; - require("CDATA["); - handler.startCDATA(); - inCDATA = true; - parseCDSect(); - inCDATA = false; - handler.endCDATA(); - break; - default: - error("expected comment or CDATA section", c, null); - break; - } - break; - - case '?': // Found "<?" - isDirtyCurrentElement = false; - parsePI(); - break; - - case '/': // Found "</" - isDirtyCurrentElement = false; - parseETag(); - return; - - default: // Found "<" followed by something else - isDirtyCurrentElement = false; - unread(c); - parseElement(false); - break; - } - } - } - } - - /** - * Parse an element type declaration. - * <pre> - * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' - * </pre> - * <p> NOTE: the '<!ELEMENT' has already been read. - */ - private void parseElementDecl() - throws Exception - { - String name; - - requireWhitespace(); - // Read the element type name. - name = readNmtoken(true); - - requireWhitespace(); - // Read the content model. - parseContentspec(name); - - skipWhitespace(); - require('>'); - } - - /** - * Content specification. - * <pre> - * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements - * </pre> - */ - private void parseContentspec(String name) - throws Exception - { - // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ... - if (tryRead("EMPTY")) - { - setElement(name, CONTENT_EMPTY, null, null); - if (!skippedPE) - { - handler.getDeclHandler().elementDecl(name, "EMPTY"); - } - return; - } - else if (tryRead("ANY")) - { - setElement(name, CONTENT_ANY, null, null); - if (!skippedPE) - { - handler.getDeclHandler().elementDecl(name, "ANY"); - } - return; - } - else - { - String model; - char[] saved; - - require('('); - saved = readBuffer; - dataBufferAppend('('); - skipWhitespace(); - if (tryRead("#PCDATA")) - { - dataBufferAppend("#PCDATA"); - parseMixed(saved); - model = dataBufferToString(); - setElement(name, CONTENT_MIXED, model, null); - } - else - { - parseElements(saved); - model = dataBufferToString(); - setElement(name, CONTENT_ELEMENTS, model, null); - } - if (!skippedPE) - { - handler.getDeclHandler().elementDecl(name, model); - } - } - } - - /** - * Parse an element-content model. - * <pre> - * [47] elements ::= (choice | seq) ('?' | '*' | '+')? - * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')' - * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')' - * </pre> - * - * <p> NOTE: the opening '(' and S have already been read. - * - * @param saved Buffer for entity that should have the terminal ')' - */ - private void parseElements(char[] saved) - throws Exception - { - char c; - char sep; - - // Parse the first content particle - skipWhitespace(); - parseCp(); - - // Check for end or for a separator. - skipWhitespace(); - c = readCh(); - switch (c) - { - case ')': - // VC: Proper Group/PE Nesting - if (readBuffer != saved) - { - handler.verror("Illegal Group/PE nesting"); - } - - dataBufferAppend(')'); - c = readCh(); - switch (c) - { - case '*': - case '+': - case '?': - dataBufferAppend(c); - break; - default: - unread(c); - } - return; - case ',': // Register the separator. - case '|': - sep = c; - dataBufferAppend(c); - break; - default: - error("bad separator in content model", c, null); - return; - } - - // Parse the rest of the content model. - while (true) - { - skipWhitespace(); - parseCp(); - skipWhitespace(); - c = readCh(); - if (c == ')') - { - // VC: Proper Group/PE Nesting - if (readBuffer != saved) - { - handler.verror("Illegal Group/PE nesting"); - } - - dataBufferAppend(')'); - break; - } - else if (c != sep) - { - error("bad separator in content model", c, null); - return; - } - else - { - dataBufferAppend(c); - } - } - - // Check for the occurrence indicator. - c = readCh(); - switch (c) - { - case '?': - case '*': - case '+': - dataBufferAppend(c); - return; - default: - unread(c); - return; - } - } - - /** - * Parse a content particle. - * <pre> - * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? - * </pre> - */ - private void parseCp() - throws Exception - { - if (tryRead('(')) - { - dataBufferAppend('('); - parseElements(readBuffer); - } - else - { - dataBufferAppend(readNmtoken(true)); - char c = readCh(); - switch (c) - { - case '?': - case '*': - case '+': - dataBufferAppend(c); - break; - default: - unread(c); - break; - } - } - } - - /** - * Parse mixed content. - * <pre> - * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*' - * | '(' S? ('#PCDATA') S? ')' - * </pre> - * - * @param saved Buffer for entity that should have the terminal ')' - */ - private void parseMixed(char[] saved) - throws Exception - { - // Check for PCDATA alone. - skipWhitespace(); - if (tryRead(')')) - { - // VC: Proper Group/PE Nesting - if (readBuffer != saved) - { - handler.verror("Illegal Group/PE nesting"); - } - - dataBufferAppend(")*"); - tryRead('*'); - return; - } - - // Parse mixed content. - skipWhitespace(); - while (!tryRead(")")) - { - require('|'); - dataBufferAppend('|'); - skipWhitespace(); - dataBufferAppend(readNmtoken(true)); - skipWhitespace(); - } - - // VC: Proper Group/PE Nesting - if (readBuffer != saved) - { - handler.verror("Illegal Group/PE nesting"); - } - - require('*'); - dataBufferAppend(")*"); - } - - /** - * Parse an attribute list declaration. - * <pre> - * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' - * </pre> - * <p>NOTE: the '<!ATTLIST' has already been read. - */ - private void parseAttlistDecl() - throws Exception - { - String elementName; - - requireWhitespace(); - elementName = readNmtoken(true); - boolean white = tryWhitespace(); - while (!tryRead('>')) - { - if (!white) - { - error("whitespace required before attribute definition"); - } - parseAttDef(elementName); - white = tryWhitespace(); - } - } - - /** - * Parse a single attribute definition. - * <pre> - * [53] AttDef ::= S Name S AttType S DefaultDecl - * </pre> - */ - private void parseAttDef(String elementName) - throws Exception - { - String name; - String type; - String enumer = null; - - // Read the attribute name. - name = readNmtoken(true); - - // Read the attribute type. - requireWhitespace(); - type = readAttType(); - - // Get the string of enumerated values if necessary. - if (handler.stringInterning) - { - if ("ENUMERATION" == type || "NOTATION" == type) - { - enumer = dataBufferToString(); - } - } - else - { - if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) - { - enumer = dataBufferToString(); - } - } - - // Read the default value. - requireWhitespace(); - parseDefault(elementName, name, type, enumer); - } - - /** - * Parse the attribute type. - * <pre> - * [54] AttType ::= StringType | TokenizedType | EnumeratedType - * [55] StringType ::= 'CDATA' - * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' - * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' - * [57] EnumeratedType ::= NotationType | Enumeration - * </pre> - */ - private String readAttType() - throws Exception - { - if (tryRead('(')) - { - parseEnumeration(false); - return "ENUMERATION"; - } - else - { - String typeString = readNmtoken(true); - if (handler.stringInterning) - { - if ("NOTATION" == typeString) - { - parseNotationType(); - return typeString; - } - else if ("CDATA" == typeString - || "ID" == typeString - || "IDREF" == typeString - || "IDREFS" == typeString - || "ENTITY" == typeString - || "ENTITIES" == typeString - || "NMTOKEN" == typeString - || "NMTOKENS" == typeString) - { - return typeString; - } - } - else - { - if ("NOTATION".equals(typeString)) - { - parseNotationType(); - return typeString; - } - else if ("CDATA".equals(typeString) - || "ID".equals(typeString) - || "IDREF".equals(typeString) - || "IDREFS".equals(typeString) - || "ENTITY".equals(typeString) - || "ENTITIES".equals(typeString) - || "NMTOKEN".equals(typeString) - || "NMTOKENS".equals(typeString)) - { - return typeString; - } - } - error("illegal attribute type", typeString, null); - return null; - } - } - - /** - * Parse an enumeration. - * <pre> - * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' - * </pre> - * <p>NOTE: the '(' has already been read. - */ - private void parseEnumeration(boolean isNames) - throws Exception - { - dataBufferAppend('('); - - // Read the first token. - skipWhitespace(); - dataBufferAppend(readNmtoken(isNames)); - // Read the remaining tokens. - skipWhitespace(); - while (!tryRead(')')) - { - require('|'); - dataBufferAppend('|'); - skipWhitespace(); - dataBufferAppend(readNmtoken (isNames)); - skipWhitespace(); - } - dataBufferAppend(')'); - } - - /** - * Parse a notation type for an attribute. - * <pre> - * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks - * (S? '|' S? name)* S? ')' - * </pre> - * <p>NOTE: the 'NOTATION' has already been read - */ - private void parseNotationType() - throws Exception - { - requireWhitespace(); - require('('); - - parseEnumeration(true); - } - - /** - * Parse the default value for an attribute. - * <pre> - * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' - * | (('#FIXED' S)? AttValue) - * </pre> - */ - private void parseDefault(String elementName, String name, - String type, String enumer) - throws Exception - { - int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; - String value = null; - int flags = LIT_ATTRIBUTE; - boolean saved = expandPE; - String defaultType = null; - - // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace - // chars to spaces (doesn't matter when that's done if it doesn't - // interfere with char refs expanding to whitespace). - - if (!skippedPE) - { - flags |= LIT_ENTITY_REF; - if (handler.stringInterning) - { - if ("CDATA" != type) - { - flags |= LIT_NORMALIZE; - } - } - else - { - if (!"CDATA".equals(type)) - { - flags |= LIT_NORMALIZE; - } - } - } - - expandPE = false; - if (tryRead('#')) - { - if (tryRead("FIXED")) - { - defaultType = "#FIXED"; - valueType = ATTRIBUTE_DEFAULT_FIXED; - requireWhitespace(); - value = readLiteral(flags); - } - else if (tryRead("REQUIRED")) - { - defaultType = "#REQUIRED"; - valueType = ATTRIBUTE_DEFAULT_REQUIRED; - } - else if (tryRead("IMPLIED")) - { - defaultType = "#IMPLIED"; - valueType = ATTRIBUTE_DEFAULT_IMPLIED; - } - else - { - error("illegal keyword for attribute default value"); - } - } - else - { - value = readLiteral(flags); - } - expandPE = saved; - setAttribute(elementName, name, type, enumer, value, valueType); - if (handler.stringInterning) - { - if ("ENUMERATION" == type) - { - type = enumer; - } - else if ("NOTATION" == type) - { - type = "NOTATION " + enumer; - } - } - else - { - if ("ENUMERATION".equals(type)) - { - type = enumer; - } - else if ("NOTATION".equals(type)) - { - type = "NOTATION " + enumer; - } - } - if (!skippedPE) - { - handler.getDeclHandler().attributeDecl(elementName, name, type, - defaultType, value); - } - } - - /** - * Parse a conditional section. - * <pre> - * [61] conditionalSect ::= includeSect || ignoreSect - * [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' - * extSubsetDecl ']]>' - * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' - * ignoreSectContents* ']]>' - * [64] ignoreSectContents ::= Ignore - * ('<![' ignoreSectContents* ']]>' Ignore )* - * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* ) - * </pre> - * <p> NOTE: the '>![' has already been read. - */ - private void parseConditionalSect(char[] saved) - throws Exception - { - skipWhitespace(); - if (tryRead("INCLUDE")) - { - skipWhitespace(); - require('['); - // VC: Proper Conditional Section/PE Nesting - if (readBuffer != saved) - { - handler.verror("Illegal Conditional Section/PE nesting"); - } - skipWhitespace(); - while (!tryRead("]]>")) - { - parseMarkupdecl(); - skipWhitespace(); - } - } - else if (tryRead("IGNORE")) - { - skipWhitespace(); - require('['); - // VC: Proper Conditional Section/PE Nesting - if (readBuffer != saved) - { - handler.verror("Illegal Conditional Section/PE nesting"); - } - int nesting = 1; - char c; - expandPE = false; - for (int nest = 1; nest > 0; ) - { - c = readCh(); - switch (c) - { - case '<': - if (tryRead("![")) - { - nest++; - } - case ']': - if (tryRead("]>")) - { - nest--; - } - } - } - expandPE = true; - } - else - { - error("conditional section must begin with INCLUDE or IGNORE"); - } - } - - private void parseCharRef() - throws SAXException, IOException - { - parseCharRef(true /* do flushDataBuffer by default */); - } - - /** - * Try to read a character reference without consuming data from buffer. - * <pre> - * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' - * </pre> - * <p>NOTE: the '&#' has already been read. - */ - private void tryReadCharRef() - throws SAXException, IOException - { - int value = 0; - char c; - - if (tryRead('x')) - { -loop1: - while (true) - { - c = readCh(); - if (c == ';') - { - break loop1; - } - else - { - int n = Character.digit(c, 16); - if (n == -1) - { - error("illegal character in character reference", c, null); - break loop1; - } - value *= 16; - value += n; - } - } - } - else - { -loop2: - while (true) - { - c = readCh(); - if (c == ';') - { - break loop2; - } - else - { - int n = Character.digit(c, 10); - if (n == -1) - { - error("illegal character in character reference", c, null); - break loop2; - } - value *= 10; - value += n; - } - } - } - - // check for character refs being legal XML - if ((value < 0x0020 - && ! (value == '\n' || value == '\t' || value == '\r')) - || (value >= 0xD800 && value <= 0xDFFF) - || value == 0xFFFE || value == 0xFFFF - || value > 0x0010ffff) - { - error("illegal XML character reference U+" - + Integer.toHexString(value)); - } - - // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz - // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: - if (value > 0x0010ffff) - { - // too big for surrogate - error("character reference " + value + " is too large for UTF-16", - new Integer(value).toString(), null); - } - - } - - /** - * Read and interpret a character reference. - * <pre> - * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' - * </pre> - * <p>NOTE: the '&#' has already been read. - */ - private void parseCharRef(boolean doFlush) - throws SAXException, IOException - { - int value = 0; - char c; - - if (tryRead('x')) - { -loop1: - while (true) - { - c = readCh(); - if (c == ';') - { - break loop1; - } - else - { - int n = Character.digit(c, 16); - if (n == -1) - { - error("illegal character in character reference", c, null); - break loop1; - } - value *= 16; - value += n; - } - } - } - else - { -loop2: - while (true) - { - c = readCh(); - if (c == ';') - { - break loop2; - } - else - { - int n = Character.digit(c, 10); - if (n == -1) - { - error("illegal character in character reference", c, null); - break loop2; - } - value *= 10; - value += c - '0'; - } - } - } - - // check for character refs being legal XML - if ((value < 0x0020 - && ! (value == '\n' || value == '\t' || value == '\r')) - || (value >= 0xD800 && value <= 0xDFFF) - || value == 0xFFFE || value == 0xFFFF - || value > 0x0010ffff) - { - error("illegal XML character reference U+" - + Integer.toHexString(value)); - } - - // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz - // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: - if (value <= 0x0000ffff) - { - // no surrogates needed - dataBufferAppend((char) value); - } - else if (value <= 0x0010ffff) - { - value -= 0x10000; - // > 16 bits, surrogate needed - dataBufferAppend((char) (0xd800 | (value >> 10))); - dataBufferAppend((char) (0xdc00 | (value & 0x0003ff))); - } - else - { - // too big for surrogate - error("character reference " + value + " is too large for UTF-16", - new Integer(value).toString(), null); - } - if (doFlush) - { - dataBufferFlush(); - } - } - - /** - * Parse and expand an entity reference. - * <pre> - * [68] EntityRef ::= '&' Name ';' - * </pre> - * <p>NOTE: the '&' has already been read. - * @param externalAllowed External entities are allowed here. - */ - private void parseEntityRef(boolean externalAllowed) - throws SAXException, IOException - { - String name; - - name = readNmtoken(true); - require(';'); - switch (getEntityType(name)) - { - case ENTITY_UNDECLARED: - // NOTE: XML REC describes amazingly convoluted handling for - // this case. Nothing as meaningful as being a WFness error - // unless the processor might _legitimately_ not have seen a - // declaration ... which is what this implements. - String message; - - message = "reference to undeclared general entity " + name; - if (skippedPE && !docIsStandalone) - { - handler.verror(message); - // we don't know this entity, and it might be external... - if (externalAllowed) - { - handler.skippedEntity(name); - } - } - else - { - error(message); - } - break; - case ENTITY_INTERNAL: - pushString(name, getEntityValue(name)); - - //workaround for possible input pop before marking - //the buffer reading position - char t = readCh(); - unread(t); - int bufferPosMark = readBufferPos; - - int end = readBufferPos + getEntityValue(name).length(); - for (int k = readBufferPos; k < end; k++) - { - t = readCh(); - if (t == '&') - { - t = readCh(); - if (t == '#') - { - //try to match a character ref - tryReadCharRef(); - - //everything has been read - if (readBufferPos >= end) - { - break; - } - k = readBufferPos; - continue; - } - else if (Character.isLetter(t)) - { - //looks like an entity ref - unread(t); - readNmtoken(true); - require(';'); - - //everything has been read - if (readBufferPos >= end) - { - break; - } - k = readBufferPos; - continue; - } - error(" malformed entity reference"); - } - - } - readBufferPos = bufferPosMark; - break; - case ENTITY_TEXT: - if (externalAllowed) - { - pushURL(false, name, getEntityIds(name), - null, null, null, true); - } - else - { - error("reference to external entity in attribute value.", - name, null); - } - break; - case ENTITY_NDATA: - if (externalAllowed) - { - error("unparsed entity reference in content", name, null); - } - else - { - error("reference to external entity in attribute value.", - name, null); - } - break; - default: - throw new RuntimeException(); - } - } - - /** - * Parse and expand a parameter entity reference. - * <pre> - * [69] PEReference ::= '%' Name ';' - * </pre> - * <p>NOTE: the '%' has already been read. - */ - private void parsePEReference() - throws SAXException, IOException - { - String name; - - name = "%" + readNmtoken(true); - require(';'); - switch (getEntityType(name)) - { - case ENTITY_UNDECLARED: - // VC: Entity Declared - handler.verror("reference to undeclared parameter entity " + name); - - // we should disable handling of all subsequent declarations - // unless this is a standalone document (info discarded) - break; - case ENTITY_INTERNAL: - if (inLiteral) - { - pushString(name, getEntityValue(name)); - } - else - { - pushString(name, ' ' + getEntityValue(name) + ' '); - } - break; - case ENTITY_TEXT: - if (!inLiteral) - { - pushString(null, " "); - } - pushURL(true, name, getEntityIds(name), null, null, null, true); - if (!inLiteral) - { - pushString(null, " "); - } - break; - } - } - - /** - * Parse an entity declaration. - * <pre> - * [70] EntityDecl ::= GEDecl | PEDecl - * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' - * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' - * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) - * [74] PEDef ::= EntityValue | ExternalID - * [75] ExternalID ::= 'SYSTEM' S SystemLiteral - * | 'PUBLIC' S PubidLiteral S SystemLiteral - * [76] NDataDecl ::= S 'NDATA' S Name - * </pre> - * <p>NOTE: the '<!ENTITY' has already been read. - */ - private void parseEntityDecl() - throws Exception - { - boolean peFlag = false; - int flags = 0; - - // Check for a parameter entity. - expandPE = false; - requireWhitespace(); - if (tryRead('%')) - { - peFlag = true; - requireWhitespace(); - } - expandPE = true; - - // Read the entity name, and prepend - // '%' if necessary. - String name = readNmtoken(true); - //NE08 - if (name.indexOf(':') >= 0) - { - error("Illegal character(':') in entity name ", name, null); - } - if (peFlag) - { - name = "%" + name; - } - - // Read the entity value. - requireWhitespace(); - char c = readCh(); - unread (c); - if (c == '"' || c == '\'') - { - // Internal entity ... replacement text has expanded refs - // to characters and PEs, but not to general entities - String value = readLiteral(flags); - setInternalEntity(name, value); - } - else - { - // Read the external IDs - ExternalIdentifiers ids = readExternalIds(false, false); - - // Check for NDATA declaration. - boolean white = tryWhitespace(); - if (!peFlag && tryRead("NDATA")) - { - if (!white) - { - error("whitespace required before NDATA"); - } - requireWhitespace(); - String notationName = readNmtoken(true); - if (!skippedPE) - { - setExternalEntity(name, ENTITY_NDATA, ids, notationName); - handler.unparsedEntityDecl(name, ids.publicId, ids.systemId, - ids.baseUri, notationName); - } - } - else if (!skippedPE) - { - setExternalEntity(name, ENTITY_TEXT, ids, null); - handler.getDeclHandler() - .externalEntityDecl(name, ids.publicId, - handler.resolveURIs() - // FIXME: ASSUMES not skipped - // "false" forces error on bad URI - ? handler.absolutize(ids.baseUri, - ids.systemId, - false) - : ids.systemId); - } - } - - // Finish the declaration. - skipWhitespace(); - require('>'); - } - - /** - * Parse a notation declaration. - * <pre> - * [82] NotationDecl ::= '<!NOTATION' S Name S - * (ExternalID | PublicID) S? '>' - * [83] PublicID ::= 'PUBLIC' S PubidLiteral - * </pre> - * <P>NOTE: the '<!NOTATION' has already been read. - */ - private void parseNotationDecl() - throws Exception - { - String nname; - ExternalIdentifiers ids; - - requireWhitespace(); - nname = readNmtoken(true); - //NE08 - if (nname.indexOf(':') >= 0) - { - error("Illegal character(':') in notation name ", nname, null); - } - requireWhitespace(); - - // Read the external identifiers. - ids = readExternalIds(true, false); - - // Register the notation. - setNotation(nname, ids); - - skipWhitespace(); - require('>'); - } - - /** - * Parse character data. - * <pre> - * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) - * </pre> - */ - private void parseCharData() - throws Exception - { - char c; - int state = 0; - boolean pureWhite = false; - - // assert (dataBufferPos == 0); - - // are we expecting pure whitespace? it might be dirty... - if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement) - { - pureWhite = true; - } - - // always report right out of readBuffer - // to minimize (pointless) buffer copies - while (true) - { - int lineAugment = 0; - int columnAugment = 0; - int i; - -loop: - for (i = readBufferPos; i < readBufferLength; i++) - { - switch (c = readBuffer[i]) - { - case '\n': - lineAugment++; - columnAugment = 0; - // pureWhite unmodified - break; - case '\r': // should not happen!! - case '\t': - case ' ': - // pureWhite unmodified - columnAugment++; - break; - case '&': - case '<': - columnAugment++; - // pureWhite unmodified - // CLEAN end of text sequence - state = 1; - break loop; - case ']': - // that's not a whitespace char, and - // can not terminate pure whitespace either - pureWhite = false; - if ((i + 2) < readBufferLength) - { - if (readBuffer [i + 1] == ']' - && readBuffer [i + 2] == '>') - { - // ERROR end of text sequence - state = 2; - break loop; - } - } - else - { - // FIXME missing two end-of-buffer cases - } - columnAugment++; - break; - default: - if ((c < 0x0020 || c > 0xFFFD) - || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) - && xmlVersion == XML_11)) - { - error("illegal XML character U+" - + Integer.toHexString(c)); - } - // that's not a whitespace char - pureWhite = false; - columnAugment++; - } - } - - // report text thus far - if (lineAugment > 0) - { - line += lineAugment; - column = columnAugment; - } - else - { - column += columnAugment; - } - - // report characters/whitspace - int length = i - readBufferPos; - - if (length != 0) - { - if (pureWhite) - { - handler.ignorableWhitespace(readBuffer, - readBufferPos, length); - } - else - { - handler.charData(readBuffer, readBufferPos, length); - } - readBufferPos = i; - } - - if (state != 0) - { - break; - } - - // fill next buffer from this entity, or - // pop stack and continue with previous entity - unread(readCh()); - } - if (!pureWhite) - { - isDirtyCurrentElement = true; - } - // finish, maybe with error - if (state != 1) // finish, no error - { - error("character data may not contain ']]>'"); - } - } - - ////////////////////////////////////////////////////////////////////// - // High-level reading and scanning methods. - ////////////////////////////////////////////////////////////////////// - - /** - * Require whitespace characters. - */ - private void requireWhitespace() - throws SAXException, IOException - { - char c = readCh(); - if (isWhitespace(c)) - { - skipWhitespace(); - } - else - { - error("whitespace required", c, null); - } - } - - /** - * Skip whitespace characters. - * <pre> - * [3] S ::= (#x20 | #x9 | #xd | #xa)+ - * </pre> - */ - private void skipWhitespace() - throws SAXException, IOException - { - // Start with a little cheat. Most of - // the time, the white space will fall - // within the current read buffer; if - // not, then fall through. - if (USE_CHEATS) - { - int lineAugment = 0; - int columnAugment = 0; - -loop: - for (int i = readBufferPos; i < readBufferLength; i++) - { - switch (readBuffer[i]) - { - case ' ': - case '\t': - case '\r': - columnAugment++; - break; - case '\n': - lineAugment++; - columnAugment = 0; - break; - case '%': - if (expandPE) - { - break loop; - } - // else fall through... - default: - readBufferPos = i; - if (lineAugment > 0) - { - line += lineAugment; - column = columnAugment; - } - else - { - column += columnAugment; - } - return; - } - } - } - - // OK, do it the slow way. - char c = readCh (); - while (isWhitespace(c)) - { - c = readCh(); - } - unread(c); - } - - /** - * Read a name or (when parsing an enumeration) name token. - * <pre> - * [5] Name ::= (Letter | '_' | ':') (NameChar)* - * [7] Nmtoken ::= (NameChar)+ - * </pre> - */ - private String readNmtoken(boolean isName) - throws SAXException, IOException - { - char c; - - if (USE_CHEATS) - { -loop: - for (int i = readBufferPos; i < readBufferLength; i++) - { - c = readBuffer[i]; - switch (c) - { - case '%': - if (expandPE) - { - break loop; - } - // else fall through... - - // What may legitimately come AFTER a name/nmtoken? - case '<': case '>': case '&': - case ',': case '|': case '*': case '+': case '?': - case ')': - case '=': - case '\'': case '"': - case '[': - case ' ': case '\t': case '\r': case '\n': - case ';': - case '/': - int start = readBufferPos; - if (i == start) - { - error("name expected", readBuffer[i], null); - } - readBufferPos = i; - return intern(readBuffer, start, i - start); - - default: - // FIXME ... per IBM's OASIS test submission, these: - // ? U+06dd - // Combining U+309B - //these switches are kind of ugly but at least we won't - //have to go over the whole lits for each char - if (isName && i == readBufferPos) - { - char c2 = (char) (c & 0x00f0); - switch (c & 0xff00) - { - //starting with 01 - case 0x0100: - switch (c2) - { - case 0x0030: - if (c == 0x0132 || c == 0x0133 || c == 0x013f) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x0040: - if (c == 0x0140 || c == 0x0149) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x00c0: - if (c == 0x01c4 || c == 0x01cc) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x00f0: - if (c == 0x01f1 || c == 0x01f3) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x00b0: - if (c == 0x01f1 || c == 0x01f3) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - default: - if (c == 0x017f) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - } - - break; - //starting with 11 - case 0x1100: - switch (c2) - { - case 0x0000: - if (c == 0x1104 || c == 0x1108 || - c == 0x110a || c == 0x110d) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x0030: - if (c == 0x113b || c == 0x113f) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x0040: - if (c == 0x1141 || c == 0x114d - || c == 0x114f ) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x0050: - if (c == 0x1151 || c == 0x1156) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x0060: - if (c == 0x1162 || c == 0x1164 - || c == 0x1166 || c == 0x116b - || c == 0x116f) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - case 0x00b0: - if (c == 0x11b6 || c == 0x11b9 - || c == 0x11bb || c == 0x116f) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - break; - default: - if (c == 0x1174 || c == 0x119f - || c == 0x11ac || c == 0x11c3 - || c == 0x11f1) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - } - break; - default: - if (c == 0x0e46 || c == 0x1011 - || c == 0x212f || c == 0x0587 - || c == 0x0230 ) - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - } - } - // punt on exact tests from Appendix A; approximate - // them using the Unicode ID start/part rules - if (i == readBufferPos && isName) - { - if (!Character.isUnicodeIdentifierStart(c) - && c != ':' && c != '_') - { - error("Not a name start character, U+" - + Integer.toHexString(c)); - } - } - else if (!Character.isUnicodeIdentifierPart(c) - && c != '-' && c != ':' && c != '_' && c != '.' - && !isExtender(c)) - { - error("Not a name character, U+" - + Integer.toHexString(c)); - } - } - } - } - - nameBufferPos = 0; - - // Read the first character. -loop: - while (true) - { - c = readCh(); - switch (c) - { - case '%': - case '<': case '>': case '&': - case ',': case '|': case '*': case '+': case '?': - case ')': - case '=': - case '\'': case '"': - case '[': - case ' ': case '\t': case '\n': case '\r': - case ';': - case '/': - unread(c); - if (nameBufferPos == 0) - { - error ("name expected"); - } - // punt on exact tests from Appendix A, but approximate them - if (isName - && !Character.isUnicodeIdentifierStart(nameBuffer[0]) - && ":_".indexOf(nameBuffer[0]) == -1) - { - error("Not a name start character, U+" - + Integer.toHexString(nameBuffer[0])); - } - String s = intern(nameBuffer, 0, nameBufferPos); - nameBufferPos = 0; - return s; - default: - // punt on exact tests from Appendix A, but approximate them - - if ((nameBufferPos != 0 || !isName) - && !Character.isUnicodeIdentifierPart(c) - && ":-_.".indexOf(c) == -1 - && !isExtender(c)) - { - error("Not a name character, U+" - + Integer.toHexString(c)); - } - if (nameBufferPos >= nameBuffer.length) - { - nameBuffer = - (char[]) extendArray(nameBuffer, - nameBuffer.length, nameBufferPos); - } - nameBuffer[nameBufferPos++] = c; - } - } - } - - private static boolean isExtender(char c) - { - // [88] Extender ::= ... - return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 - || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005 - || (c >= 0x3031 && c <= 0x3035) - || (c >= 0x309d && c <= 0x309e) - || (c >= 0x30fc && c <= 0x30fe); - } - - /** - * Read a literal. With matching single or double quotes as - * delimiters (and not embedded!) this is used to parse: - * <pre> - * [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ... - * [10] AttValue ::= ... ([^<&] | Reference)* ... - * [11] SystemLiteral ::= ... (URLchar - "'")* ... - * [12] PubidLiteral ::= ... (PubidChar - "'")* ... - * </pre> - * as well as the quoted strings in XML and text declarations - * (for version, encoding, and standalone) which have their - * own constraints. - */ - private String readLiteral(int flags) - throws SAXException, IOException - { - char delim, c; - int startLine = line; - boolean saved = expandPE; - boolean savedReport = doReport; - - // Find the first delimiter. - delim = readCh(); - if (delim != '"' && delim != '\'') - { - error("expected '\"' or \"'\"", delim, null); - return null; - } - inLiteral = true; - if ((flags & LIT_DISABLE_PE) != 0) - { - expandPE = false; - } - doReport = false; - - // Each level of input source has its own buffer; remember - // ours, so we won't read the ending delimiter from any - // other input source, regardless of entity processing. - char[] ourBuf = readBuffer; - - // Read the literal. - try - { - c = readCh(); - boolean ampRead = false; -loop: - while (! (c == delim && readBuffer == ourBuf)) - { - switch (c) - { - // attributes and public ids are normalized - // in almost the same ways - case '\n': - case '\r': - if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) - { - c = ' '; - } - break; - case '\t': - if ((flags & LIT_ATTRIBUTE) != 0) - { - c = ' '; - } - break; - case '&': - c = readCh(); - // Char refs are expanded immediately, except for - // all the cases where it's deferred. - if (c == '#') - { - if ((flags & LIT_DISABLE_CREF) != 0) - { - dataBufferAppend('&'); - break; - } - parseCharRef(false /* Do not do flushDataBuffer */); - - // exotic WFness risk: this is an entity literal, - // dataBuffer [dataBufferPos - 1] == '&', and - // following chars are a _partial_ entity/char ref - - // It looks like an entity ref ... - } - else - { - unread(c); - // Expand it? - if ((flags & LIT_ENTITY_REF) > 0) - { - parseEntityRef(false); - if (String.valueOf(readBuffer).equals("&")) - { - ampRead = true; - } - //Is it just data? - } - else if ((flags & LIT_DISABLE_EREF) != 0) - { - dataBufferAppend('&'); - - // OK, it will be an entity ref -- expanded later. - } - else - { - String name = readNmtoken(true); - require(';'); - dataBufferAppend('&'); - dataBufferAppend(name); - dataBufferAppend(';'); - } - } - c = readCh(); - continue loop; - - case '<': - // and why? Perhaps so "&foo;" expands the same - // inside and outside an attribute? - if ((flags & LIT_ATTRIBUTE) != 0) - { - error("attribute values may not contain '<'"); - } - break; - - // We don't worry about case '%' and PE refs, readCh does. - - default: - break; - } - dataBufferAppend(c); - c = readCh(); - } - } - catch (EOFException e) - { - error("end of input while looking for delimiter (started on line " - + startLine + ')', null, new Character(delim).toString()); - } - inLiteral = false; - expandPE = saved; - doReport = savedReport; - - // Normalise whitespace if necessary. - if ((flags & LIT_NORMALIZE) > 0) - { - dataBufferNormalize(); - } - - // Return the value. - return dataBufferToString(); - } - - /** - * Try reading external identifiers. - * A system identifier is not required for notations. - * @param inNotation Are we parsing a notation decl? - * @param isSubset Parsing external subset decl (may be omitted)? - * @return A three-member String array containing the identifiers, - * or nulls. Order: public, system, baseURI. - */ - private ExternalIdentifiers readExternalIds(boolean inNotation, - boolean isSubset) - throws Exception - { - char c; - ExternalIdentifiers ids = new ExternalIdentifiers(); - int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; - - if (tryRead("PUBLIC")) - { - requireWhitespace(); - ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags); - if (inNotation) - { - skipWhitespace(); - c = readCh(); - unread(c); - if (c == '"' || c == '\'') - { - ids.systemId = readLiteral(flags); - } - } - else - { - requireWhitespace(); - ids.systemId = readLiteral(flags); - } - - for (int i = 0; i < ids.publicId.length(); i++) - { - c = ids.publicId.charAt(i); - if (c >= 'a' && c <= 'z') - { - continue; - } - if (c >= 'A' && c <= 'Z') - { - continue; - } - if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1) - { - continue; - } - error("illegal PUBLIC id character U+" - + Integer.toHexString(c)); - } - } - else if (tryRead("SYSTEM")) - { - requireWhitespace(); - ids.systemId = readLiteral(flags); - } - else if (!isSubset) - { - error("missing SYSTEM or PUBLIC keyword"); - } - - if (ids.systemId != null) - { - if (ids.systemId.indexOf('#') != -1) - { - handler.verror("SYSTEM id has a URI fragment: " + ids.systemId); - } - ids.baseUri = handler.getSystemId(); - if (ids.baseUri == null && uriWarnings) - { - handler.warn("No base URI; hope URI is absolute: " - + ids.systemId); - } - } - - return ids; - } - - /** - * Test if a character is whitespace. - * <pre> - * [3] S ::= (#x20 | #x9 | #xd | #xa)+ - * </pre> - * @param c The character to test. - * @return true if the character is whitespace. - */ - private final boolean isWhitespace(char c) - { - if (c > 0x20) - { - return false; - } - if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) - { - return true; - } - return false; // illegal ... - } - - ////////////////////////////////////////////////////////////////////// - // Utility routines. - ////////////////////////////////////////////////////////////////////// - - /** - * Add a character to the data buffer. - */ - private void dataBufferAppend(char c) - { - // Expand buffer if necessary. - if (dataBufferPos >= dataBuffer.length) - { - dataBuffer = (char[]) extendArray(dataBuffer, - dataBuffer.length, dataBufferPos); - } - dataBuffer[dataBufferPos++] = c; - } - - /** - * Add a string to the data buffer. - */ - private void dataBufferAppend(String s) - { - dataBufferAppend(s.toCharArray(), 0, s.length()); - } - - /** - * Append (part of) a character array to the data buffer. - */ - private void dataBufferAppend(char[] ch, int start, int length) - { - dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, - dataBufferPos + length); - - System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); - dataBufferPos += length; - } - - /** - * Normalise space characters in the data buffer. - */ - private void dataBufferNormalize() - { - int i = 0; - int j = 0; - int end = dataBufferPos; - - // Skip spaces at the start. - while (j < end && dataBuffer[j] == ' ') - { - j++; - } - - // Skip whitespace at the end. - while (end > j && dataBuffer[end - 1] == ' ') - { - end --; - } - - // Start copying to the left. - while (j < end) - { - - char c = dataBuffer[j++]; - - // Normalise all other spaces to - // a single space. - if (c == ' ') - { - while (j < end && dataBuffer[j++] == ' ') - { - continue; - } - dataBuffer[i++] = ' '; - dataBuffer[i++] = dataBuffer[j - 1]; - } - else - { - dataBuffer[i++] = c; - } - } - - // The new length is <= the old one. - dataBufferPos = i; - } - - /** - * Convert the data buffer to a string. - */ - private String dataBufferToString() - { - String s = new String(dataBuffer, 0, dataBufferPos); - dataBufferPos = 0; - return s; - } - - /** - * Flush the contents of the data buffer to the handler, as - * appropriate, and reset the buffer for new input. - */ - private void dataBufferFlush() - throws SAXException - { - if (currentElementContent == CONTENT_ELEMENTS - && dataBufferPos > 0 - && !inCDATA) - { - // We can't just trust the buffer to be whitespace, there - // are (error) cases when it isn't - for (int i = 0; i < dataBufferPos; i++) - { - if (!isWhitespace(dataBuffer[i])) - { - handler.charData(dataBuffer, 0, dataBufferPos); - dataBufferPos = 0; - } - } - if (dataBufferPos > 0) - { - handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); - dataBufferPos = 0; - } - } - else if (dataBufferPos > 0) - { - handler.charData(dataBuffer, 0, dataBufferPos); - dataBufferPos = 0; - } - } - - /** - * Require a string to appear, or throw an exception. - * <p><em>Precondition:</em> Entity expansion is not required. - * <p><em>Precondition:</em> data buffer has no characters that - * will get sent to the application. - */ - private void require(String delim) - throws SAXException, IOException - { - int length = delim.length(); - char[] ch; - - if (length < dataBuffer.length) - { - ch = dataBuffer; - delim.getChars(0, length, ch, 0); - } - else - { - ch = delim.toCharArray(); - } - - if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) - { - int offset = readBufferPos; - - for (int i = 0; i < length; i++, offset++) - { - if (ch[i] != readBuffer[offset]) - { - error ("required string", null, delim); - } - } - readBufferPos = offset; - - } - else - { - for (int i = 0; i < length; i++) - { - require(ch[i]); - } - } - } - - /** - * Require a character to appear, or throw an exception. - */ - private void require(char delim) - throws SAXException, IOException - { - char c = readCh(); - - if (c != delim) - { - error("required character", c, new Character(delim).toString()); - } - } - - /** - * Create an interned string from a character array. - * Ælfred uses this method to create an interned version - * of all names and name tokens, so that it can test equality - * with <code>==</code> instead of <code>String.equals ()</code>. - * - * <p>This is much more efficient than constructing a non-interned - * string first, and then interning it. - * - * @param ch an array of characters for building the string. - * @param start the starting position in the array. - * @param length the number of characters to place in the string. - * @return an interned string. - * @see #intern (String) - * @see java.lang.String#intern - */ - public String intern(char[] ch, int start, int length) - { - int index = 0; - int hash = 0; - Object[] bucket; - - // Generate a hash code. This is a widely used string hash, - // often attributed to Brian Kernighan. - for (int i = start; i < start + length; i++) - { - hash = 31 * hash + ch[i]; - } - hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; - - // Get the bucket -- consists of {array,String} pairs - if ((bucket = symbolTable[hash]) == null) - { - // first string in this bucket - bucket = new Object[8]; - - // Search for a matching tuple, and - // return the string if we find one. - } - else - { - while (index < bucket.length) - { - char[] chFound = (char[]) bucket[index]; - - // Stop when we hit an empty entry. - if (chFound == null) - { - break; - } - - // If they're the same length, check for a match. - if (chFound.length == length) - { - for (int i = 0; i < chFound.length; i++) - { - // continue search on failure - if (ch[start + i] != chFound[i]) - { - break; - } - else if (i == length - 1) - { - // That's it, we have a match! - return (String) bucket[index + 1]; - } - } - } - index += 2; - } - // Not found -- we'll have to add it. - - // Do we have to grow the bucket? - bucket = (Object[]) extendArray(bucket, bucket.length, index); - } - symbolTable[hash] = bucket; - - // OK, add it to the end of the bucket -- "local" interning. - // Intern "globally" to let applications share interning benefits. - // That is, "!=" and "==" work on our strings, not just equals(). - String s = new String(ch, start, length).intern(); - bucket[index] = s.toCharArray(); - bucket[index + 1] = s; - return s; - } - - /** - * Ensure the capacity of an array, allocating a new one if - * necessary. Usually extends only for name hash collisions. - */ - private Object extendArray(Object array, int currentSize, int requiredSize) - { - if (requiredSize < currentSize) - { - return array; - } - else - { - Object newArray = null; - int newSize = currentSize * 2; - - if (newSize <= requiredSize) - { - newSize = requiredSize + 1; - } - - if (array instanceof char[]) - { - newArray = new char[newSize]; - } - else if (array instanceof Object[]) - { - newArray = new Object[newSize]; - } - else - { - throw new RuntimeException(); - } - - System.arraycopy(array, 0, newArray, 0, currentSize); - return newArray; - } - } - - ////////////////////////////////////////////////////////////////////// - // XML query routines. - ////////////////////////////////////////////////////////////////////// - - boolean isStandalone() - { - return docIsStandalone; - } - - // - // Elements - // - - private int getContentType(ElementDecl element, int defaultType) - { - int retval; - - if (element == null) - { - return defaultType; - } - retval = element.contentType; - if (retval == CONTENT_UNDECLARED) - { - retval = defaultType; - } - return retval; - } - - /** - * Look up the content type of an element. - * @param name The element type name. - * @return An integer constant representing the content type. - * @see #CONTENT_UNDECLARED - * @see #CONTENT_ANY - * @see #CONTENT_EMPTY - * @see #CONTENT_MIXED - * @see #CONTENT_ELEMENTS - */ - public int getElementContentType(String name) - { - ElementDecl element = (ElementDecl) elementInfo.get(name); - return getContentType(element, CONTENT_UNDECLARED); - } - - /** - * Register an element. - * Array format: - * [0] element type name - * [1] content model (mixed, elements only) - * [2] attribute hash table - */ - private void setElement(String name, int contentType, - String contentModel, HashMap attributes) - throws SAXException - { - if (skippedPE) - { - return; - } - - ElementDecl element = (ElementDecl) elementInfo.get(name); - - // first <!ELEMENT ...> or <!ATTLIST ...> for this type? - if (element == null) - { - element = new ElementDecl(); - element.contentType = contentType; - element.contentModel = contentModel; - element.attributes = attributes; - elementInfo.put(name, element); - return; - } - - // <!ELEMENT ...> declaration? - if (contentType != CONTENT_UNDECLARED) - { - // ... following an associated <!ATTLIST ...> - if (element.contentType == CONTENT_UNDECLARED) - { - element.contentType = contentType; - element.contentModel = contentModel; - } - else - { - // VC: Unique Element Type Declaration - handler.verror("multiple declarations for element type: " - + name); - } - } - - // first <!ATTLIST ...>, before <!ELEMENT ...> ? - else if (attributes != null) - { - element.attributes = attributes; - } - } - - /** - * Look up the attribute hash table for an element. - * The hash table is the second item in the element array. - */ - private HashMap getElementAttributes(String name) - { - ElementDecl element = (ElementDecl) elementInfo.get(name); - return (element == null) ? null : element.attributes; - } - - // - // Attributes - // - - /** - * Get the declared attributes for an element type. - * @param elname The name of the element type. - * @return An iterator over all the attributes declared for - * a specific element type. The results will be valid only - * after the DTD (if any) has been parsed. - * @see #getAttributeType - * @see #getAttributeEnumeration - * @see #getAttributeDefaultValueType - * @see #getAttributeDefaultValue - * @see #getAttributeExpandedValue - */ - private Iterator declaredAttributes(ElementDecl element) - { - HashMap attlist; - - if (element == null) - { - return null; - } - if ((attlist = element.attributes) == null) - { - return null; - } - return attlist.keySet().iterator(); - } - - /** - * Get the declared attributes for an element type. - * @param elname The name of the element type. - * @return An iterator over all the attributes declared for - * a specific element type. The results will be valid only - * after the DTD (if any) has been parsed. - * @see #getAttributeType - * @see #getAttributeEnumeration - * @see #getAttributeDefaultValueType - * @see #getAttributeDefaultValue - * @see #getAttributeExpandedValue - */ - public Iterator declaredAttributes(String elname) - { - return declaredAttributes((ElementDecl) elementInfo.get(elname)); - } - - /** - * Retrieve the declared type of an attribute. - * @param name The name of the associated element. - * @param aname The name of the attribute. - * @return An interend string denoting the type, or null - * indicating an undeclared attribute. - */ - public String getAttributeType(String name, String aname) - { - AttributeDecl attribute = getAttribute(name, aname); - return (attribute == null) ? null : attribute.type; - } - - /** - * Retrieve the allowed values for an enumerated attribute type. - * @param name The name of the associated element. - * @param aname The name of the attribute. - * @return A string containing the token list. - */ - public String getAttributeEnumeration(String name, String aname) - { - AttributeDecl attribute = getAttribute(name, aname); - // assert: attribute.enumeration is "ENUMERATION" or "NOTATION" - return (attribute == null) ? null : attribute.enumeration; - } - - /** - * Retrieve the default value of a declared attribute. - * @param name The name of the associated element. - * @param aname The name of the attribute. - * @return The default value, or null if the attribute was - * #IMPLIED or simply undeclared and unspecified. - * @see #getAttributeExpandedValue - */ - public String getAttributeDefaultValue(String name, String aname) - { - AttributeDecl attribute = getAttribute(name, aname); - return (attribute == null) ? null : attribute.value; - } - - /* - -// FIXME: Leaving this in, until W3C finally resolves the confusion -// between parts of the XML 2nd REC about when entity declararations -// are guaranteed to be known. Current code matches what section 5.1 -// (conformance) describes, but some readings of the self-contradicting -// text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that -// attribute expansion/normalization must be deferred in some cases -// (just TRY to identify them!). - - * Retrieve the expanded value of a declared attribute. - * <p>General entities (and char refs) will be expanded (once). - * @param name The name of the associated element. - * @param aname The name of the attribute. - * @return The expanded default value, or null if the attribute was - * #IMPLIED or simply undeclared - * @see #getAttributeDefaultValue - public String getAttributeExpandedValue (String name, String aname) - throws Exception - { - AttributeDecl attribute = getAttribute (name, aname); - - if (attribute == null) { - return null; - } else if (attribute.defaultValue == null && attribute.value != null) { - // we MUST use the same buf for both quotes else the literal - // can't be properly terminated - char buf [] = new char [1]; - int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; - String type = getAttributeType (name, aname); - - if (type != "CDATA" && type != null) - flags |= LIT_NORMALIZE; - buf [0] = '"'; - pushCharArray (null, buf, 0, 1); - pushString (null, attribute.value); - pushCharArray (null, buf, 0, 1); - attribute.defaultValue = readLiteral (flags); - } - return attribute.defaultValue; - } - */ - - /** - * Retrieve the default value mode of a declared attribute. - * @see #ATTRIBUTE_DEFAULT_SPECIFIED - * @see #ATTRIBUTE_DEFAULT_IMPLIED - * @see #ATTRIBUTE_DEFAULT_REQUIRED - * @see #ATTRIBUTE_DEFAULT_FIXED - */ - public int getAttributeDefaultValueType(String name, String aname) - { - AttributeDecl attribute = getAttribute(name, aname); - return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED : - attribute.valueType; - } - - /** - * Register an attribute declaration for later retrieval. - * Format: - * - String type - * - String default value - * - int value type - * - enumeration - * - processed default value - */ - private void setAttribute(String elName, String name, String type, - String enumeration, String value, int valueType) - throws Exception - { - HashMap attlist; - - if (skippedPE) - { - return; - } - - // Create a new hashtable if necessary. - attlist = getElementAttributes(elName); - if (attlist == null) - { - attlist = new HashMap(); - } - - // ignore multiple attribute declarations! - if (attlist.get(name) != null) - { - // warn ... - return; - } - else - { - AttributeDecl attribute = new AttributeDecl(); - attribute.type = type; - attribute.value = value; - attribute.valueType = valueType; - attribute.enumeration = enumeration; - attlist.put(name, attribute); - - // save; but don't overwrite any existing <!ELEMENT ...> - setElement(elName, CONTENT_UNDECLARED, null, attlist); - } - } - - /** - * Retrieve the attribute declaration for the given element name and name. - */ - private AttributeDecl getAttribute(String elName, String name) - { - HashMap attlist = getElementAttributes(elName); - return (attlist == null) ? null : (AttributeDecl) attlist.get(name); - } - - // - // Entities - // - - /** - * Find the type of an entity. - * @returns An integer constant representing the entity type. - * @see #ENTITY_UNDECLARED - * @see #ENTITY_INTERNAL - * @see #ENTITY_NDATA - * @see #ENTITY_TEXT - */ - public int getEntityType(String ename) - { - EntityInfo entity = (EntityInfo) entityInfo.get(ename); - return (entity == null) ? ENTITY_UNDECLARED : entity.type; - } - - /** - * Return an external entity's identifiers. - * @param ename The name of the external entity. - * @return The entity's public identifier, system identifier, and base URI. - * Null if the entity was not declared as an external entity. - * @see #getEntityType - */ - public ExternalIdentifiers getEntityIds(String ename) - { - EntityInfo entity = (EntityInfo) entityInfo.get(ename); - return (entity == null) ? null : entity.ids; - } - - /** - * Return an internal entity's replacement text. - * @param ename The name of the internal entity. - * @return The entity's replacement text, or null if - * the entity was not declared as an internal entity. - * @see #getEntityType - */ - public String getEntityValue(String ename) - { - EntityInfo entity = (EntityInfo) entityInfo.get(ename); - return (entity == null) ? null : entity.value; - } - - /** - * Register an entity declaration for later retrieval. - */ - private void setInternalEntity(String eName, String value) - throws SAXException - { - if (skippedPE) - { - return; - } - - if (entityInfo.get(eName) == null) - { - EntityInfo entity = new EntityInfo(); - entity.type = ENTITY_INTERNAL; - entity.value = value; - entityInfo.put(eName, entity); - } - if (handler.stringInterning) - { - if ("lt" == eName || "gt" == eName || "quot" == eName - || "apos" == eName || "amp" == eName) - { - return; - } - } - else - { - if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName) - || "apos".equals(eName) || "amp".equals(eName)) - { - return; - } - } - handler.getDeclHandler().internalEntityDecl(eName, value); - } - - /** - * Register an external entity declaration for later retrieval. - */ - private void setExternalEntity(String eName, int eClass, - ExternalIdentifiers ids, String nName) - { - if (entityInfo.get(eName) == null) - { - EntityInfo entity = new EntityInfo(); - entity.type = eClass; - entity.ids = ids; - entity.notationName = nName; - entityInfo.put(eName, entity); - } - } - - // - // Notations. - // - - /** - * Report a notation declaration, checking for duplicates. - */ - private void setNotation(String nname, ExternalIdentifiers ids) - throws SAXException - { - if (skippedPE) - { - return; - } - - handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri); - if (notationInfo.get(nname) == null) - { - notationInfo.put(nname, nname); - } - else - { - // VC: Unique Notation Name - handler.verror("Duplicate notation name decl: " + nname); - } - } - - // - // Location. - // - - /** - * Return the current line number. - */ - public int getLineNumber() - { - return line; - } - - /** - * Return the current column number. - */ - public int getColumnNumber() - { - return column; - } - - ////////////////////////////////////////////////////////////////////// - // High-level I/O. - ////////////////////////////////////////////////////////////////////// - - /** - * Read a single character from the readBuffer. - * <p>The readDataChunk () method maintains the buffer. - * <p>If we hit the end of an entity, try to pop the stack and - * keep going. - * <p> (This approach doesn't really enforce XML's rules about - * entity boundaries, but this is not currently a validating - * parser). - * <p>This routine also attempts to keep track of the current - * position in external entities, but it's not entirely accurate. - * @return The next available input character. - * @see #unread (char) - * @see #readDataChunk - * @see #readBuffer - * @see #line - * @return The next character from the current input source. - */ - private char readCh() - throws SAXException, IOException - { - // As long as there's nothing in the - // read buffer, try reading more data - // (for an external entity) or popping - // the entity stack (for either). - while (readBufferPos >= readBufferLength) - { - switch (sourceType) - { - case INPUT_READER: - case INPUT_STREAM: - readDataChunk(); - while (readBufferLength < 1) - { - popInput(); - if (readBufferLength < 1) - { - readDataChunk(); - } - } - break; - - default: - - popInput(); - break; - } - } - - char c = readBuffer[readBufferPos++]; - - if (c == '\n') - { - line++; - column = 0; - } - else - { - if (c == '<') - { - /* the most common return to parseContent () ... NOP */ - } - else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) - || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) - && xmlVersion == XML_11)) - { - error("illegal XML character U+" + Integer.toHexString(c)); - } - - // If we're in the DTD and in a context where PEs get expanded, - // do so ... 1/14/2000 errata identify those contexts. There - // are also spots in the internal subset where PE refs are fatal - // errors, hence yet another flag. - else if (c == '%' && expandPE) - { - if (peIsError) - { - error("PE reference within decl in internal subset."); - } - parsePEReference(); - return readCh(); - } - column++; - } - - return c; - } - - /** - * Push a single character back onto the current input stream. - * <p>This method usually pushes the character back onto - * the readBuffer. - * <p>I don't think that this would ever be called with - * readBufferPos = 0, because the methods always reads a character - * before unreading it, but just in case, I've added a boundary - * condition. - * @param c The character to push back. - * @see #readCh - * @see #unread (char[]) - * @see #readBuffer - */ - private void unread(char c) - throws SAXException - { - // Normal condition. - if (c == '\n') - { - line--; - column = -1; - } - if (readBufferPos > 0) - { - readBuffer[--readBufferPos] = c; - } - else - { - pushString(null, new Character(c).toString()); - } - } - - /** - * Push a char array back onto the current input stream. - * <p>NOTE: you must <em>never</em> push back characters that you - * haven't actually read: use pushString () instead. - * @see #readCh - * @see #unread (char) - * @see #readBuffer - * @see #pushString - */ - private void unread(char[] ch, int length) - throws SAXException - { - for (int i = 0; i < length; i++) - { - if (ch[i] == '\n') - { - line--; - column = -1; - } - } - if (length < readBufferPos) - { - readBufferPos -= length; - } - else - { - pushCharArray(null, ch, 0, length); - } - } - - /** - * Push, or skip, a new external input source. - * The source will be some kind of parsed entity, such as a PE - * (including the external DTD subset) or content for the body. - * - * @param url The java.net.URL object for the entity. - * @see SAXDriver#resolveEntity - * @see #pushString - * @see #sourceType - * @see #pushInput - * @see #detectEncoding - * @see #sourceType - * @see #readBuffer - */ - private void pushURL(boolean isPE, - String ename, - ExternalIdentifiers ids, - Reader reader, - InputStream stream, - String encoding, - boolean doResolve) - throws SAXException, IOException - { - boolean ignoreEncoding; - String systemId; - InputSource source; - - if (!isPE) - { - dataBufferFlush(); - } - - scratch.setPublicId(ids.publicId); - scratch.setSystemId(ids.systemId); - - // See if we should skip or substitute the entity. - // If we're not skipping, resolving reports startEntity() - // and updates the (handler's) stack of URIs. - if (doResolve) - { - // assert (stream == null && reader == null && encoding == null) - source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri); - if (source == null) - { - handler.warn("skipping entity: " + ename); - handler.skippedEntity(ename); - if (isPE) - { - skippedPE = true; - } - return; - } - - // we might be using alternate IDs/encoding - systemId = source.getSystemId(); - // The following warning and setting systemId was deleted bcause - // the application has the option of not setting systemId - // provided that it has set the characte/byte stream. - /* - if (systemId == null) { - handler.warn ("missing system ID, using " + ids.systemId); - systemId = ids.systemId; - } - */ - } - else - { - // "[document]", or "[dtd]" via getExternalSubset() - scratch.setCharacterStream(reader); - scratch.setByteStream(stream); - scratch.setEncoding(encoding); - source = scratch; - systemId = ids.systemId; - if (handler.stringInterning) - { - handler.startExternalEntity(ename, systemId, - "[document]" == ename); - } - else - { - handler.startExternalEntity(ename, systemId, - "[document]".equals(ename)); - } - } - - // we may have been given I/O streams directly - if (source.getCharacterStream() != null) - { - if (source.getByteStream() != null) - error("InputSource has two streams!"); - reader = source.getCharacterStream(); - } - else if (source.getByteStream() != null) - { - encoding = source.getEncoding(); - if (encoding == null) - { - stream = source.getByteStream(); - } - else - { - try - { - reader = new InputStreamReader(source.getByteStream(), - encoding); - } - catch (IOException e) - { - stream = source.getByteStream(); - } - } - } - else if (systemId == null) - { - error("InputSource has no URI!"); - } - scratch.setCharacterStream(null); - scratch.setByteStream(null); - scratch.setEncoding(null); - - // Push the existing status. - pushInput(ename); - - // Create a new read buffer. - // (Note the four-character margin) - readBuffer = new char[READ_BUFFER_MAX + 4]; - readBufferPos = 0; - readBufferLength = 0; - readBufferOverflow = -1; - is = null; - line = 1; - column = 0; - currentByteCount = 0; - - // If there's an explicit character stream, just - // ignore encoding declarations. - if (reader != null) - { - sourceType = INPUT_READER; - this.reader = reader; - tryEncodingDecl(true); - return; - } - - // Else we handle the conversion, and need to ensure - // it's done right. - sourceType = INPUT_STREAM; - if (stream != null) - { - is = stream; - } - else - { - // We have to open our own stream to the URL. - URL url = new URL(systemId); - - externalEntity = url.openConnection(); - externalEntity.connect(); - is = externalEntity.getInputStream(); - } - - // If we get to here, there must be - // an InputStream available. - if (!is.markSupported()) - { - is = new BufferedInputStream(is); - } - - // Get any external encoding label. - if (encoding == null && externalEntity != null) - { - // External labels can be untrustworthy; filesystems in - // particular often have the wrong default for content - // that wasn't locally originated. Those we autodetect. - if (!"file".equals(externalEntity.getURL().getProtocol())) - { - int temp; - - // application/xml;charset=something;otherAttr=... - // ... with many variants on 'something' - encoding = externalEntity.getContentType(); - - // MHK code (fix for Saxon 5.5.1/007): - // protect against encoding==null - if (encoding == null) - { - temp = -1; - } - else - { - temp = encoding.indexOf("charset"); - } - - // RFC 2376 sez MIME text defaults to ASCII, but since the - // JDK will create a MIME type out of thin air, we always - // autodetect when there's no explicit charset attribute. - if (temp < 0) - { - encoding = null; // autodetect - } - else - { - // only this one attribute - if ((temp = encoding.indexOf(';')) > 0) - { - encoding = encoding.substring(0, temp); - } - - if ((temp = encoding.indexOf('=', temp + 7)) > 0) - { - encoding = encoding.substring(temp + 1); - - // attributes can have comment fields (RFC 822) - if ((temp = encoding.indexOf('(')) > 0) - { - encoding = encoding.substring(0, temp); - } - // ... and values may be quoted - if ((temp = encoding.indexOf('"')) > 0) - { - encoding = - encoding.substring(temp + 1, - encoding.indexOf('"', temp + 2)); - } - encoding.trim(); - } - else - { - handler.warn("ignoring illegal MIME attribute: " - + encoding); - encoding = null; - } - } - } - } - - // if we got an external encoding label, use it ... - if (encoding != null) - { - this.encoding = ENCODING_EXTERNAL; - setupDecoding(encoding); - ignoreEncoding = true; - - // ... else autodetect from first bytes. - } - else - { - detectEncoding(); - ignoreEncoding = false; - } - - // Read any XML or text declaration. - // If we autodetected, it may tell us the "real" encoding. - try - { - tryEncodingDecl(ignoreEncoding); - } - catch (UnsupportedEncodingException x) - { - encoding = x.getMessage(); - - // if we don't handle the declared encoding, - // try letting a JVM InputStreamReader do it - try - { - if (sourceType != INPUT_STREAM) - { - throw x; - } - - is.reset(); - readBufferPos = 0; - readBufferLength = 0; - readBufferOverflow = -1; - line = 1; - currentByteCount = column = 0; - - sourceType = INPUT_READER; - this.reader = new InputStreamReader(is, encoding); - is = null; - - tryEncodingDecl(true); - - } - catch (IOException e) - { - error("unsupported text encoding", - encoding, - null); - } - } - } - - /** - * Check for an encoding declaration. This is the second part of the - * XML encoding autodetection algorithm, relying on detectEncoding to - * get to the point that this part can read any encoding declaration - * in the document (using only US-ASCII characters). - * - * <p> Because this part starts to fill parser buffers with this data, - * it's tricky to setup a reader so that Java's built-in decoders can be - * used for the character encodings that aren't built in to this parser - * (such as EUC-JP, KOI8-R, Big5, etc). - * - * @return any encoding in the declaration, uppercased; or null - * @see detectEncoding - */ - private String tryEncodingDecl(boolean ignoreEncoding) - throws SAXException, IOException - { - // Read the XML/text declaration. - if (tryRead("<?xml")) - { - if (tryWhitespace()) - { - if (inputStack.size() > 0) - { - return parseTextDecl(ignoreEncoding); - } - else - { - return parseXMLDecl(ignoreEncoding); - } - } - else - { - // <?xml-stylesheet ...?> or similar - unread('l'); - unread('m'); - unread('x'); - unread('?'); - unread('<'); - } - } - return null; - } - - /** - * Attempt to detect the encoding of an entity. - * <p>The trick here (as suggested in the XML standard) is that - * any entity not in UTF-8, or in UCS-2 with a byte-order mark, - * <b>must</b> begin with an XML declaration or an encoding - * declaration; we simply have to look for "<?xml" in various - * encodings. - * <p>This method has no way to distinguish among 8-bit encodings. - * Instead, it sets up for UTF-8, then (possibly) revises its assumption - * later in setupDecoding (). Any ASCII-derived 8-bit encoding - * should work, but most will be rejected later by setupDecoding (). - * @see #tryEncoding (byte[], byte, byte, byte, byte) - * @see #tryEncoding (byte[], byte, byte) - * @see #setupDecoding - */ - private void detectEncoding() - throws SAXException, IOException - { - byte[] signature = new byte[4]; - - // Read the first four bytes for - // autodetection. - is.mark(4); - is.read(signature); - is.reset(); - - // - // FIRST: four byte encodings (who uses these?) - // - if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, - (byte) 0x00, (byte) 0x3c)) - { - // UCS-4 must begin with "<?xml" - // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) - // "UTF-32BE" - encoding = ENCODING_UCS_4_1234; - } - else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, - (byte) 0x00, (byte) 0x00)) - { - // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) - // "UTF-32LE" - encoding = ENCODING_UCS_4_4321; - } - else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, - (byte) 0x3c, (byte) 0x00)) - { - // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) - encoding = ENCODING_UCS_4_2143; - } - else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, - (byte) 0x00, (byte) 0x00)) - { - // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) - encoding = ENCODING_UCS_4_3412; - - // 00 00 fe ff UCS_4_1234 (with BOM) - // ff fe 00 00 UCS_4_4321 (with BOM) - } - - // - // SECOND: two byte encodings - // note ... with 1/14/2000 errata the XML spec identifies some - // more "broken UTF-16" autodetection cases, with no XML decl, - // which we don't handle here (that's legal too). - // - else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) - { - // UCS-2 with a byte-order marker. (UTF-16) - // 0xfe 0xff: UCS-2, big-endian (12) - encoding = ENCODING_UCS_2_12; - is.read(); is.read(); - } - else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) - { - // UCS-2 with a byte-order marker. (UTF-16) - // 0xff 0xfe: UCS-2, little-endian (21) - encoding = ENCODING_UCS_2_21; - is.read(); is.read(); - } - else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, - (byte) 0x00, (byte) 0x3f)) - { - // UTF-16BE (otherwise, malformed UTF-16) - // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark - encoding = ENCODING_UCS_2_12; - error("no byte-order mark for UCS-2 entity"); - } - else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, - (byte) 0x3f, (byte) 0x00)) - { - // UTF-16LE (otherwise, malformed UTF-16) - // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark - encoding = ENCODING_UCS_2_21; - error("no byte-order mark for UCS-2 entity"); - } - - // - // THIRD: ASCII-derived encodings, fixed and variable lengths - // - else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, - (byte) 0x78, (byte) 0x6d)) - { - // ASCII derived - // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) - encoding = ENCODING_UTF_8; - prefetchASCIIEncodingDecl(); - } - else if (signature[0] == (byte) 0xef - && signature[1] == (byte) 0xbb - && signature[2] == (byte) 0xbf) - { - // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text) - // this un-needed notion slipped into XML 2nd ed through a - // "non-normative" erratum; now required by MSFT and UDDI, - // and E22 made it normative. - encoding = ENCODING_UTF_8; - is.read(); is.read(); is.read(); - } - else - { - // 4c 6f a7 94 ... we don't understand EBCDIC flavors - // ... but we COULD at least kick in some fixed code page - - // (default) UTF-8 without encoding/XML declaration - encoding = ENCODING_UTF_8; - } - } - - /** - * Check for a four-byte signature. - * <p>Utility routine for detectEncoding (). - * <p>Always looks for some part of "<?XML" in a specific encoding. - * @param sig The first four bytes read. - * @param b1 The first byte of the signature - * @param b2 The second byte of the signature - * @param b3 The third byte of the signature - * @param b4 The fourth byte of the signature - * @see #detectEncoding - */ - private static boolean tryEncoding(byte[] sig, byte b1, byte b2, - byte b3, byte b4) - { - return (sig[0] == b1 && sig[1] == b2 - && sig[2] == b3 && sig[3] == b4); - } - - /** - * Check for a two-byte signature. - * <p>Looks for a UCS-2 byte-order mark. - * <p>Utility routine for detectEncoding (). - * @param sig The first four bytes read. - * @param b1 The first byte of the signature - * @param b2 The second byte of the signature - * @see #detectEncoding - */ - private static boolean tryEncoding(byte[] sig, byte b1, byte b2) - { - return ((sig[0] == b1) && (sig[1] == b2)); - } - - /** - * This method pushes a string back onto input. - * <p>It is useful either as the expansion of an internal entity, - * or for backtracking during the parse. - * <p>Call pushCharArray () to do the actual work. - * @param s The string to push back onto input. - * @see #pushCharArray - */ - private void pushString(String ename, String s) - throws SAXException - { - char[] ch = s.toCharArray(); - pushCharArray(ename, ch, 0, ch.length); - } - - /** - * Push a new internal input source. - * <p>This method is useful for expanding an internal entity, - * or for unreading a string of characters. It creates a new - * readBuffer containing the characters in the array, instead - * of characters converted from an input byte stream. - * @param ch The char array to push. - * @see #pushString - * @see #pushURL - * @see #readBuffer - * @see #sourceType - * @see #pushInput - */ - private void pushCharArray(String ename, char[] ch, int start, int length) - throws SAXException - { - // Push the existing status - pushInput(ename); - if (ename != null && doReport) - { - dataBufferFlush(); - handler.startInternalEntity(ename); - } - sourceType = INPUT_INTERNAL; - readBuffer = ch; - readBufferPos = start; - readBufferLength = length; - readBufferOverflow = -1; - } - - /** - * Save the current input source onto the stack. - * <p>This method saves all of the global variables associated with - * the current input source, so that they can be restored when a new - * input source has finished. It also tests for entity recursion. - * <p>The method saves the following global variables onto a stack - * using a fixed-length array: - * <ol> - * <li>sourceType - * <li>externalEntity - * <li>readBuffer - * <li>readBufferPos - * <li>readBufferLength - * <li>line - * <li>encoding - * </ol> - * @param ename The name of the entity (if any) causing the new input. - * @see #popInput - * @see #sourceType - * @see #externalEntity - * @see #readBuffer - * @see #readBufferPos - * @see #readBufferLength - * @see #line - * @see #encoding - */ - private void pushInput(String ename) - throws SAXException - { - // Check for entity recursion. - if (ename != null) - { - Iterator entities = entityStack.iterator(); - while (entities.hasNext()) - { - String e = (String) entities.next(); - if (e != null && e == ename) - { - error("recursive reference to entity", ename, null); - } - } - } - entityStack.addLast(ename); - - // Don't bother if there is no current input. - if (sourceType == INPUT_NONE) - { - return; - } - - // Set up a snapshot of the current - // input source. - Input input = new Input(); - - input.sourceType = sourceType; - input.externalEntity = externalEntity; - input.readBuffer = readBuffer; - input.readBufferPos = readBufferPos; - input.readBufferLength = readBufferLength; - input.line = line; - input.encoding = encoding; - input.readBufferOverflow = readBufferOverflow; - input.is = is; - input.currentByteCount = currentByteCount; - input.column = column; - input.reader = reader; - - // Push it onto the stack. - inputStack.addLast(input); - } - - /** - * Restore a previous input source. - * <p>This method restores all of the global variables associated with - * the current input source. - * @exception java.io.EOFException - * If there are no more entries on the input stack. - * @see #pushInput - * @see #sourceType - * @see #externalEntity - * @see #readBuffer - * @see #readBufferPos - * @see #readBufferLength - * @see #line - * @see #encoding - */ - private void popInput() - throws SAXException, IOException - { - String ename = (String) entityStack.removeLast(); - - if (ename != null && doReport) - { - dataBufferFlush(); - } - switch (sourceType) - { - case INPUT_STREAM: - handler.endExternalEntity(ename); - is.close(); - break; - case INPUT_READER: - handler.endExternalEntity(ename); - reader.close(); - break; - case INPUT_INTERNAL: - if (ename != null && doReport) - { - handler.endInternalEntity(ename); - } - break; - } - - // Throw an EOFException if there - // is nothing else to pop. - if (inputStack.isEmpty()) - { - throw new EOFException("no more input"); - } - - Input input = (Input) inputStack.removeLast(); - - sourceType = input.sourceType; - externalEntity = input.externalEntity; - readBuffer = input.readBuffer; - readBufferPos = input.readBufferPos; - readBufferLength = input.readBufferLength; - line = input.line; - encoding = input.encoding; - readBufferOverflow = input.readBufferOverflow; - is = input.is; - currentByteCount = input.currentByteCount; - column = input.column; - reader = input.reader; - } - - /** - * Return true if we can read the expected character. - * <p>Note that the character will be removed from the input stream - * on success, but will be put back on failure. Do not attempt to - * read the character again if the method succeeds. - * @param delim The character that should appear next. For a - * insensitive match, you must supply this in upper-case. - * @return true if the character was successfully read, or false if - * it was not. - * @see #tryRead (String) - */ - private boolean tryRead(char delim) - throws SAXException, IOException - { - char c; - - // Read the character - c = readCh(); - - // Test for a match, and push the character - // back if the match fails. - if (c == delim) - { - return true; - } - else - { - unread(c); - return false; - } - } - - /** - * Return true if we can read the expected string. - * <p>This is simply a convenience method. - * <p>Note that the string will be removed from the input stream - * on success, but will be put back on failure. Do not attempt to - * read the string again if the method succeeds. - * <p>This method will push back a character rather than an - * array whenever possible (probably the majority of cases). - * @param delim The string that should appear next. - * @return true if the string was successfully read, or false if - * it was not. - * @see #tryRead (char) - */ - private boolean tryRead(String delim) - throws SAXException, IOException - { - return tryRead(delim.toCharArray()); - } - - private boolean tryRead(char[] ch) - throws SAXException, IOException - { - char c; - - // Compare the input, character- - // by character. - - for (int i = 0; i < ch.length; i++) - { - c = readCh(); - if (c != ch[i]) - { - unread(c); - if (i != 0) - { - unread(ch, i); - } - return false; - } - } - return true; - } - - /** - * Return true if we can read some whitespace. - * <p>This is simply a convenience method. - * <p>This method will push back a character rather than an - * array whenever possible (probably the majority of cases). - * @return true if whitespace was found. - */ - private boolean tryWhitespace() - throws SAXException, IOException - { - char c; - c = readCh(); - if (isWhitespace(c)) - { - skipWhitespace(); - return true; - } - else - { - unread(c); - return false; - } - } - - /** - * Read all data until we find the specified string. - * This is useful for scanning CDATA sections and PIs. - * <p>This is inefficient right now, since it calls tryRead () - * for every character. - * @param delim The string delimiter - * @see #tryRead (String, boolean) - * @see #readCh - */ - private void parseUntil(String delim) - throws SAXException, IOException - { - parseUntil(delim.toCharArray()); - } - - private void parseUntil(char[] delim) - throws SAXException, IOException - { - char c; - int startLine = line; - - try - { - while (!tryRead(delim)) - { - c = readCh(); - dataBufferAppend(c); - } - } - catch (EOFException e) - { - error("end of input while looking for delimiter " - + "(started on line " + startLine - + ')', null, new String(delim)); - } - } - - ////////////////////////////////////////////////////////////////////// - // Low-level I/O. - ////////////////////////////////////////////////////////////////////// - - /** - * Prefetch US-ASCII XML/text decl from input stream into read buffer. - * Doesn't buffer more than absolutely needed, so that when an encoding - * decl says we need to create an InputStreamReader, we can discard our - * buffer and reset(). Caller knows the first chars of the decl exist - * in the input stream. - */ - private void prefetchASCIIEncodingDecl() - throws SAXException, IOException - { - int ch; - readBufferPos = readBufferLength = 0; - - is.mark(readBuffer.length); - while (true) - { - ch = is.read(); - readBuffer[readBufferLength++] = (char) ch; - switch (ch) - { - case (int) '>': - return; - case -1: - error("file ends before end of XML or encoding declaration.", - null, "?>"); - } - if (readBuffer.length == readBufferLength) - { - error("unfinished XML or encoding declaration"); - } - } - } - - /** - * Read a chunk of data from an external input source. - * <p>This is simply a front-end that fills the rawReadBuffer - * with bytes, then calls the appropriate encoding handler. - * @see #encoding - * @see #rawReadBuffer - * @see #readBuffer - * @see #filterCR - * @see #copyUtf8ReadBuffer - * @see #copyIso8859_1ReadBuffer - * @see #copyUcs_2ReadBuffer - * @see #copyUcs_4ReadBuffer - */ - private void readDataChunk() - throws SAXException, IOException - { - int count; - - // See if we have any overflow (filterCR sets for CR at end) - if (readBufferOverflow > -1) - { - readBuffer[0] = (char) readBufferOverflow; - readBufferOverflow = -1; - readBufferPos = 1; - sawCR = true; - } - else - { - readBufferPos = 0; - sawCR = false; - } - - // input from a character stream. - if (sourceType == INPUT_READER) - { - count = reader.read(readBuffer, - readBufferPos, READ_BUFFER_MAX - readBufferPos); - if (count < 0) - { - readBufferLength = readBufferPos; - } - else - { - readBufferLength = readBufferPos + count; - } - if (readBufferLength > 0) - { - filterCR(count >= 0); - } - sawCR = false; - return; - } - - // Read as many bytes as possible into the raw buffer. - count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX); - - // Dispatch to an encoding-specific reader method to populate - // the readBuffer. In most parser speed profiles, these routines - // show up at the top of the CPU usage chart. - if (count > 0) - { - switch (encoding) - { - // one byte builtins - case ENCODING_ASCII: - copyIso8859_1ReadBuffer(count, (char) 0x0080); - break; - case ENCODING_UTF_8: - copyUtf8ReadBuffer(count); - break; - case ENCODING_ISO_8859_1: - copyIso8859_1ReadBuffer(count, (char) 0); - break; - - // two byte builtins - case ENCODING_UCS_2_12: - copyUcs2ReadBuffer(count, 8, 0); - break; - case ENCODING_UCS_2_21: - copyUcs2ReadBuffer(count, 0, 8); - break; - - // four byte builtins - case ENCODING_UCS_4_1234: - copyUcs4ReadBuffer(count, 24, 16, 8, 0); - break; - case ENCODING_UCS_4_4321: - copyUcs4ReadBuffer(count, 0, 8, 16, 24); - break; - case ENCODING_UCS_4_2143: - copyUcs4ReadBuffer(count, 16, 24, 0, 8); - break; - case ENCODING_UCS_4_3412: - copyUcs4ReadBuffer(count, 8, 0, 24, 16); - break; - } - } - else - { - readBufferLength = readBufferPos; - } - - readBufferPos = 0; - - // Filter out all carriage returns if we've seen any - // (including any saved from a previous read) - if (sawCR) - { - filterCR(count >= 0); - sawCR = false; - - // must actively report EOF, lest some CRs get lost. - if (readBufferLength == 0 && count >= 0) - { - readDataChunk(); - } - } - - if (count > 0) - { - currentByteCount += count; - } - } - - /** - * Filter carriage returns in the read buffer. - * CRLF becomes LF; CR becomes LF. - * @param moreData true iff more data might come from the same source - * @see #readDataChunk - * @see #readBuffer - * @see #readBufferOverflow - */ - private void filterCR(boolean moreData) - { - int i, j; - - readBufferOverflow = -1; - -loop: - for (i = j = readBufferPos; j < readBufferLength; i++, j++) - { - switch (readBuffer[j]) - { - case '\r': - if (j == readBufferLength - 1) - { - if (moreData) - { - readBufferOverflow = '\r'; - readBufferLength--; - } - else // CR at end of buffer - { - readBuffer[i++] = '\n'; - } - break loop; - } - else if (readBuffer[j + 1] == '\n') - { - j++; - } - readBuffer[i] = '\n'; - break; - - case '\n': - default: - readBuffer[i] = readBuffer[j]; - break; - } - } - readBufferLength = i; - } - - /** - * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. - * <p>When readDataChunk () calls this method, the raw bytes are in - * rawReadBuffer, and the final characters will appear in - * readBuffer. - * <p>Note that as of Unicode 3.1, good practice became a requirement, - * so that each Unicode character has exactly one UTF-8 representation. - * @param count The number of bytes to convert. - * @see #readDataChunk - * @see #rawReadBuffer - * @see #readBuffer - * @see #getNextUtf8Byte - */ - private void copyUtf8ReadBuffer(int count) - throws SAXException, IOException - { - int i = 0; - int j = readBufferPos; - int b1; - char c = 0; - - /* - // check once, so the runtime won't (if it's smart enough) - if (count < 0 || count > rawReadBuffer.length) - throw new ArrayIndexOutOfBoundsException (Integer.toString (count)); - */ - - while (i < count) - { - b1 = rawReadBuffer[i++]; - - // Determine whether we are dealing - // with a one-, two-, three-, or four- - // byte sequence. - if (b1 < 0) - { - if ((b1 & 0xe0) == 0xc0) - { - // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx - c = (char) (((b1 & 0x1f) << 6) - | getNextUtf8Byte(i++, count)); - if (c < 0x0080) - { - encodingError("Illegal two byte UTF-8 sequence", - c, 0); - } - - //Sec 2.11 - // [1] the two-character sequence #xD #xA - // [2] the two-character sequence #xD #x85 - if ((c == 0x0085 || c == 0x000a) && sawCR) - { - continue; - } - - // Sec 2.11 - // [3] the single character #x85 - - if (c == 0x0085 && xmlVersion == XML_11) - { - readBuffer[j++] = '\r'; - } - } - else if ((b1 & 0xf0) == 0xe0) - { - // 3-byte sequence: - // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx - // most CJKV characters - c = (char) (((b1 & 0x0f) << 12) | - (getNextUtf8Byte(i++, count) << 6) | - getNextUtf8Byte(i++, count)); - //sec 2.11 - //[4] the single character #x2028 - if (c == 0x2028 && xmlVersion == XML_11) - { - readBuffer[j++] = '\r'; - sawCR = true; - continue; - } - if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff)) - { - encodingError("Illegal three byte UTF-8 sequence", - c, 0); - } - } - else if ((b1 & 0xf8) == 0xf0) - { - // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx - // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx - // (uuuuu = wwww + 1) - // "Surrogate Pairs" ... from the "Astral Planes" - // Unicode 3.1 assigned the first characters there - int iso646 = b1 & 07; - iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); - iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); - iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); - - if (iso646 <= 0xffff) - { - encodingError("Illegal four byte UTF-8 sequence", - iso646, 0); - } - else - { - if (iso646 > 0x0010ffff) - { - encodingError("UTF-8 value out of range for Unicode", - iso646, 0); - } - iso646 -= 0x010000; - readBuffer[j++] = (char) (0xd800 | (iso646 >> 10)); - readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff)); - continue; - } - } - else - { - // The five and six byte encodings aren't supported; - // they exceed the Unicode (and XML) range. - encodingError("unsupported five or six byte UTF-8 sequence", - 0xff & b1, i); - // NOTREACHED - c = 0; - } - } - else - { - // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx - // (US-ASCII character, "common" case, one branch to here) - c = (char) b1; - } - readBuffer[j++] = c; - if (c == '\r') - { - sawCR = true; - } - } - // How many characters have we read? - readBufferLength = j; - } - - /** - * Return the next byte value in a UTF-8 sequence. - * If it is not possible to get a byte from the current - * entity, throw an exception. - * @param pos The current position in the rawReadBuffer. - * @param count The number of bytes in the rawReadBuffer - * @return The significant six bits of a non-initial byte in - * a UTF-8 sequence. - * @exception EOFException If the sequence is incomplete. - */ - private int getNextUtf8Byte(int pos, int count) - throws SAXException, IOException - { - int val; - - // Take a character from the buffer - // or from the actual input stream. - if (pos < count) - { - val = rawReadBuffer[pos]; - } - else - { - val = is.read(); - if (val == -1) - { - encodingError("unfinished multi-byte UTF-8 sequence at EOF", - -1, pos); - } - } - - // Check for the correct bits at the start. - if ((val & 0xc0) != 0x80) - { - encodingError("bad continuation of multi-byte UTF-8 sequence", - val, pos + 1); - } - - // Return the significant bits. - return (val & 0x3f); - } - - /** - * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into - * UTF-16 characters. - * - * <p>When readDataChunk () calls this method, the raw bytes are in - * rawReadBuffer, and the final characters will appear in - * readBuffer. - * - * @param count The number of bytes to convert. - * @param mask For ASCII conversion, 0x7f; else, 0xff. - * @see #readDataChunk - * @see #rawReadBuffer - * @see #readBuffer - */ - private void copyIso8859_1ReadBuffer(int count, char mask) - throws IOException - { - int i, j; - for (i = 0, j = readBufferPos; i < count; i++, j++) - { - char c = (char) (rawReadBuffer[i] & 0xff); - if ((c & mask) != 0) - { - throw new CharConversionException("non-ASCII character U+" - + Integer.toHexString(c)); - } - if (c == 0x0085 && xmlVersion == XML_11) - { - c = '\r'; - } - readBuffer[j] = c; - if (c == '\r') - { - sawCR = true; - } - } - readBufferLength = j; - } - - /** - * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters - * (as used in Java string manipulation). - * - * <p>When readDataChunk () calls this method, the raw bytes are in - * rawReadBuffer, and the final characters will appear in - * readBuffer. - * @param count The number of bytes to convert. - * @param shift1 The number of bits to shift byte 1. - * @param shift2 The number of bits to shift byte 2 - * @see #readDataChunk - * @see #rawReadBuffer - * @see #readBuffer - */ - private void copyUcs2ReadBuffer(int count, int shift1, int shift2) - throws SAXException - { - int j = readBufferPos; - - if (count > 0 && (count % 2) != 0) - { - encodingError("odd number of bytes in UCS-2 encoding", -1, count); - } - // The loops are faster with less internal brancing; hence two - if (shift1 == 0) - { // "UTF-16-LE" - for (int i = 0; i < count; i += 2) - { - char c = (char) (rawReadBuffer[i + 1] << 8); - c |= 0xff & rawReadBuffer[i]; - readBuffer[j++] = c; - if (c == '\r') - { - sawCR = true; - } - } - } - else - { // "UTF-16-BE" - for (int i = 0; i < count; i += 2) - { - char c = (char) (rawReadBuffer[i] << 8); - c |= 0xff & rawReadBuffer[i + 1]; - readBuffer[j++] = c; - if (c == '\r') - { - sawCR = true; - } - } - } - readBufferLength = j; - } - - /** - * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. - * - * <p>When readDataChunk () calls this method, the raw bytes are in - * rawReadBuffer, and the final characters will appear in - * readBuffer. - * <p>Java has Unicode chars, and this routine uses surrogate pairs - * for ISO-10646 values between 0x00010000 and 0x000fffff. An - * exception is thrown if the ISO-10646 character has no Unicode - * representation. - * - * @param count The number of bytes to convert. - * @param shift1 The number of bits to shift byte 1. - * @param shift2 The number of bits to shift byte 2 - * @param shift3 The number of bits to shift byte 2 - * @param shift4 The number of bits to shift byte 2 - * @see #readDataChunk - * @see #rawReadBuffer - * @see #readBuffer - */ - private void copyUcs4ReadBuffer(int count, int shift1, int shift2, - int shift3, int shift4) - throws SAXException - { - int j = readBufferPos; - - if (count > 0 && (count % 4) != 0) - { - encodingError("number of bytes in UCS-4 encoding " + - "not divisible by 4", - -1, count); - } - for (int i = 0; i < count; i += 4) - { - int value = (((rawReadBuffer [i] & 0xff) << shift1) | - ((rawReadBuffer [i + 1] & 0xff) << shift2) | - ((rawReadBuffer [i + 2] & 0xff) << shift3) | - ((rawReadBuffer [i + 3] & 0xff) << shift4)); - if (value < 0x0000ffff) - { - readBuffer [j++] = (char) value; - if (value == (int) '\r') - { - sawCR = true; - } - } - else if (value < 0x0010ffff) - { - value -= 0x010000; - readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff)); - readBuffer[j++] = (char) (0xdc | (value & 0x03ff)); - } - else - { - encodingError("UCS-4 value out of range for Unicode", - value, i); - } - } - readBufferLength = j; - } - - /** - * Report a character encoding error. - */ - private void encodingError(String message, int value, int offset) - throws SAXException - { - if (value != -1) - { - message = message + " (character code: 0x" + - Integer.toHexString(value) + ')'; - error(message); - } - } - - ////////////////////////////////////////////////////////////////////// - // Local Variables. - ////////////////////////////////////////////////////////////////////// - - /** - * Re-initialize the variables for each parse. - */ - private void initializeVariables() - { - // First line - line = 1; - column = 0; - - // Set up the buffers for data and names - dataBufferPos = 0; - dataBuffer = new char[DATA_BUFFER_INITIAL]; - nameBufferPos = 0; - nameBuffer = new char[NAME_BUFFER_INITIAL]; - - // Set up the DTD hash tables - elementInfo = new HashMap(); - entityInfo = new HashMap(); - notationInfo = new HashMap(); - skippedPE = false; - - // Set up the variables for the current - // element context. - currentElement = null; - currentElementContent = CONTENT_UNDECLARED; - - // Set up the input variables - sourceType = INPUT_NONE; - inputStack = new LinkedList(); - entityStack = new LinkedList(); - externalEntity = null; - tagAttributePos = 0; - tagAttributes = new String[100]; - rawReadBuffer = new byte[READ_BUFFER_MAX]; - readBufferOverflow = -1; - - scratch = new InputSource(); - - inLiteral = false; - expandPE = false; - peIsError = false; - - doReport = false; - - inCDATA = false; - - symbolTable = new Object[SYMBOL_TABLE_LENGTH][]; - } - - static class ExternalIdentifiers - { - - String publicId; - String systemId; - String baseUri; - - ExternalIdentifiers() - { - } - - ExternalIdentifiers(String publicId, String systemId, String baseUri) - { - this.publicId = publicId; - this.systemId = systemId; - this.baseUri = baseUri; - } - - } - - static class EntityInfo - { - - int type; - ExternalIdentifiers ids; - String value; - String notationName; - - } - - static class AttributeDecl - { - - String type; - String value; - int valueType; - String enumeration; - String defaultValue; - - } - - static class ElementDecl - { - - int contentType; - String contentModel; - HashMap attributes; - - } - - static class Input - { - - int sourceType; - URLConnection externalEntity; - char[] readBuffer; - int readBufferPos; - int readBufferLength; - int line; - int encoding; - int readBufferOverflow; - InputStream is; - int currentByteCount; - int column; - Reader reader; - - } - -} - |