1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
2 //
3 // TagSoup is licensed under the Apache License,
4 // Version 2.0.  You may obtain a copy of this license at
5 // http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
6 // additional legal rights not granted by this license.
7 //
8 // TagSoup is distributed in the hope that it will be useful, but
9 // unless required by applicable law or agreed to in writing, TagSoup
10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
11 // OF ANY KIND, either express or implied; not even the implied warranty
12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 //
14 //
15 // The TagSoup parser
16 
17 package org.ccil.cowan.tagsoup;
18 import java.util.HashMap;
19 import java.util.ArrayList;
20 import java.util.Locale;
21 import java.io.*;
22 import java.net.URL;
23 import java.net.URLConnection;
24 import org.xml.sax.*;
25 import org.xml.sax.helpers.DefaultHandler;
26 import org.xml.sax.ext.LexicalHandler;
27 
28 
29 /**
30 The SAX parser class.
31 **/
32 public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler {
33 
34 	// XMLReader implementation
35 
	// Handlers default to this object; the corresponding setters below
	// map a null argument back to this object as well.
	private ContentHandler theContentHandler = this;
	private LexicalHandler theLexicalHandler = this;
	private DTDHandler theDTDHandler = this;
	private ErrorHandler theErrorHandler = this;
	private EntityResolver theEntityResolver = this;
	// Collaborators supplied through setProperty(); any that are still
	// null when a parse starts are given defaults by setup().
	private Schema theSchema;
	private Scanner theScanner;
	private AutoDetector theAutoDetector;
44 
45 	// Default values for feature flags
46 
47 	private static boolean DEFAULT_NAMESPACES = true;
48 	private static boolean DEFAULT_IGNORE_BOGONS = false;
49 	private static boolean DEFAULT_BOGONS_EMPTY = false;
50         private static boolean DEFAULT_ROOT_BOGONS = true;
51 	private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true;
52 	private static boolean DEFAULT_TRANSLATE_COLONS = false;
53 	private static boolean DEFAULT_RESTART_ELEMENTS = true;
54 	private static boolean DEFAULT_IGNORABLE_WHITESPACE = false;
55 	private static boolean DEFAULT_CDATA_ELEMENTS = true;
56 
57 	// Feature flags.
58 
	// Current value of each settable feature.  setFeature() updates the
	// matching field whenever the feature of the same name is changed.
	private boolean namespaces = DEFAULT_NAMESPACES;
	private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS;
	private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY;
	private boolean rootBogons = DEFAULT_ROOT_BOGONS;
	private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES;
	private boolean translateColons = DEFAULT_TRANSLATE_COLONS;
	private boolean restartElements = DEFAULT_RESTART_ELEMENTS;
	private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE;
	private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS;
68 
69 	/**
70 	A value of "true" indicates namespace URIs and unprefixed local
71 	names for element and attribute names will be available.
72 	**/
73 	public final static String namespacesFeature =
74 		"http://xml.org/sax/features/namespaces";
75 
76 	/**
77 	A value of "true" indicates that XML qualified names (with prefixes)
78 	and attributes (including xmlns* attributes) will be available.
79 	We don't support this value.
80 	**/
81 	public final static String namespacePrefixesFeature =
82 		"http://xml.org/sax/features/namespace-prefixes";
83 
84 	/**
85 	Reports whether this parser processes external general entities
86 	(it doesn't).
87 	**/
88 	public final static String externalGeneralEntitiesFeature =
89 		"http://xml.org/sax/features/external-general-entities";
90 
91 	/**
92 	Reports whether this parser processes external parameter entities
93 	(it doesn't).
94 	**/
95 	public final static String externalParameterEntitiesFeature =
96 		"http://xml.org/sax/features/external-parameter-entities";
97 
98 	/**
99 	May be examined only during a parse, after the startDocument()
100 	callback has been completed; read-only. The value is true if
101 	the document specified standalone="yes" in its XML declaration,
102 	and otherwise is false.  (It's always false.)
103 	**/
104 	public final static String isStandaloneFeature =
105 		"http://xml.org/sax/features/is-standalone";
106 
107 	/**
108 	A value of "true" indicates that the LexicalHandler will report
109 	the beginning and end of parameter entities (it won't).
110 	**/
111 	public final static String lexicalHandlerParameterEntitiesFeature =
112 		"http://xml.org/sax/features/lexical-handler/parameter-entities";
113 
114 	/**
115 	A value of "true" indicates that system IDs in declarations will
116 	be absolutized (relative to their base URIs) before reporting.
117 	(This returns true but doesn't actually do anything.)
118 	**/
119 	public final static String resolveDTDURIsFeature =
120 		"http://xml.org/sax/features/resolve-dtd-uris";
121 
122 	/**
123 	Has a value of "true" if all XML names (for elements,
124 	prefixes, attributes, entities, notations, and local
125 	names), as well as Namespace URIs, will have been interned
126 	using java.lang.String.intern. This supports fast testing of
127 	equality/inequality against string constants, rather than forcing
128 	slower calls to String.equals().  (We always intern.)
129 	**/
130 	public final static String stringInterningFeature =
131 		"http://xml.org/sax/features/string-interning";
132 
133 	/**
134 	Returns "true" if the Attributes objects passed by this
135 	parser in ContentHandler.startElement() implement the
136 	org.xml.sax.ext.Attributes2 interface.	(They don't.)
137 	**/
138 
139 	public final static String useAttributes2Feature =
140 		"http://xml.org/sax/features/use-attributes2";
141 
142 	/**
143 	Returns "true" if the Locator objects passed by this parser
144 	in ContentHandler.setDocumentLocator() implement the
145 	org.xml.sax.ext.Locator2 interface.  (They don't.)
146 	**/
147 	public final static String useLocator2Feature =
148 		"http://xml.org/sax/features/use-locator2";
149 
150 	/**
151 	Returns "true" if, when setEntityResolver is given an object
152 	implementing the org.xml.sax.ext.EntityResolver2 interface,
153 	those new methods will be used.  (They won't be.)
154 	**/
155 	public final static String useEntityResolver2Feature =
156 		"http://xml.org/sax/features/use-entity-resolver2";
157 
158 	/**
159 	Controls whether the parser is reporting all validity errors
160 	(We don't report any validity errors.)
161 	**/
162 	public final static String validationFeature =
163 		"http://xml.org/sax/features/validation";
164 
165 	/**
166 	Controls whether the parser reports Unicode normalization
167 	errors as described in section 2.13 and Appendix B of the XML
168 	1.1 Recommendation.  (We don't normalize.)
169 	**/
170 	public final static String unicodeNormalizationCheckingFeature =
171 "http://xml.org/sax/features/unicode-normalization-checking";
172 
173 	/**
174 	Controls whether, when the namespace-prefixes feature is set,
175 	the parser treats namespace declaration attributes as being in
176 	the http://www.w3.org/2000/xmlns/ namespace.  (It doesn't.)
177 	**/
178 	public final static String xmlnsURIsFeature =
179 		"http://xml.org/sax/features/xmlns-uris";
180 
181 	/**
182 	Returns "true" if the parser supports both XML 1.1 and XML 1.0.
183 	(Always false.)
184 	**/
185 	public final static String XML11Feature =
186 		"http://xml.org/sax/features/xml-1.1";
187 
188 	/**
189 	A value of "true" indicates that the parser will ignore
190 	unknown elements.
191 	**/
192 	public final static String ignoreBogonsFeature =
193 		"http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons";
194 
195 	/**
196 	A value of "true" indicates that the parser will give unknown
197 	elements a content model of EMPTY; a value of "false", a
198 	content model of ANY.
199 	**/
200 	public final static String bogonsEmptyFeature =
201 		"http://www.ccil.org/~cowan/tagsoup/features/bogons-empty";
202 
203 	/**
204 	A value of "true" indicates that the parser will allow unknown
205 	elements to be the root element.
206 	**/
207 	public final static String rootBogonsFeature =
208 		"http://www.ccil.org/~cowan/tagsoup/features/root-bogons";
209 
210 	/**
211 	A value of "true" indicates that the parser will return default
212 	attribute values for missing attributes that have default values.
213 	**/
214 	public final static String defaultAttributesFeature =
215 		"http://www.ccil.org/~cowan/tagsoup/features/default-attributes";
216 
217 	/**
218 	A value of "true" indicates that the parser will
219 	translate colons into underscores in names.
220 	**/
221 	public final static String translateColonsFeature =
222 		"http://www.ccil.org/~cowan/tagsoup/features/translate-colons";
223 
224 	/**
225 	A value of "true" indicates that the parser will
226 	attempt to restart the restartable elements.
227 	**/
228 	public final static String restartElementsFeature =
229 		"http://www.ccil.org/~cowan/tagsoup/features/restart-elements";
230 
231 	/**
232 	A value of "true" indicates that the parser will
233 	transmit whitespace in element-only content via the SAX
234 	ignorableWhitespace callback.  Normally this is not done,
235 	because HTML is an SGML application and SGML suppresses
236 	such whitespace.
237 	**/
238 	public final static String ignorableWhitespaceFeature =
239 		"http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace";
240 
241 	/**
242 	A value of "true" indicates that the parser will treat CDATA
243 	elements specially.  Normally true, since the input is by
244 	default HTML.
245 	**/
246 	public final static String CDATAElementsFeature =
247 		"http://www.ccil.org/~cowan/tagsoup/features/cdata-elements";
248 
249 	/**
250 	Used to see some syntax events that are essential in some
251 	applications: comments, CDATA delimiters, selected general
252 	entity inclusions, and the start and end of the DTD (and
253 	declaration of document element name). The Object must implement
254 	org.xml.sax.ext.LexicalHandler.
255 	**/
256 	public final static String lexicalHandlerProperty =
257 		"http://xml.org/sax/properties/lexical-handler";
258 
259 	/**
260 	Specifies the Scanner object this Parser uses.
261 	**/
262 	public final static String scannerProperty =
263 		"http://www.ccil.org/~cowan/tagsoup/properties/scanner";
264 
265 	/**
266 	Specifies the Schema object this Parser uses.
267 	**/
268 	public final static String schemaProperty =
269 		"http://www.ccil.org/~cowan/tagsoup/properties/schema";
270 
271 	/**
272 	Specifies the AutoDetector (for encoding detection) this Parser uses.
273 	**/
274 	public final static String autoDetectorProperty =
275 		"http://www.ccil.org/~cowan/tagsoup/properties/auto-detector";
276 
277 	// Due to sucky Java order of initialization issues, these
278 	// entries are maintained separately from the initial values of
279 	// the corresponding instance variables, but care must be taken
280 	// to keep them in sync.
281 
282 	private HashMap theFeatures = new HashMap();
283 	{
theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES))284 		theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES));
theFeatures.put(namespacePrefixesFeature, Boolean.FALSE)285 		theFeatures.put(namespacePrefixesFeature, Boolean.FALSE);
theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE)286 		theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE);
theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE)287 		theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE);
theFeatures.put(isStandaloneFeature, Boolean.FALSE)288 		theFeatures.put(isStandaloneFeature, Boolean.FALSE);
theFeatures.put(lexicalHandlerParameterEntitiesFeature, Boolean.FALSE)289 		theFeatures.put(lexicalHandlerParameterEntitiesFeature,
290 			Boolean.FALSE);
theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE)291 		theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE);
theFeatures.put(stringInterningFeature, Boolean.TRUE)292 		theFeatures.put(stringInterningFeature, Boolean.TRUE);
theFeatures.put(useAttributes2Feature, Boolean.FALSE)293 		theFeatures.put(useAttributes2Feature, Boolean.FALSE);
theFeatures.put(useLocator2Feature, Boolean.FALSE)294 		theFeatures.put(useLocator2Feature, Boolean.FALSE);
theFeatures.put(useEntityResolver2Feature, Boolean.FALSE)295 		theFeatures.put(useEntityResolver2Feature, Boolean.FALSE);
theFeatures.put(validationFeature, Boolean.FALSE)296 		theFeatures.put(validationFeature, Boolean.FALSE);
theFeatures.put(xmlnsURIsFeature, Boolean.FALSE)297 		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
theFeatures.put(xmlnsURIsFeature, Boolean.FALSE)298 		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
theFeatures.put(XML11Feature, Boolean.FALSE)299 		theFeatures.put(XML11Feature, Boolean.FALSE);
theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS))300 		theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS));
theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY))301 		theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY));
theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS))302 		theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS));
theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES))303 		theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES));
theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS))304 		theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS));
theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS))305 		theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS));
theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE))306 		theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE));
theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS))307 		theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS));
308 		}
309 
310 	// Private clone of Boolean.valueOf that is guaranteed to return
311 	// Boolean.TRUE or Boolean.FALSE
truthValue(boolean b)312 	private static Boolean truthValue(boolean b) {
313 		return b ? Boolean.TRUE : Boolean.FALSE;
314 		}
315 
316 
getFeature(String name)317 	public boolean getFeature (String name)
318 		throws SAXNotRecognizedException, SAXNotSupportedException {
319 		Boolean b = (Boolean)theFeatures.get(name);
320 		if (b == null) {
321 			throw new SAXNotRecognizedException("Unknown feature " + name);
322 			}
323 		return b.booleanValue();
324 		}
325 
setFeature(String name, boolean value)326 	public void setFeature (String name, boolean value)
327 	throws SAXNotRecognizedException, SAXNotSupportedException {
328 		Boolean b = (Boolean)theFeatures.get(name);
329 		if (b == null) {
330 			throw new SAXNotRecognizedException("Unknown feature " + name);
331 			}
332 		if (value) theFeatures.put(name, Boolean.TRUE);
333 		else theFeatures.put(name, Boolean.FALSE);
334 
335 		if (name.equals(namespacesFeature)) namespaces = value;
336 		else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value;
337 		else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value;
338 		else if (name.equals(rootBogonsFeature)) rootBogons = value;
339 		else if (name.equals(defaultAttributesFeature)) defaultAttributes = value;
340 		else if (name.equals(translateColonsFeature)) translateColons = value;
341 		else if (name.equals(restartElementsFeature)) restartElements = value;
342 		else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value;
343 		else if (name.equals(CDATAElementsFeature)) CDATAElements = value;
344 		}
345 
getProperty(String name)346 	public Object getProperty (String name)
347 	throws SAXNotRecognizedException, SAXNotSupportedException {
348 		if (name.equals(lexicalHandlerProperty)) {
349 			return theLexicalHandler == this ? null : theLexicalHandler;
350 			}
351 		else if (name.equals(scannerProperty)) {
352 			return theScanner;
353 			}
354 		else if (name.equals(schemaProperty)) {
355 			return theSchema;
356 			}
357 		else if (name.equals(autoDetectorProperty)) {
358 			return theAutoDetector;
359 			}
360 		else {
361 			throw new SAXNotRecognizedException("Unknown property " + name);
362 			}
363 		}
364 
setProperty(String name, Object value)365 	public void setProperty (String name, Object value)
366 	throws SAXNotRecognizedException, SAXNotSupportedException {
367 		if (name.equals(lexicalHandlerProperty)) {
368 			if (value == null) {
369 				theLexicalHandler = this;
370 				}
371 			else if (value instanceof LexicalHandler) {
372 				theLexicalHandler = (LexicalHandler)value;
373 				}
374 			else {
375 				throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler");
376 				}
377 			}
378 		else if (name.equals(scannerProperty)) {
379 			if (value instanceof Scanner) {
380 				theScanner = (Scanner)value;
381 				}
382 			else {
383 				throw new SAXNotSupportedException("Your scanner is not a Scanner");
384 				}
385 			}
386 		else if (name.equals(schemaProperty)) {
387 			if (value instanceof Schema) {
388 				theSchema = (Schema)value;
389 				}
390 			else {
391 				 throw new SAXNotSupportedException("Your schema is not a Schema");
392 				}
393 			}
394 		else if (name.equals(autoDetectorProperty)) {
395 			if (value instanceof AutoDetector) {
396 				theAutoDetector = (AutoDetector)value;
397 				}
398 			else {
399 				throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector");
400 				}
401 			}
402 		else {
403 			throw new SAXNotRecognizedException("Unknown property " + name);
404 			}
405 		}
406 
setEntityResolver(EntityResolver resolver)407 	public void setEntityResolver (EntityResolver resolver) {
408 		theEntityResolver = (resolver == null) ? this : resolver;
409 		}
410 
getEntityResolver()411 	public EntityResolver getEntityResolver () {
412 		return (theEntityResolver == this) ? null : theEntityResolver;
413 		}
414 
setDTDHandler(DTDHandler handler)415 	public void setDTDHandler (DTDHandler handler) {
416 		theDTDHandler = (handler == null) ? this : handler;
417 		}
418 
getDTDHandler()419 	public DTDHandler getDTDHandler () {
420 		return (theDTDHandler == this) ? null : theDTDHandler;
421 		}
422 
setContentHandler(ContentHandler handler)423 	public void setContentHandler (ContentHandler handler) {
424 		theContentHandler = (handler == null) ? this : handler;
425 		}
426 
getContentHandler()427 	public ContentHandler getContentHandler () {
428 		return (theContentHandler == this) ? null : theContentHandler;
429 		}
430 
setErrorHandler(ErrorHandler handler)431 	public void setErrorHandler (ErrorHandler handler) {
432 		theErrorHandler = (handler == null) ? this : handler;
433 		}
434 
getErrorHandler()435 	public ErrorHandler getErrorHandler () {
436 		return (theErrorHandler == this) ? null : theErrorHandler;
437 		}
438 
parse(InputSource input)439 	public void parse (InputSource input) throws IOException, SAXException {
440 		setup();
441 		Reader r = getReader(input);
442 		theContentHandler.startDocument();
443 		theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId());
444 		if (theScanner instanceof Locator) {
445 			theContentHandler.setDocumentLocator((Locator)theScanner);
446 			}
447 		if (!(theSchema.getURI().equals("")))
448 			theContentHandler.startPrefixMapping(theSchema.getPrefix(),
449 				theSchema.getURI());
450 		theScanner.scan(r, this);
451 		}
452 
parse(String systemid)453 	public void parse (String systemid) throws IOException, SAXException {
454 		parse(new InputSource(systemid));
455 		}
456 
457 	// Sets up instance variables that haven't been set by setFeature
setup()458 	private void setup() {
459 		if (theSchema == null) theSchema = new HTMLSchema();
460 		if (theScanner == null) theScanner = new HTMLScanner();
461 		if (theAutoDetector == null) {
462 			theAutoDetector = new AutoDetector() {
463 				public Reader autoDetectingReader(InputStream i) {
464 					return new InputStreamReader(i);
465 					}
466 				};
467 			}
468 		theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes);
469 		thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes);
470 		theNewElement = null;
471 		theAttributeName = null;
472 		thePITarget = null;
473 		theSaved = null;
474 		theEntity = 0;
475 		virginStack = true;
476                 theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null;
477 		}
478 
479 	// Return a Reader based on the contents of an InputSource
480 	// Buffer both the InputStream and the Reader
getReader(InputSource s)481 	private Reader getReader(InputSource s) throws SAXException, IOException {
482 		Reader r = s.getCharacterStream();
483 		InputStream i = s.getByteStream();
484 		String encoding = s.getEncoding();
485 		String publicid = s.getPublicId();
486 		String systemid = s.getSystemId();
487 		if (r == null) {
488 			if (i == null) i = getInputStream(publicid, systemid);
489 //			i = new BufferedInputStream(i);
490 			if (encoding == null) {
491 				r = theAutoDetector.autoDetectingReader(i);
492 				}
493 			else {
494 				try {
495 					r = new InputStreamReader(i, encoding);
496 					}
497 				catch (UnsupportedEncodingException e) {
498 					r = new InputStreamReader(i);
499 					}
500 				}
501 			}
502 //		r = new BufferedReader(r);
503 		return r;
504 		}
505 
506 	// Get an InputStream based on a publicid and a systemid
getInputStream(String publicid, String systemid)507 	private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException {
508 		URL basis = new URL("file", "", System.getProperty("user.dir") + "/.");
509 		URL url = new URL(basis, systemid);
510 		URLConnection c = url.openConnection();
511 		return c.getInputStream();
512 		}
513 		// We don't process publicids (who uses them anyhow?)
514 
515 	// ScanHandler implementation
516 
	// Per-document parsing state; setup() re-initializes these fields
	// at the start of each parse.

	// Element that adup() and aval() set attributes on; null when there
	// is none (the attribute callbacks then do nothing).
	private Element theNewElement = null;
	// Name recorded by aname(), consumed and cleared by adup()/aval().
	private String theAttributeName = null;
	// NOTE(review): not read or written in this part of the file;
	// presumably records that a DOCTYPE has been seen -- confirm.
	private boolean theDoctypeIsPresent = false;
	// Doctype identifiers; push() passes the public and system ids to
	// the entity resolver when the first element pushed matches
	// theDoctypeName.
	private String theDoctypePublicId = null;
	private String theDoctypeSystemId = null;
	private String theDoctypeName = null;
	// NOTE(review): only reset in this part of the file; presumably the
	// target of a pending processing instruction -- confirm.
	private String thePITarget = null;
	// Top of the open-element stack, linked downward through next().
	private Element theStack = null;
	// Restartable elements removed by restartablyPop(), waiting to be
	// pushed again by restart().
	private Element theSaved = null;
	// Element built from the schema's "<pcdata>" type in setup().
	private Element thePCDATA = null;
	// Value produced by the most recent entity() callback.
	private int theEntity = 0;	// needs to support chars past U+FFFF
528 
adup(char[] buff, int offset, int length)529 	public void adup(char[] buff, int offset, int length) throws SAXException {
530 		if (theNewElement == null || theAttributeName == null) return;
531 		theNewElement.setAttribute(theAttributeName, null, theAttributeName);
532 		theAttributeName = null;
533 		}
534 
aname(char[] buff, int offset, int length)535 	public void aname(char[] buff, int offset, int length) throws SAXException {
536 		if (theNewElement == null) return;
537 		// Currently we don't rely on Schema to canonicalize
538 		// attribute names.
539 		theAttributeName = makeName(buff, offset, length).toLowerCase(Locale.ROOT);
540 //		System.err.println("%% Attribute name " + theAttributeName);
541 		}
542 
aval(char[] buff, int offset, int length)543 	public void aval(char[] buff, int offset, int length) throws SAXException {
544 		if (theNewElement == null || theAttributeName == null) return;
545 		String value = new String(buff, offset, length);
546 //		System.err.println("%% Attribute value [" + value + "]");
547 		value = expandEntities(value);
548 		theNewElement.setAttribute(theAttributeName, null, value);
549 		theAttributeName = null;
550 //		System.err.println("%% Aval done");
551 		}
552 
553 	// Expand entity references in attribute values selectively.
554 	// Currently we expand a reference iff it is properly terminated
555 	// with a semicolon.
expandEntities(String src)556 	private String expandEntities(String src) {
557 		int refStart = -1;
558 		int len = src.length();
559 		char[] dst = new char[len];
560 		int dstlen = 0;
561 		for (int i = 0; i < len; i++) {
562 			char ch = src.charAt(i);
563 			dst[dstlen++] = ch;
564 //			System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] ");
565 			if (ch == '&' && refStart == -1) {
566 				// start of a ref excluding &
567 				refStart = dstlen;
568 //				System.err.println("start of ref");
569 				}
570 			else if (refStart == -1) {
571 				// not in a ref
572 //				System.err.println("not in ref");
573 				}
574 			else if (Character.isLetter(ch) ||
575 					Character.isDigit(ch) ||
576 					ch == '#') {
577 				// valid entity char
578 //				System.err.println("valid");
579 				}
580 			else if (ch == ';') {
581 				// properly terminated ref
582 //				System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]");
583 				int ent = lookupEntity(dst, refStart, dstlen - refStart - 1);
584 //				System.err.println(" = " + ent);
585 				if (ent > 0xFFFF) {
586 					ent -= 0x10000;
587 					dst[refStart - 1] = (char)((ent>>10) + 0xD800);
588 					dst[refStart] = (char)((ent&0x3FF) + 0xDC00);
589 					dstlen = refStart + 1;
590 					}
591 				else if (ent != 0) {
592 					dst[refStart - 1] = (char)ent;
593 					dstlen = refStart;
594 					}
595 				refStart = -1;
596 				}
597 			else {
598 				// improperly terminated ref
599 //				System.err.println("end of ref");
600 				refStart = -1;
601 				}
602 			}
603 		return new String(dst, 0, dstlen);
604 		}
605 
entity(char[] buff, int offset, int length)606 	public void entity(char[] buff, int offset, int length) throws SAXException {
607 		theEntity = lookupEntity(buff, offset, length);
608 		}
609 
610 	// Process numeric character references,
611 	// deferring to the schema for named ones.
lookupEntity(char[] buff, int offset, int length)612 	private int lookupEntity(char[] buff, int offset, int length) {
613 		int result = 0;
614 		if (length < 1) return result;
615 //		System.err.println("%% Entity at " + offset + " " + length);
616 //		System.err.println("%% Got entity [" + new String(buff, offset, length) + "]");
617 		if (buff[offset] == '#') {
618                         if (length > 1 && (buff[offset+1] == 'x'
619                                         || buff[offset+1] == 'X')) {
620                                 try {
621                                         return Integer.parseInt(new String(buff, offset + 2, length - 2), 16);
622                                         }
623                                 catch (NumberFormatException e) { return 0; }
624                                 }
625                         try {
626                                 return Integer.parseInt(new String(buff, offset + 1, length - 1), 10);
627                                 }
628                         catch (NumberFormatException e) { return 0; }
629                         }
630 		return theSchema.getEntity(new String(buff, offset, length));
631 		}
632 
eof(char[] buff, int offset, int length)633 	public void eof(char[] buff, int offset, int length) throws SAXException {
634 		if (virginStack) rectify(thePCDATA);
635 		while (theStack.next() != null) {
636 			pop();
637 			}
638 		if (!(theSchema.getURI().equals("")))
639 			theContentHandler.endPrefixMapping(theSchema.getPrefix());
640 		theContentHandler.endDocument();
641 		}
642 
etag(char[] buff, int offset, int length)643 	public void etag(char[] buff, int offset, int length) throws SAXException {
644 		if (etag_cdata(buff, offset, length)) return;
645 		etag_basic(buff, offset, length);
646 		}
647 
648 	private static char[] etagchars = {'<', '/', '>'};
etag_cdata(char[] buff, int offset, int length)649 	public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException {
650 		String currentName = theStack.name();
651 		// If this is a CDATA element and the tag doesn't match,
652 		// or isn't properly formed (junk after the name),
653 		// restart CDATA mode and process the tag as characters.
654 		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
655 			boolean realTag = (length == currentName.length());
656 			if (realTag) {
657 				for (int i = 0; i < length; i++) {
658 					if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) {
659 						realTag = false;
660 						break;
661 						}
662 					}
663 				}
664 			if (!realTag) {
665 				theContentHandler.characters(etagchars, 0, 2);
666 				theContentHandler.characters(buff, offset, length);
667 				theContentHandler.characters(etagchars, 2, 1);
668 				theScanner.startCDATA();
669 				return true;
670 				}
671 			}
672 		return false;
673 		}
674 
etag_basic(char[] buff, int offset, int length)675 	public void etag_basic(char[] buff, int offset, int length) throws SAXException {
676 		theNewElement = null;
677 		String name;
678 		if (length != 0) {
679 			// Canonicalize case of name
680 			name = makeName(buff, offset, length);
681 //			System.err.println("got etag [" + name + "]");
682 			ElementType type = theSchema.getElementType(name);
683 			if (type == null) return;	// mysterious end-tag
684 			name = type.name();
685 			}
686 		else {
687 			name = theStack.name();
688 			}
689 //		System.err.println("%% Got end of " + name);
690 
691 		Element sp;
692 		boolean inNoforce = false;
693 		for (sp = theStack; sp != null; sp = sp.next()) {
694 			if (sp.name().equals(name)) break;
695 			if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true;
696 			}
697 
698 		if (sp == null) return;		// Ignore unknown etags
699 		if (sp.next() == null || sp.next().next() == null) return;
700 		if (inNoforce) {		// inside an F_NOFORCE element?
701 			sp.preclose();		// preclose the matching element
702 			}
703 		else {			// restartably pop everything above us
704 			while (theStack != sp) {
705 				restartablyPop();
706 				}
707 			pop();
708 			}
709 		// pop any preclosed elements now at the top
710 		while (theStack.isPreclosed()) {
711 			pop();
712 			}
713 		restart(null);
714 		}
715 
716 	// Push restartables on the stack if possible
717 	// e is the next element to be started, if we know what it is
restart(Element e)718 	private void restart(Element e) throws SAXException {
719 		while (theSaved != null && theStack.canContain(theSaved) &&
720 				(e == null || theSaved.canContain(e))) {
721 			Element next = theSaved.next();
722 			push(theSaved);
723 			theSaved = next;
724 			}
725 		}
726 
727 	// Pop the stack irrevocably
pop()728 	private void pop() throws SAXException {
729 		if (theStack == null) return;		// empty stack
730 		String name = theStack.name();
731 		String localName = theStack.localName();
732 		String namespace = theStack.namespace();
733 		String prefix = prefixOf(name);
734 
735 //		System.err.println("%% Popping " + name);
736 		if (!namespaces) namespace = localName = "";
737 		theContentHandler.endElement(namespace, localName, name);
738 		if (foreign(prefix, namespace)) {
739 			theContentHandler.endPrefixMapping(prefix);
740 //			System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace);
741 			}
742 		Attributes atts = theStack.atts();
743 		for (int i = atts.getLength() - 1; i >= 0; i--) {
744 			String attNamespace = atts.getURI(i);
745 			String attPrefix = prefixOf(atts.getQName(i));
746 			if (foreign(attPrefix, attNamespace)) {
747 				theContentHandler.endPrefixMapping(attPrefix);
748 //			System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace);
749 				}
750 			}
751 		theStack = theStack.next();
752 		}
753 
754 	// Pop the stack restartably
restartablyPop()755 	private void restartablyPop() throws SAXException {
756 		Element popped = theStack;
757 		pop();
758 		if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) {
759 			popped.anonymize();
760 			popped.setNext(theSaved);
761 			theSaved = popped;
762 			}
763 		}
764 
765 	// Push element onto stack
766 	private boolean virginStack = true;
push(Element e)767 	private void push(Element e) throws SAXException {
768 		String name = e.name();
769 		String localName = e.localName();
770 		String namespace = e.namespace();
771 		String prefix = prefixOf(name);
772 
773 //		System.err.println("%% Pushing " + name);
774 		e.clean();
775 		if (!namespaces) namespace = localName = "";
776                 if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) {
777                     try {
778                         theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId);
779                     } catch (IOException ew) { }   // Can't be thrown for root I believe.
780                 }
781 		if (foreign(prefix, namespace)) {
782 			theContentHandler.startPrefixMapping(prefix, namespace);
783 //			System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace);
784 			}
785 		Attributes atts = e.atts();
786 		int len = atts.getLength();
787 		for (int i = 0; i < len; i++) {
788 			String attNamespace = atts.getURI(i);
789 			String attPrefix = prefixOf(atts.getQName(i));
790 			if (foreign(attPrefix, attNamespace)) {
791 				theContentHandler.startPrefixMapping(attPrefix, attNamespace);
792 //				System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace);
793 				}
794 			}
795 		theContentHandler.startElement(namespace, localName, name, e.atts());
796 		e.setNext(theStack);
797 		theStack = e;
798 		virginStack = false;
799 		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
800 			theScanner.startCDATA();
801 			}
802 		}
803 
804 	// Get the prefix from a QName
prefixOf(String name)805 	private String prefixOf(String name) {
806 		int i = name.indexOf(':');
807 		String prefix = "";
808 		if (i != -1) prefix = name.substring(0, i);
809 //		System.err.println("%% " + prefix + " is prefix of " + name);
810 		return prefix;
811 		}
812 
813 	// Return true if we have a foreign name
foreign(String prefix, String namespace)814 	private boolean foreign(String prefix, String namespace) {
815 //		System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- ");
816 		boolean foreign = !(prefix.equals("") || namespace.equals("") ||
817 			namespace.equals(theSchema.getURI()));
818 //		System.err.println(foreign);
819 		return foreign;
820 		}
821 
822         /**
823          * Parsing the complete XML Document Type Definition is way too complex,
824          * but for many simple cases we can extract something useful from it.
825          *
826          * doctypedecl  ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
827          *  DeclSep     ::= PEReference | S
828          *  intSubset   ::= (markupdecl | DeclSep)*
829          *  markupdecl  ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
830          *  ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
831          */
decl(char[] buff, int offset, int length)832 	public void decl(char[] buff, int offset, int length) throws SAXException {
833 		String s = new String(buff, offset, length);
834 		String name = null;
835 		String systemid = null;
836 		String publicid = null;
837 		String[] v = split(s);
838 		if (v.length > 0 && "DOCTYPE".equalsIgnoreCase(v[0])) {
839 			if (theDoctypeIsPresent) return;		// one doctype only!
840 			theDoctypeIsPresent = true;
841 			if (v.length > 1) {
842 				name = v[1];
843 				if (v.length>3 && "SYSTEM".equals(v[2])) {
844 				systemid = v[3];
845 				}
846 			else if (v.length > 3 && "PUBLIC".equals(v[2])) {
847 				publicid = v[3];
848 				if (v.length > 4) {
849 					systemid = v[4];
850 					}
851 				else {
852 					systemid = "";
853 					}
854                     }
855                 }
856             }
857 		publicid = trimquotes(publicid);
858 		systemid = trimquotes(systemid);
859 		if (name != null) {
860 			publicid = cleanPublicid(publicid);
861 			theLexicalHandler.startDTD(name, publicid, systemid);
862 			theLexicalHandler.endDTD();
863 			theDoctypeName = name;
864 			theDoctypePublicId = publicid;
865 		if (theScanner instanceof Locator) {    // Must resolve systemid
866                     theDoctypeSystemId  = ((Locator)theScanner).getSystemId();
867                     try {
868                         theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString();
869                     } catch (Exception e) {}
870                 }
871             }
872         }
873 
874 	// If the String is quoted, trim the quotes.
trimquotes(String in)875 	private static String trimquotes(String in) {
876 		if (in == null) return in;
877 		int length = in.length();
878 		if (length == 0) return in;
879 		char s = in.charAt(0);
880 		char e = in.charAt(length - 1);
881 		if (s == e && (s == '\'' || s == '"')) {
882 			in = in.substring(1, in.length() - 1);
883 			}
884 		return in;
885 		}
886 
887 	// Split the supplied String into words or phrases seperated by spaces.
888 	// Recognises quotes around a phrase and doesn't split it.
split(String val)889 	private static String[] split(String val) throws IllegalArgumentException {
890 		val = val.trim();
891 		if (val.length() == 0) {
892 			return new String[0];
893 			}
894 		else {
895 			ArrayList l = new ArrayList();
896 			int s = 0;
897 			int e = 0;
898 			boolean sq = false;	// single quote
899 			boolean dq = false;	// double quote
900 			char lastc = 0;
901 			int len = val.length();
902 			for (e=0; e < len; e++) {
903 				char c = val.charAt(e);
904 				if (!dq && c == '\'' && lastc != '\\') {
905 				sq = !sq;
906 				if (s < 0) s = e;
907 				}
908 			else if (!sq && c == '\"' && lastc != '\\') {
909 				dq = !dq;
910 				if (s < 0) s = e;
911 				}
912 			else if (!sq && !dq) {
913 				if (Character.isWhitespace(c)) {
914 					if (s >= 0) l.add(val.substring(s, e));
915 					s = -1;
916 					}
917 				else if (s < 0 && c != ' ') {
918 					s = e;
919 					}
920 				}
921 			lastc = c;
922 			}
923 		l.add(val.substring(s, e));
924 		return (String[])l.toArray(new String[0]);
925 		}
926         }
927 
928 	// Replace junk in publicids with spaces
929 	private static String legal =
930 		"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%";
931 
cleanPublicid(String src)932 	private String cleanPublicid(String src) {
933 		if (src == null) return null;
934 		int len = src.length();
935 		StringBuffer dst = new StringBuffer(len);
936 		boolean suppressSpace = true;
937 		for (int i = 0; i < len; i++) {
938 			char ch = src.charAt(i);
939 			if (legal.indexOf(ch) != -1) { 	// legal but not whitespace
940 				dst.append(ch);
941 				suppressSpace = false;
942 				}
943 			else if (suppressSpace) {	// normalizable whitespace or junk
944 				;
945 				}
946 			else {
947 				dst.append(' ');
948 				suppressSpace = true;
949 				}
950 			}
951 //		System.err.println("%% Publicid [" + dst.toString().trim() + "]");
952 		return dst.toString().trim();	// trim any final junk whitespace
953 		}
954 
955 
gi(char[] buff, int offset, int length)956 	public void gi(char[] buff, int offset, int length) throws SAXException {
957 		if (theNewElement != null) return;
958 		String name = makeName(buff, offset, length);
959 		if (name == null) return;
960 		ElementType type = theSchema.getElementType(name);
961 		if (type == null) {
962 			// Suppress unknown elements if ignore-bogons is on
963 			if (ignoreBogons) return;
964 			int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY;
965 			int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT);
966 			theSchema.elementType(name, bogonModel, bogonMemberOf, 0);
967 			if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name());
968 			type = theSchema.getElementType(name);
969 			}
970 
971 		theNewElement = new Element(type, defaultAttributes);
972 //		System.err.println("%% Got GI " + theNewElement.name());
973 		}
974 
cdsect(char[] buff, int offset, int length)975 	public void cdsect(char[] buff, int offset, int length) throws SAXException {
976 		theLexicalHandler.startCDATA();
977 		pcdata(buff, offset, length);
978 		theLexicalHandler.endCDATA();
979 		}
pcdata(char[] buff, int offset, int length)980 	public void pcdata(char[] buff, int offset, int length) throws SAXException {
981 		if (length == 0) return;
982 		boolean allWhite = true;
983 		for (int i = 0; i < length; i++) {
984 			if (!Character.isWhitespace(buff[offset+i])) {
985 				allWhite = false;
986 				}
987 			}
988 		if (allWhite && !theStack.canContain(thePCDATA)) {
989 			if (ignorableWhitespace) {
990 				theContentHandler.ignorableWhitespace(buff, offset, length);
991 				}
992 			}
993 		else {
994 			rectify(thePCDATA);
995 			theContentHandler.characters(buff, offset, length);
996 			}
997 		}
998 
pitarget(char[] buff, int offset, int length)999 	public void pitarget(char[] buff, int offset, int length) throws SAXException {
1000 		if (theNewElement != null) return;
1001 		thePITarget = makeName(buff, offset, length).replace(':', '_');
1002 		}
1003 
pi(char[] buff, int offset, int length)1004 	public void pi(char[] buff, int offset, int length) throws SAXException {
1005 		if (theNewElement != null || thePITarget == null) return;
1006 		if ("xml".equalsIgnoreCase(thePITarget)) return;
1007 //		if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI");
1008 		if (length > 0 && buff[length - 1] == '?') length--;	// remove trailing ?
1009 		theContentHandler.processingInstruction(thePITarget,
1010 			new String(buff, offset, length));
1011 		thePITarget = null;
1012 		}
1013 
stagc(char[] buff, int offset, int length)1014 	public void stagc(char[] buff, int offset, int length) throws SAXException {
1015 //		System.err.println("%% Start-tag");
1016 		if (theNewElement == null) return;
1017 		rectify(theNewElement);
1018 		if (theStack.model() == Schema.M_EMPTY) {
1019 			// Force an immediate end tag
1020 			etag_basic(buff, offset, length);
1021 			}
1022 		}
1023 
stage(char[] buff, int offset, int length)1024 	public void stage(char[] buff, int offset, int length) throws SAXException {
1025 //		System.err.println("%% Empty-tag");
1026 		if (theNewElement == null) return;
1027 		rectify(theNewElement);
1028 		// Force an immediate end tag
1029 		etag_basic(buff, offset, length);
1030 		}
1031 
1032 	// Comment buffer is twice the size of the output buffer
1033 	private char[] theCommentBuffer = new char[2000];
cmnt(char[] buff, int offset, int length)1034 	public void cmnt(char[] buff, int offset, int length) throws SAXException {
1035 		theLexicalHandler.comment(buff, offset, length);
1036 		}
1037 
1038 	// Rectify the stack, pushing and popping as needed
1039 	// so that the argument can be safely pushed
rectify(Element e)1040 	private void rectify(Element e) throws SAXException {
1041 		Element sp;
1042 		while (true) {
1043 			for (sp = theStack; sp != null; sp = sp.next()) {
1044 				if (sp.canContain(e)) break;
1045 				}
1046 			if (sp != null) break;
1047 			ElementType parentType = e.parent();
1048 			if (parentType == null) break;
1049 			Element parent = new Element(parentType, defaultAttributes);
1050 //			System.err.println("%% Ascending from " + e.name() + " to " + parent.name());
1051 			parent.setNext(e);
1052 			e = parent;
1053 			}
1054 		if (sp == null) return;		// don't know what to do
1055 		while (theStack != sp) {
1056 			if (theStack == null || theStack.next() == null ||
1057 				theStack.next().next() == null) break;
1058 			restartablyPop();
1059 			}
1060 		while (e != null) {
1061 			Element nexte = e.next();
1062 			if (!e.name().equals("<pcdata>")) push(e);
1063 			e = nexte;
1064 			restart(e);
1065 			}
1066 		theNewElement = null;
1067 		}
1068 
getEntity()1069 	public int getEntity() {
1070 		return theEntity;
1071 		}
1072 
1073 	// Return the argument as a valid XML name
1074 	// This no longer lowercases the result: we depend on Schema to
1075 	// canonicalize case.
makeName(char[] buff, int offset, int length)1076 	private String makeName(char[] buff, int offset, int length) {
1077 		StringBuffer dst = new StringBuffer(length + 2);
1078 		boolean seenColon = false;
1079 		boolean start = true;
1080 //		String src = new String(buff, offset, length); // DEBUG
1081 		for (; length-- > 0; offset++) {
1082 			char ch = buff[offset];
1083 			if (Character.isLetter(ch) || ch == '_') {
1084 				start = false;
1085 				dst.append(ch);
1086 				}
1087 			else if (Character.isDigit(ch) || ch == '-' || ch == '.') {
1088 				if (start) dst.append('_');
1089 				start = false;
1090 				dst.append(ch);
1091 				}
1092 			else if (ch == ':' && !seenColon) {
1093 				seenColon = true;
1094 				if (start) dst.append('_');
1095 				start = true;
1096 				dst.append(translateColons ? '_' : ch);
1097 				}
1098 			}
1099 		int dstLength = dst.length();
1100 		if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_');
1101 //		System.err.println("Made name \"" + dst + "\" from \"" + src + "\"");
1102 		return dst.toString().intern();
1103 		}
1104 
1105 	// Default LexicalHandler implementation
1106 
comment(char[] ch, int start, int length)1107 	public void comment(char[] ch, int start, int length) throws SAXException { }
endCDATA()1108 	public void endCDATA() throws SAXException { }
endDTD()1109 	public void endDTD() throws SAXException { }
endEntity(String name)1110 	public void endEntity(String name) throws SAXException { }
startCDATA()1111 	public void startCDATA() throws SAXException { }
startDTD(String name, String publicid, String systemid)1112 	public void startDTD(String name, String publicid, String systemid) throws SAXException { }
startEntity(String name)1113 	public void startEntity(String name) throws SAXException { }
1114 
1115 	}
1116