1 /*
2  * Copyright (C) 2007 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.apache.harmony.xml.parsers;
18 
19 import java.io.IOException;
20 import java.net.URL;
21 import java.net.URLConnection;
22 import javax.xml.parsers.DocumentBuilder;
23 import libcore.io.IoUtils;
24 import org.apache.harmony.xml.dom.CDATASectionImpl;
25 import org.apache.harmony.xml.dom.DOMImplementationImpl;
26 import org.apache.harmony.xml.dom.DocumentImpl;
27 import org.apache.harmony.xml.dom.DocumentTypeImpl;
28 import org.apache.harmony.xml.dom.TextImpl;
29 import org.kxml2.io.KXmlParser;
30 import org.w3c.dom.Attr;
31 import org.w3c.dom.DOMImplementation;
32 import org.w3c.dom.Document;
33 import org.w3c.dom.DocumentType;
34 import org.w3c.dom.Element;
35 import org.w3c.dom.Node;
36 import org.w3c.dom.Text;
37 import org.xml.sax.EntityResolver;
38 import org.xml.sax.ErrorHandler;
39 import org.xml.sax.InputSource;
40 import org.xml.sax.SAXException;
41 import org.xml.sax.SAXParseException;
42 import org.xml.sax.helpers.LocatorImpl;
43 import org.xmlpull.v1.XmlPullParser;
44 import org.xmlpull.v1.XmlPullParserException;
45 
46 /**
47  * Builds a DOM using KXmlParser.
48  */
49 class DocumentBuilderImpl extends DocumentBuilder {
50 
51     private static DOMImplementationImpl dom = DOMImplementationImpl.getInstance();
52 
53     private boolean coalescing;
54     private EntityResolver entityResolver;
55     private ErrorHandler errorHandler;
56     private boolean ignoreComments;
57     private boolean ignoreElementContentWhitespace;
58     private boolean namespaceAware;
59     // adding a new field? don't forget to update reset().
60 
reset()61     @Override public void reset() {
62         coalescing = false;
63         entityResolver = null;
64         errorHandler = null;
65         ignoreComments = false;
66         ignoreElementContentWhitespace = false;
67         namespaceAware = false;
68     }
69 
70     @Override
getDOMImplementation()71     public DOMImplementation getDOMImplementation() {
72         return dom;
73     }
74 
75     @Override
isNamespaceAware()76     public boolean isNamespaceAware() {
77         return namespaceAware;
78     }
79 
80     @Override
isValidating()81     public boolean isValidating() {
82         return false;
83     }
84 
85     @Override
newDocument()86     public Document newDocument() {
87         return dom.createDocument(null, null, null);
88     }
89 
90     @Override
parse(InputSource source)91     public Document parse(InputSource source) throws SAXException, IOException {
92         if (source == null) {
93             throw new IllegalArgumentException("source == null");
94         }
95 
96         String namespaceURI = null;
97         String qualifiedName = null;
98         DocumentType doctype = null;
99         String inputEncoding = source.getEncoding();
100         String systemId = source.getSystemId();
101         DocumentImpl document = new DocumentImpl(
102                 dom, namespaceURI, qualifiedName, doctype, inputEncoding);
103         document.setDocumentURI(systemId);
104 
105         KXmlParser parser = new KXmlParser();
106         try {
107             parser.keepNamespaceAttributes();
108             parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, namespaceAware);
109 
110             if (source.getByteStream() != null) {
111                 parser.setInput(source.getByteStream(), inputEncoding);
112             } else if (source.getCharacterStream() != null) {
113                 parser.setInput(source.getCharacterStream());
114             } else if (systemId != null) {
115                 URL url = new URL(systemId);
116                 URLConnection urlConnection = url.openConnection();
117                 urlConnection.connect();
118                 // TODO: if null, extract the inputEncoding from the Content-Type header?
119                 parser.setInput(urlConnection.getInputStream(), inputEncoding);
120             } else {
121                 throw new SAXParseException("InputSource needs a stream, reader or URI", null);
122             }
123 
124             if (parser.nextToken() == XmlPullParser.END_DOCUMENT) {
125                 throw new SAXParseException("Unexpected end of document", null);
126             }
127 
128             parse(parser, document, document, XmlPullParser.END_DOCUMENT);
129 
130             parser.require(XmlPullParser.END_DOCUMENT, null, null);
131         } catch (XmlPullParserException ex) {
132             Throwable detail = ex.getDetail();
133             if (detail instanceof IOException) {
134                 throw (IOException) detail;
135             }
136             if (detail instanceof RuntimeException) {
137                 throw (RuntimeException) detail;
138             }
139 
140             LocatorImpl locator = new LocatorImpl();
141 
142             locator.setPublicId(source.getPublicId());
143             locator.setSystemId(systemId);
144             locator.setLineNumber(ex.getLineNumber());
145             locator.setColumnNumber(ex.getColumnNumber());
146 
147             SAXParseException newEx = new SAXParseException(ex.getMessage(), locator);
148 
149             if (errorHandler != null) {
150                 errorHandler.error(newEx);
151             }
152 
153             throw newEx;
154         } finally {
155             IoUtils.closeQuietly(parser);
156         }
157 
158         return document;
159     }
160 
161     /**
162      * Implements the whole parsing of the XML document. The XML pull parser is
163      * actually more of a tokenizer, and we are doing a classical recursive
164      * descent parsing (the method invokes itself for XML elements). Our
165      * approach to parsing does accept some illegal documents (more than one
166      * root element, for example). The assumption is that the DOM implementation
167      * throws the proper exceptions in these cases.
168      *
169      * @param parser The XML pull parser we're reading from.
170      * @param document The document we're building.
171      * @param node The node we're currently on (initially the document itself).
172      * @param endToken The token that will end this recursive call. Either
173      *        XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG.
174      *
175      * @throws XmlPullParserException If a parsing error occurs.
176      * @throws IOException If a general IO error occurs.
177      */
parse(KXmlParser parser, DocumentImpl document, Node node, int endToken)178     private void parse(KXmlParser parser, DocumentImpl document, Node node,
179             int endToken) throws XmlPullParserException, IOException {
180 
181         int token = parser.getEventType();
182 
183         /*
184          * The main parsing loop. The precondition is that we are already on the
185          * token to be processed. This holds for each iteration of the loop, so
186          * the inner statements have to ensure that (in particular the recursive
187          * call).
188          */
189         while (token != endToken && token != XmlPullParser.END_DOCUMENT) {
190             if (token == XmlPullParser.PROCESSING_INSTRUCTION) {
191                 /*
192                  * Found a processing instructions. We need to split the token
193                  * text at the first whitespace character.
194                  */
195                 String text = parser.getText();
196 
197                 int dot = text.indexOf(' ');
198 
199                 String target = (dot != -1 ? text.substring(0, dot) : text);
200                 String data = (dot != -1 ? text.substring(dot + 1) : "");
201 
202                 node.appendChild(document.createProcessingInstruction(target,
203                         data));
204             } else if (token == XmlPullParser.DOCDECL) {
205                 String name = parser.getRootElementName();
206                 String publicId = parser.getPublicId();
207                 String systemId = parser.getSystemId();
208                 document.appendChild(new DocumentTypeImpl(document, name, publicId, systemId));
209 
210             } else if (token == XmlPullParser.COMMENT) {
211                 /*
212                  * Found a comment. We simply take the token text, but we only
213                  * create a node if the client wants to see comments at all.
214                  */
215                 if (!ignoreComments) {
216                     node.appendChild(document.createComment(parser.getText()));
217                 }
218             } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) {
219                 /*
220                  * Found some ignorable whitespace. We only add it if the client
221                  * wants to see whitespace. Whitespace before and after the
222                  * document element is always ignored.
223                  */
224                 if (!ignoreElementContentWhitespace && document != node) {
225                     appendText(document, node, token, parser.getText());
226                 }
227             } else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) {
228                 /*
229                  * Found a piece of text (possibly encoded as a CDATA section).
230                  * That's the easiest case. We simply take it and create a new text node,
231                  * or merge with an adjacent text node.
232                  */
233                 appendText(document, node, token, parser.getText());
234             } else if (token == XmlPullParser.ENTITY_REF) {
235                 /*
236                  * Found an entity reference. If an entity resolver is
237                  * installed, we replace it by text (if possible). Otherwise we
238                  * add an entity reference node.
239                  */
240                 String entity = parser.getName();
241 
242                 if (entityResolver != null) {
243                     // TODO Implement this...
244                 }
245 
246                 String resolved = resolvePredefinedOrCharacterEntity(entity);
247                 if (resolved != null) {
248                     appendText(document, node, token, resolved);
249                 } else {
250                     node.appendChild(document.createEntityReference(entity));
251                 }
252             } else if (token == XmlPullParser.START_TAG) {
253                 /*
254                  * Found an element start tag. We create an element node with
255                  * the proper info and attributes. We then invoke parse()
256                  * recursively to handle the next level of nesting. When we
257                  * return from this call, we check that we are on the proper
258                  * element end tag. The whole handling differs somewhat
259                  * depending on whether the parser is namespace-aware or not.
260                  */
261                 if (namespaceAware) {
262                     // Collect info for element node
263                     String namespace = parser.getNamespace();
264                     String name = parser.getName();
265                     String prefix = parser.getPrefix();
266 
267                     if ("".equals(namespace)) {
268                         namespace = null;
269                     }
270 
271                     // Create element node and wire it correctly
272                     Element element = document.createElementNS(namespace, name);
273                     element.setPrefix(prefix);
274                     node.appendChild(element);
275 
276                     for (int i = 0; i < parser.getAttributeCount(); i++) {
277                         // Collect info for a single attribute node
278                         String attrNamespace = parser.getAttributeNamespace(i);
279                         String attrPrefix = parser.getAttributePrefix(i);
280                         String attrName = parser.getAttributeName(i);
281                         String attrValue = parser.getAttributeValue(i);
282 
283                         if ("".equals(attrNamespace)) {
284                             attrNamespace = null;
285                         }
286 
287                         // Create attribute node and wire it correctly
288                         Attr attr = document.createAttributeNS(attrNamespace, attrName);
289                         attr.setPrefix(attrPrefix);
290                         attr.setValue(attrValue);
291                         element.setAttributeNodeNS(attr);
292                     }
293 
294                     // Recursive descent
295                     token = parser.nextToken();
296                     parse(parser, document, element, XmlPullParser.END_TAG);
297 
298                     // Expect the element's end tag here
299                     parser.require(XmlPullParser.END_TAG, namespace, name);
300 
301                 } else {
302                     // Collect info for element node
303                     String name = parser.getName();
304 
305                     // Create element node and wire it correctly
306                     Element element = document.createElement(name);
307                     node.appendChild(element);
308 
309                     for (int i = 0; i < parser.getAttributeCount(); i++) {
310                         // Collect info for a single attribute node
311                         String attrName = parser.getAttributeName(i);
312                         String attrValue = parser.getAttributeValue(i);
313 
314                         // Create attribute node and wire it correctly
315                         Attr attr = document.createAttribute(attrName);
316                         attr.setValue(attrValue);
317                         element.setAttributeNode(attr);
318                     }
319 
320                     // Recursive descent
321                     token = parser.nextToken();
322                     parse(parser, document, element, XmlPullParser.END_TAG);
323 
324                     // Expect the element's end tag here
325                     parser.require(XmlPullParser.END_TAG, "", name);
326                 }
327             }
328 
329             token = parser.nextToken();
330         }
331     }
332 
333     /**
334      * @param token the XML pull parser token type, such as XmlPullParser.CDSECT
335      *      or XmlPullParser.ENTITY_REF.
336      */
appendText(DocumentImpl document, Node parent, int token, String text)337     private void appendText(DocumentImpl document, Node parent, int token, String text) {
338         // Ignore empty runs.
339         if (text.isEmpty()) {
340             return;
341         }
342         // Merge with any previous text node if possible.
343         if (coalescing || token != XmlPullParser.CDSECT) {
344             Node lastChild = parent.getLastChild();
345             if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) {
346                 Text textNode = (Text) lastChild;
347                 textNode.appendData(text);
348                 return;
349             }
350         }
351         // Okay, we really do need a new text node
352         parent.appendChild(token == XmlPullParser.CDSECT
353                 ? new CDATASectionImpl(document, text)
354                 : new TextImpl(document, text));
355     }
356 
357     @Override
setEntityResolver(EntityResolver resolver)358     public void setEntityResolver(EntityResolver resolver) {
359         entityResolver = resolver;
360     }
361 
362     @Override
setErrorHandler(ErrorHandler handler)363     public void setErrorHandler(ErrorHandler handler) {
364         errorHandler = handler;
365     }
366 
367     /**
368      * Controls whether this DocumentBuilder ignores comments.
369      */
setIgnoreComments(boolean value)370     public void setIgnoreComments(boolean value) {
371         ignoreComments = value;
372     }
373 
setCoalescing(boolean value)374     public void setCoalescing(boolean value) {
375         coalescing = value;
376     }
377 
378     /**
379      * Controls whether this DocumentBuilder ignores element content whitespace.
380      */
setIgnoreElementContentWhitespace(boolean value)381     public void setIgnoreElementContentWhitespace(boolean value) {
382         ignoreElementContentWhitespace = value;
383     }
384 
385     /**
386      * Controls whether this DocumentBuilder is namespace-aware.
387      */
setNamespaceAware(boolean value)388     public void setNamespaceAware(boolean value) {
389         namespaceAware = value;
390     }
391 
392     /**
393      * Returns the replacement text or null if {@code entity} isn't predefined.
394      */
resolvePredefinedOrCharacterEntity(String entityName)395     private String resolvePredefinedOrCharacterEntity(String entityName) {
396         // Character references, section 4.1 of the XML specification.
397         if (entityName.startsWith("#x")) {
398             return resolveCharacterReference(entityName.substring(2), 16);
399         } else if (entityName.startsWith("#")) {
400             return resolveCharacterReference(entityName.substring(1), 10);
401         }
402         // Predefined entities, section 4.6 of the XML specification.
403         if ("lt".equals(entityName)) {
404             return "<";
405         } else if ("gt".equals(entityName)) {
406             return ">";
407         } else if ("amp".equals(entityName)) {
408             return "&";
409         } else if ("apos".equals(entityName)) {
410             return "'";
411         } else if ("quot".equals(entityName)) {
412             return "\"";
413         } else {
414             return null;
415         }
416     }
417 
resolveCharacterReference(String value, int base)418     private String resolveCharacterReference(String value, int base) {
419         try {
420             int codePoint = Integer.parseInt(value, base);
421             if (Character.isBmpCodePoint(codePoint)) {
422                 return String.valueOf((char) codePoint);
423             } else {
424                 char[] surrogatePair = Character.toChars(codePoint);
425                 return new String(surrogatePair);
426             }
427         } catch (NumberFormatException ex) {
428             return null;
429         }
430     }
431 }
432