1 /*
2  * Copyright (C) 2007 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.apache.harmony.xml.parsers;
18 
19 import java.io.IOException;
20 import java.net.URL;
21 import java.net.URLConnection;
22 import javax.xml.parsers.DocumentBuilder;
23 import libcore.io.IoUtils;
24 import org.apache.harmony.xml.dom.CDATASectionImpl;
25 import org.apache.harmony.xml.dom.DOMImplementationImpl;
26 import org.apache.harmony.xml.dom.DocumentImpl;
27 import org.apache.harmony.xml.dom.DocumentTypeImpl;
28 import org.apache.harmony.xml.dom.TextImpl;
29 import org.kxml2.io.KXmlParser;
30 import org.w3c.dom.Attr;
31 import org.w3c.dom.DOMImplementation;
32 import org.w3c.dom.Document;
33 import org.w3c.dom.DocumentType;
34 import org.w3c.dom.Element;
35 import org.w3c.dom.Node;
36 import org.w3c.dom.Text;
37 import org.xml.sax.EntityResolver;
38 import org.xml.sax.ErrorHandler;
39 import org.xml.sax.InputSource;
40 import org.xml.sax.SAXException;
41 import org.xml.sax.SAXParseException;
42 import org.xml.sax.helpers.LocatorImpl;
43 import org.xmlpull.v1.XmlPullParser;
44 import org.xmlpull.v1.XmlPullParserException;
45 
46 /**
47  * Builds a DOM using KXmlParser.
48  */
49 class DocumentBuilderImpl extends DocumentBuilder {
50 
51     private static DOMImplementationImpl dom = DOMImplementationImpl.getInstance();
52 
53     private boolean coalescing;
54     private EntityResolver entityResolver;
55     private ErrorHandler errorHandler;
56     private boolean ignoreComments;
57     private boolean ignoreElementContentWhitespace;
58     private boolean namespaceAware;
59     // adding a new field? don't forget to update reset().
60 
reset()61     @Override public void reset() {
62         coalescing = false;
63         entityResolver = null;
64         errorHandler = null;
65         ignoreComments = false;
66         ignoreElementContentWhitespace = false;
67         namespaceAware = false;
68     }
69 
70     @Override
getDOMImplementation()71     public DOMImplementation getDOMImplementation() {
72         return dom;
73     }
74 
75     @Override
isNamespaceAware()76     public boolean isNamespaceAware() {
77         return namespaceAware;
78     }
79 
80     @Override
isValidating()81     public boolean isValidating() {
82         return false;
83     }
84 
85     @Override
newDocument()86     public Document newDocument() {
87         return dom.createDocument(null, null, null);
88     }
89 
90     @Override
parse(InputSource source)91     public Document parse(InputSource source) throws SAXException, IOException {
92         if (source == null) {
93             throw new IllegalArgumentException("source == null");
94         }
95 
96         String namespaceURI = null;
97         String qualifiedName = null;
98         DocumentType doctype = null;
99         String inputEncoding = source.getEncoding();
100         String systemId = source.getSystemId();
101         DocumentImpl document = new DocumentImpl(
102                 dom, namespaceURI, qualifiedName, doctype, inputEncoding);
103         document.setDocumentURI(systemId);
104 
105         KXmlParser parser = new KXmlParser();
106         try {
107             parser.keepNamespaceAttributes();
108             parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, namespaceAware);
109 
110             if (source.getByteStream() != null) {
111                 parser.setInput(source.getByteStream(), inputEncoding);
112             } else if (source.getCharacterStream() != null) {
113                 parser.setInput(source.getCharacterStream());
114             } else if (systemId != null) {
115                 URL url = new URL(systemId);
116                 URLConnection urlConnection = url.openConnection();
117                 urlConnection.connect();
118                 // TODO: if null, extract the inputEncoding from the Content-Type header?
119                 parser.setInput(urlConnection.getInputStream(), inputEncoding);
120             } else {
121                 throw new SAXParseException("InputSource needs a stream, reader or URI", null);
122             }
123 
124             if (parser.nextToken() == XmlPullParser.END_DOCUMENT) {
125                 throw new SAXParseException("Unexpected end of document", null);
126             }
127 
128             parse(parser, document, document, XmlPullParser.END_DOCUMENT);
129 
130             parser.require(XmlPullParser.END_DOCUMENT, null, null);
131         } catch (XmlPullParserException ex) {
132             if (ex.getDetail() instanceof IOException) {
133                 throw (IOException) ex.getDetail();
134             }
135             if (ex.getDetail() instanceof RuntimeException) {
136                 throw (RuntimeException) ex.getDetail();
137             }
138 
139             LocatorImpl locator = new LocatorImpl();
140 
141             locator.setPublicId(source.getPublicId());
142             locator.setSystemId(systemId);
143             locator.setLineNumber(ex.getLineNumber());
144             locator.setColumnNumber(ex.getColumnNumber());
145 
146             SAXParseException newEx = new SAXParseException(ex.getMessage(), locator);
147 
148             if (errorHandler != null) {
149                 errorHandler.error(newEx);
150             }
151 
152             throw newEx;
153         } finally {
154             IoUtils.closeQuietly(parser);
155         }
156 
157         return document;
158     }
159 
160     /**
161      * Implements the whole parsing of the XML document. The XML pull parser is
162      * actually more of a tokenizer, and we are doing a classical recursive
163      * descent parsing (the method invokes itself for XML elements). Our
164      * approach to parsing does accept some illegal documents (more than one
165      * root element, for example). The assumption is that the DOM implementation
166      * throws the proper exceptions in these cases.
167      *
168      * @param parser The XML pull parser we're reading from.
169      * @param document The document we're building.
170      * @param node The node we're currently on (initially the document itself).
171      * @param endToken The token that will end this recursive call. Either
172      *        XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG.
173      *
174      * @throws XmlPullParserException If a parsing error occurs.
175      * @throws IOException If a general IO error occurs.
176      */
parse(KXmlParser parser, DocumentImpl document, Node node, int endToken)177     private void parse(KXmlParser parser, DocumentImpl document, Node node,
178             int endToken) throws XmlPullParserException, IOException {
179 
180         int token = parser.getEventType();
181 
182         /*
183          * The main parsing loop. The precondition is that we are already on the
184          * token to be processed. This holds for each iteration of the loop, so
185          * the inner statements have to ensure that (in particular the recursive
186          * call).
187          */
188         while (token != endToken && token != XmlPullParser.END_DOCUMENT) {
189             if (token == XmlPullParser.PROCESSING_INSTRUCTION) {
190                 /*
191                  * Found a processing instructions. We need to split the token
192                  * text at the first whitespace character.
193                  */
194                 String text = parser.getText();
195 
196                 int dot = text.indexOf(' ');
197 
198                 String target = (dot != -1 ? text.substring(0, dot) : text);
199                 String data = (dot != -1 ? text.substring(dot + 1) : "");
200 
201                 node.appendChild(document.createProcessingInstruction(target,
202                         data));
203             } else if (token == XmlPullParser.DOCDECL) {
204                 String name = parser.getRootElementName();
205                 String publicId = parser.getPublicId();
206                 String systemId = parser.getSystemId();
207                 document.appendChild(new DocumentTypeImpl(document, name, publicId, systemId));
208 
209             } else if (token == XmlPullParser.COMMENT) {
210                 /*
211                  * Found a comment. We simply take the token text, but we only
212                  * create a node if the client wants to see comments at all.
213                  */
214                 if (!ignoreComments) {
215                     node.appendChild(document.createComment(parser.getText()));
216                 }
217             } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) {
218                 /*
219                  * Found some ignorable whitespace. We only add it if the client
220                  * wants to see whitespace. Whitespace before and after the
221                  * document element is always ignored.
222                  */
223                 if (!ignoreElementContentWhitespace && document != node) {
224                     appendText(document, node, token, parser.getText());
225                 }
226             } else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) {
227                 /*
228                  * Found a piece of text (possibly encoded as a CDATA section).
229                  * That's the easiest case. We simply take it and create a new text node,
230                  * or merge with an adjacent text node.
231                  */
232                 appendText(document, node, token, parser.getText());
233             } else if (token == XmlPullParser.ENTITY_REF) {
234                 /*
235                  * Found an entity reference. If an entity resolver is
236                  * installed, we replace it by text (if possible). Otherwise we
237                  * add an entity reference node.
238                  */
239                 String entity = parser.getName();
240 
241                 if (entityResolver != null) {
242                     // TODO Implement this...
243                 }
244 
245                 String resolved = resolvePredefinedOrCharacterEntity(entity);
246                 if (resolved != null) {
247                     appendText(document, node, token, resolved);
248                 } else {
249                     node.appendChild(document.createEntityReference(entity));
250                 }
251             } else if (token == XmlPullParser.START_TAG) {
252                 /*
253                  * Found an element start tag. We create an element node with
254                  * the proper info and attributes. We then invoke parse()
255                  * recursively to handle the next level of nesting. When we
256                  * return from this call, we check that we are on the proper
257                  * element end tag. The whole handling differs somewhat
258                  * depending on whether the parser is namespace-aware or not.
259                  */
260                 if (namespaceAware) {
261                     // Collect info for element node
262                     String namespace = parser.getNamespace();
263                     String name = parser.getName();
264                     String prefix = parser.getPrefix();
265 
266                     if ("".equals(namespace)) {
267                         namespace = null;
268                     }
269 
270                     // Create element node and wire it correctly
271                     Element element = document.createElementNS(namespace, name);
272                     element.setPrefix(prefix);
273                     node.appendChild(element);
274 
275                     for (int i = 0; i < parser.getAttributeCount(); i++) {
276                         // Collect info for a single attribute node
277                         String attrNamespace = parser.getAttributeNamespace(i);
278                         String attrPrefix = parser.getAttributePrefix(i);
279                         String attrName = parser.getAttributeName(i);
280                         String attrValue = parser.getAttributeValue(i);
281 
282                         if ("".equals(attrNamespace)) {
283                             attrNamespace = null;
284                         }
285 
286                         // Create attribute node and wire it correctly
287                         Attr attr = document.createAttributeNS(attrNamespace, attrName);
288                         attr.setPrefix(attrPrefix);
289                         attr.setValue(attrValue);
290                         element.setAttributeNodeNS(attr);
291                     }
292 
293                     // Recursive descent
294                     token = parser.nextToken();
295                     parse(parser, document, element, XmlPullParser.END_TAG);
296 
297                     // Expect the element's end tag here
298                     parser.require(XmlPullParser.END_TAG, namespace, name);
299 
300                 } else {
301                     // Collect info for element node
302                     String name = parser.getName();
303 
304                     // Create element node and wire it correctly
305                     Element element = document.createElement(name);
306                     node.appendChild(element);
307 
308                     for (int i = 0; i < parser.getAttributeCount(); i++) {
309                         // Collect info for a single attribute node
310                         String attrName = parser.getAttributeName(i);
311                         String attrValue = parser.getAttributeValue(i);
312 
313                         // Create attribute node and wire it correctly
314                         Attr attr = document.createAttribute(attrName);
315                         attr.setValue(attrValue);
316                         element.setAttributeNode(attr);
317                     }
318 
319                     // Recursive descent
320                     token = parser.nextToken();
321                     parse(parser, document, element, XmlPullParser.END_TAG);
322 
323                     // Expect the element's end tag here
324                     parser.require(XmlPullParser.END_TAG, "", name);
325                 }
326             }
327 
328             token = parser.nextToken();
329         }
330     }
331 
332     /**
333      * @param token the XML pull parser token type, such as XmlPullParser.CDSECT
334      *      or XmlPullParser.ENTITY_REF.
335      */
appendText(DocumentImpl document, Node parent, int token, String text)336     private void appendText(DocumentImpl document, Node parent, int token, String text) {
337         // Ignore empty runs.
338         if (text.isEmpty()) {
339             return;
340         }
341         // Merge with any previous text node if possible.
342         if (coalescing || token != XmlPullParser.CDSECT) {
343             Node lastChild = parent.getLastChild();
344             if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) {
345                 Text textNode = (Text) lastChild;
346                 textNode.appendData(text);
347                 return;
348             }
349         }
350         // Okay, we really do need a new text node
351         parent.appendChild(token == XmlPullParser.CDSECT
352                 ? new CDATASectionImpl(document, text)
353                 : new TextImpl(document, text));
354     }
355 
356     @Override
setEntityResolver(EntityResolver resolver)357     public void setEntityResolver(EntityResolver resolver) {
358         entityResolver = resolver;
359     }
360 
361     @Override
setErrorHandler(ErrorHandler handler)362     public void setErrorHandler(ErrorHandler handler) {
363         errorHandler = handler;
364     }
365 
366     /**
367      * Controls whether this DocumentBuilder ignores comments.
368      */
setIgnoreComments(boolean value)369     public void setIgnoreComments(boolean value) {
370         ignoreComments = value;
371     }
372 
setCoalescing(boolean value)373     public void setCoalescing(boolean value) {
374         coalescing = value;
375     }
376 
377     /**
378      * Controls whether this DocumentBuilder ignores element content whitespace.
379      */
setIgnoreElementContentWhitespace(boolean value)380     public void setIgnoreElementContentWhitespace(boolean value) {
381         ignoreElementContentWhitespace = value;
382     }
383 
384     /**
385      * Controls whether this DocumentBuilder is namespace-aware.
386      */
setNamespaceAware(boolean value)387     public void setNamespaceAware(boolean value) {
388         namespaceAware = value;
389     }
390 
391     /**
392      * Returns the replacement text or null if {@code entity} isn't predefined.
393      */
resolvePredefinedOrCharacterEntity(String entityName)394     private String resolvePredefinedOrCharacterEntity(String entityName) {
395         // Character references, section 4.1 of the XML specification.
396         if (entityName.startsWith("#x")) {
397             return resolveCharacterReference(entityName.substring(2), 16);
398         } else if (entityName.startsWith("#")) {
399             return resolveCharacterReference(entityName.substring(1), 10);
400         }
401         // Predefined entities, section 4.6 of the XML specification.
402         if ("lt".equals(entityName)) {
403             return "<";
404         } else if ("gt".equals(entityName)) {
405             return ">";
406         } else if ("amp".equals(entityName)) {
407             return "&";
408         } else if ("apos".equals(entityName)) {
409             return "'";
410         } else if ("quot".equals(entityName)) {
411             return "\"";
412         } else {
413             return null;
414         }
415     }
416 
resolveCharacterReference(String value, int base)417     private String resolveCharacterReference(String value, int base) {
418         try {
419             int codePoint = Integer.parseInt(value, base);
420             if (Character.isBmpCodePoint(codePoint)) {
421                 return String.valueOf((char) codePoint);
422             } else {
423                 char[] surrogatePair = Character.toChars(codePoint);
424                 return new String(surrogatePair);
425             }
426         } catch (NumberFormatException ex) {
427             return null;
428         }
429     }
430 }
431