1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2004-2005, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  xmlparser.h
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2004jul21
14 *   created by: Andy Heninger
15 *
16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
17 * Not suitable for production use. Not supported.
18 * Not conformant. Not efficient.
19 * But very small.
20 */
21 
22 #ifndef __XMLPARSER_H__
23 #define __XMLPARSER_H__
24 
25 #include "unicode/uobject.h"
26 #include "unicode/unistr.h"
27 #include "unicode/regex.h"
28 #include "uvector.h"
29 #include "hash.h"
30 
31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
32 
/**
 * Kinds of child nodes of a UXMLElement.
 * UXMLElement::getChild() takes a reference to a value of this type, which its
 * documentation describes as the type of the child node.
 * The enum is declared before U_NAMESPACE_BEGIN, i.e. outside the namespace
 * block that encloses the classes in this header.
 */
33 enum UXMLNodeType {
34     /** Node type string (text contents), stored as a UnicodeString. */
35     UXML_NODE_TYPE_STRING,
36     /** Node type element, stored as a UXMLElement. */
37     UXML_NODE_TYPE_ELEMENT,
       /**
        * Not a node type of its own: as the last enumerator with no explicit
        * values assigned, it equals the number of node types listed above (2).
        */
38     UXML_NODE_TYPE_COUNT
39 };
40 
41 U_NAMESPACE_BEGIN
42 
// Forward declaration: UXMLElement (below) names UXMLParser as a friend and
// uses const UXMLParser pointers before the parser class itself is defined.
43 class UXMLParser;
44
45 /**
46  * This class represents an element node in a parsed XML tree.
    *
    * Every constructor is declared private and UXMLParser is a friend, so
    * elements cannot be constructed, copied or assigned by client code.
    * UXMLParser::parse() and UXMLParser::parseFile() return the root element.
47  */
48 class U_TOOLUTIL_API UXMLElement : public UObject {
49 public:
50     /**
51      * Destructor.
52      */
53     virtual ~UXMLElement();
54
55     /**
56      * Get the tag name of this element.
57      */
58     const UnicodeString &getTagName() const;
59     /**
60      * Get the text contents of the element.
61      * Append the contents of all text child nodes.
62      * @param recurse If TRUE, also recursively appends the contents of all
63      *        text child nodes of element children.
64      * @return The text contents.
65      */
66     UnicodeString getText(UBool recurse) const;
67     /**
68      * Get the number of attributes.
69      */
70     int32_t countAttributes() const;
71     /**
72      * Get the i-th attribute.
73      * @param i Index of the attribute.
74      * @param name Output parameter, receives the attribute name.
75      * @param value Output parameter, receives the attribute value.
76      * @return A pointer to the attribute value (may be &value or a pointer to an
77      *         internal string object), or NULL if i is out of bounds.
78      */
79     const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
80     /**
81      * Get the value of the attribute with the given name.
82      * @param name Attribute name to be looked up.
83      * @return A pointer to the attribute value, or NULL if this element
84      * does not have this attribute.
85      */
86     const UnicodeString *getAttribute(const UnicodeString &name) const;
87     /**
88      * Get the number of child nodes.
89      */
90     int32_t countChildren() const;
91     /**
92      * Get the i-th child node.
93      * @param i Index of the child node.
94      * @param type The child node type. Passed by non-const reference, so
        *        presumably written by this method -- confirm in the implementation.
95      * @return A pointer to the child node object, or NULL if i is out of bounds.
96      */
97     const UObject *getChild(int32_t i, UXMLNodeType &type) const;
98     /**
99      * Get the next child element node, skipping non-element child nodes.
100      * @param i Enumeration index; initialize to 0 before getting the first child element.
         *        Passed by non-const reference, so presumably advanced by each
         *        call -- confirm in the implementation.
101      * @return A pointer to the next child element, or NULL if there is none.
102      */
103     const UXMLElement *nextChildElement(int32_t &i) const;
104     /**
105      * Get the immediate child element with the given name.
106      * If there are multiple child elements with this name, then return
107      * the first one.
108      * @param name Element name to be looked up.
109      * @return A pointer to the element node, or NULL if this element
110      * does not have this immediate child element.
111      */
112     const UXMLElement *getChildElement(const UnicodeString &name) const;
113
114     /**
115      * ICU "poor man's RTTI", returns a UClassID for the actual class.
116      */
117     virtual UClassID getDynamicClassID() const;
118
119     /**
120      * ICU "poor man's RTTI", returns a UClassID for this class.
121      */
122     static UClassID U_EXPORT2 getStaticClassID();
123
124 private:
125     // prevent default construction etc.
126     UXMLElement();
127     UXMLElement(const UXMLElement &other);
128     UXMLElement &operator=(const UXMLElement &other);
129
        // Private helper that receives the destination string by reference.
        // NOTE(review): presumably what getText() uses to accumulate text -- confirm.
130     void appendText(UnicodeString &text, UBool recurse) const;
131
        // Friendship gives UXMLParser access to the private constructor and
        // data members declared below.
132     friend class UXMLParser;
133
        // The only constructor that takes arguments; it is private, so it is
        // reachable only from this class and from the friend UXMLParser.
134     UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
135
        // NOTE(review): presumably the parser that created this element -- confirm.
136     const UXMLParser *fParser;
137     const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
138     UnicodeString       fContent;        // The text content of this node.  All element content is
139                                          //   concatenated even when there are intervening nested elements
140                                          //   (which doesn't happen with most xml files we care about)
141                                          //   Sections of content containing only white space are dropped,
142                                          //   which gets rid of the bogus white space content from
143                                          //   elements which are primarily containers for nested elements.
144     UVector             fAttNames;       // A vector containing the names of this element's attributes
145                                          //    The names are UnicodeString objects, owned by the UXMLParser.
146     UVector             fAttValues;      // A vector containing the attribute values for
147                                          //    this element's attributes.  The order is the same
148                                          //    as that of the attribute name vector.
149
150     UVector             fChildren;       // The child nodes of this element (a Vector)
151
152     UXMLElement        *fParent;         // A pointer to the parent element of this element.
153 };
154 
155 /**
156  * A simple XML parser; it is neither efficient nor conformant and only useful for
157  * restricted types of XML documents.
158  *
159  * The parse methods parse whole documents and return the parse trees via their
160  * root elements.
     *
     * Instances are obtained through the static createParser() factory; every
     * constructor of this class is declared private.
161  */
162 class U_TOOLUTIL_API UXMLParser : public UObject {
163 public:
164     /**
165      * Create an XML parser.
         * @param errorCode ICU error code, passed by reference.
         * @return A pointer to a UXMLParser. NOTE(review): presumably heap-allocated
         *         and owned by the caller (the destructor is public) -- confirm in
         *         the implementation.
166      */
167     static UXMLParser *createParser(UErrorCode &errorCode);
168     /**
169      * Destructor.
170      */
171     virtual ~UXMLParser();
172
173     /**
174      * Parse an XML document, create the entire document tree, and
175      * return a pointer to the root element of the parsed tree.
176      * The caller must delete the element.
         * @param src The XML document text.
         * @param errorCode ICU error code, passed by reference.
177      */
178     UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
179     /**
180      * Parse an XML file, create the entire document tree, and
181      * return a pointer to the root element of the parsed tree.
182      * The caller must delete the element.
         * @param filename Name of the file to parse.
         * @param errorCode ICU error code, passed by reference.
183      */
184     UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
185
186     /**
187      * ICU "poor man's RTTI", returns a UClassID for the actual class.
188      */
189     virtual UClassID getDynamicClassID() const;
190
191     /**
192      * ICU "poor man's RTTI", returns a UClassID for this class.
193      */
194     static UClassID U_EXPORT2 getStaticClassID();
195
196 private:
197     // prevent default construction etc.
198     UXMLParser();
199     UXMLParser(const UXMLParser &other);
200     UXMLParser &operator=(const UXMLParser &other);
201
202     // constructor
203     UXMLParser(UErrorCode &status);
204
        // Private helpers. NOTE(review): presumably used by parse(); their
        // behavior is defined in the implementation and cannot be read off
        // this header.
205     void           parseMisc(UErrorCode &status);
206     UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
207     void           error(const char *message, UErrorCode &status);
208     UnicodeString  scanContent(UErrorCode &status);
209     void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
210
        // Returns a pointer to a const UnicodeString.
        // NOTE(review): presumably the interned copy of s kept in fNames (see the
        // fNames comment below) -- confirm in the implementation.
211     const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
212 public:
213     // public for UXMLElement only
        // (UXMLElement is not declared a friend of this class, so it can reach
        //  only public members; hence the temporary switch to public here.)
214     const UnicodeString *findName(const UnicodeString &s) const;
215 private:
216
217     // There is one ICU regex matcher for each of the major XML syntax items
218     //  that are recognized.
219     RegexMatcher mXMLDecl;
220     RegexMatcher mXMLComment;
221     RegexMatcher mXMLSP;
222     RegexMatcher mXMLDoctype;
223     RegexMatcher mXMLPI;
224     RegexMatcher mXMLElemStart;
225     RegexMatcher mXMLElemEnd;
226     RegexMatcher mXMLElemEmpty;
227     RegexMatcher mXMLCharData;
228     RegexMatcher mAttrValue;
229     RegexMatcher mAttrNormalizer;
230     RegexMatcher mNewLineNormalizer;
231     RegexMatcher mAmps;
232
233     Hashtable             fNames;           // interned element/attribute name strings
234     UStack                fElementStack;    // Stack holds the parent elements when nested
235                                             //    elements are being parsed.  All items on this
236                                             //    stack are of type UXMLElement.
237     int32_t               fPos;             // String index of the current scan position in
238                                             //    xml source (in fSrc).
                                                // NOTE(review): no member named fSrc is declared
                                                //    in this class as shown here; the reference
                                                //    above may be stale.
        // NOTE(review): undocumented; the name suggests a string holding a single
        // LF, presumably used for newline normalization -- confirm.
239     UnicodeString         fOneLF;
240 };
241 
242 U_NAMESPACE_END
243 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
244 
245 #endif
246