1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2005, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  xmlparser.h
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004jul21
16 *   created by: Andy Heninger
17 *
18 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
19 * Not suitable for production use. Not supported.
20 * Not conformant. Not efficient.
21 * But very small.
22 */
23 
24 #ifndef __XMLPARSER_H__
25 #define __XMLPARSER_H__
26 
27 #include "unicode/uobject.h"
28 #include "unicode/unistr.h"
29 #include "unicode/regex.h"
30 #include "uvector.h"
31 #include "hash.h"
32 
33 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
34 
/**
 * Kinds of child node that an element can hold.
 * UXMLElement::getChild() reports one of these values through its
 * output parameter alongside the returned UObject pointer.
 */
enum UXMLNodeType {
    /** Node type string (text contents), stored as a UnicodeString. */
    UXML_NODE_TYPE_STRING,
    /** Node type element, stored as a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT,
    /** Equals the number of node types listed above. */
    UXML_NODE_TYPE_COUNT
};
42 
43 U_NAMESPACE_BEGIN
44 
45 class UXMLParser;
46 
47 /**
48  * This class represents an element node in a parsed XML tree.
49  */
50 class U_TOOLUTIL_API UXMLElement : public UObject {
51 public:
52     /**
53      * Destructor.
54      */
55     virtual ~UXMLElement();
56 
57     /**
58      * Get the tag name of this element.
59      */
60     const UnicodeString &getTagName() const;
61     /**
62      * Get the text contents of the element.
63      * Append the contents of all text child nodes.
64      * @param recurse If TRUE, also recursively appends the contents of all
65      *        text child nodes of element children.
66      * @return The text contents.
67      */
68     UnicodeString getText(UBool recurse) const;
69     /**
70      * Get the number of attributes.
71      */
72     int32_t countAttributes() const;
73     /**
74      * Get the i-th attribute.
75      * @param i Index of the attribute.
76      * @param name Output parameter, receives the attribute name.
77      * @param value Output parameter, receives the attribute value.
78      * @return A pointer to the attribute value (may be &value or a pointer to an
79      *         internal string object), or NULL if i is out of bounds.
80      */
81     const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
82     /**
83      * Get the value of the attribute with the given name.
84      * @param name Attribute name to be looked up.
85      * @return A pointer to the attribute value, or NULL if this element
86      * does not have this attribute.
87      */
88     const UnicodeString *getAttribute(const UnicodeString &name) const;
89     /**
90      * Get the number of child nodes.
91      */
92     int32_t countChildren() const;
93     /**
94      * Get the i-th child node.
95      * @param i Index of the child node.
96      * @param type The child node type.
97      * @return A pointer to the child node object, or NULL if i is out of bounds.
98      */
99     const UObject *getChild(int32_t i, UXMLNodeType &type) const;
100     /**
101      * Get the next child element node, skipping non-element child nodes.
102      * @param i Enumeration index; initialize to 0 before getting the first child element.
103      * @return A pointer to the next child element, or NULL if there is none.
104      */
105     const UXMLElement *nextChildElement(int32_t &i) const;
106     /**
107      * Get the immediate child element with the given name.
108      * If there are multiple child elements with this name, then return
109      * the first one.
110      * @param name Element name to be looked up.
111      * @return A pointer to the element node, or NULL if this element
112      * does not have this immediate child element.
113      */
114     const UXMLElement *getChildElement(const UnicodeString &name) const;
115 
116     /**
117      * ICU "poor man's RTTI", returns a UClassID for the actual class.
118      */
119     virtual UClassID getDynamicClassID() const;
120 
121     /**
122      * ICU "poor man's RTTI", returns a UClassID for this class.
123      */
124     static UClassID U_EXPORT2 getStaticClassID();
125 
126 private:
127     // prevent default construction etc.
128     UXMLElement();
129     UXMLElement(const UXMLElement &other);
130     UXMLElement &operator=(const UXMLElement &other);
131 
132     void appendText(UnicodeString &text, UBool recurse) const;
133 
134     friend class UXMLParser;
135 
136     UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
137 
138     const UXMLParser *fParser;
139     const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
140     UnicodeString       fContent;        // The text content of this node.  All element content is
141                                          //   concatenated even when there are intervening nested elements
142                                          //   (which doesn't happen with most xml files we care about)
143                                          //   Sections of content containing only white space are dropped,
144                                          //   which gets rid  the bogus white space content from
145                                          //   elements which are primarily containers for nested elements.
146     UVector             fAttNames;       // A vector containing the names of this element's attributes
147                                          //    The names are UnicodeString objects, owned by the UXMLParser.
148     UVector             fAttValues;      // A vector containing the attribute values for
149                                          //    this element's attributes.  The order is the same
150                                          //    as that of the attribute name vector.
151 
152     UVector             fChildren;       // The child nodes of this element (a Vector)
153 
154     UXMLElement        *fParent;         // A pointer to the parent element of this element.
155 };
156 
157 /**
158  * A simple XML parser; it is neither efficient nor conformant and only useful for
159  * restricted types of XML documents.
160  *
161  * The parse methods parse whole documents and return the parse trees via their
162  * root elements.
163  */
164 class U_TOOLUTIL_API UXMLParser : public UObject {
165 public:
166     /**
167      * Create an XML parser.
168      */
169     static UXMLParser *createParser(UErrorCode &errorCode);
170     /**
171      * Destructor.
172      */
173     virtual ~UXMLParser();
174 
175     /**
176      * Parse an XML document, create the entire document tree, and
177      * return a pointer to the root element of the parsed tree.
178      * The caller must delete the element.
179      */
180     UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
181     /**
182      * Parse an XML file, create the entire document tree, and
183      * return a pointer to the root element of the parsed tree.
184      * The caller must delete the element.
185      */
186     UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
187 
188     /**
189      * ICU "poor man's RTTI", returns a UClassID for the actual class.
190      */
191     virtual UClassID getDynamicClassID() const;
192 
193     /**
194      * ICU "poor man's RTTI", returns a UClassID for this class.
195      */
196     static UClassID U_EXPORT2 getStaticClassID();
197 
198 private:
199     // prevent default construction etc.
200     UXMLParser();
201     UXMLParser(const UXMLParser &other);
202     UXMLParser &operator=(const UXMLParser &other);
203 
204     // constructor
205     UXMLParser(UErrorCode &status);
206 
207     void           parseMisc(UErrorCode &status);
208     UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
209     void           error(const char *message, UErrorCode &status);
210     UnicodeString  scanContent(UErrorCode &status);
211     void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
212 
213     const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
214 public:
215     // public for UXMLElement only
216     const UnicodeString *findName(const UnicodeString &s) const;
217 private:
218 
219     // There is one ICU regex matcher for each of the major XML syntax items
220     //  that are recognized.
221     RegexMatcher mXMLDecl;
222     RegexMatcher mXMLComment;
223     RegexMatcher mXMLSP;
224     RegexMatcher mXMLDoctype;
225     RegexMatcher mXMLPI;
226     RegexMatcher mXMLElemStart;
227     RegexMatcher mXMLElemEnd;
228     RegexMatcher mXMLElemEmpty;
229     RegexMatcher mXMLCharData;
230     RegexMatcher mAttrValue;
231     RegexMatcher mAttrNormalizer;
232     RegexMatcher mNewLineNormalizer;
233     RegexMatcher mAmps;
234 
235     Hashtable             fNames;           // interned element/attribute name strings
236     UStack                fElementStack;    // Stack holds the parent elements when nested
237                                             //    elements are being parsed.  All items on this
238                                             //    stack are of type UXMLElement.
239     int32_t               fPos;             // String index of the current scan position in
240                                             //    xml source (in fSrc).
241     UnicodeString         fOneLF;
242 };
243 
244 U_NAMESPACE_END
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
246 
247 #endif
248