1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2011, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   11/17/99    aliu        Creation.
8 **********************************************************************
9 */
10 #ifndef RBT_PARS_H
11 #define RBT_PARS_H
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 #ifdef __cplusplus
17 
18 #include "unicode/uobject.h"
19 #include "unicode/parseerr.h"
20 #include "unicode/unorm.h"
21 #include "rbt.h"
22 #include "hash.h"
23 #include "uvector.h"
24 
25 U_NAMESPACE_BEGIN
26 
27 class TransliterationRuleData;
28 class UnicodeFunctor;
29 class ParseData;
30 class RuleHalf;
31 class ParsePosition;
32 class StringMatcher;
33 
34 class TransliteratorParser : public UMemory {
35 // Parser for transliterator rule strings: parse() stores its results in the public data members below.
36  public:
37 
38     /**
39      * PUBLIC data member.
40      * A Vector of TransliterationRuleData objects, one for each discrete group of rules in the rule set.
41      */
42     UVector dataVector;
43 
44     /**
45      * PUBLIC data member.
46      * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
47      */
48     UVector idBlockVector;
49 
50     /**
51      * PUBLIC data member containing the parsed compound filter, if any.
52      */
53     UnicodeSet* compoundFilter;
54 
55  private:
56 
57     /**
58      * The current data object for which we are parsing rules.
59      */
60     TransliterationRuleData* curData;
61 
62     UTransDirection direction; // presumably the direction given to parse() -- TODO confirm
63 
64     /**
65      * Parse error information.
66      */
67     UParseError parseError;
68 
69     /**
70      * Temporary symbol table used during parsing.
71      */
72     ParseData* parseData;
73 
74     /**
75      * Temporary vector of matcher variables.  When parsing is complete, this
76      * is copied into the array data.variables.  As with data.variables,
77      * element 0 corresponds to character data.variablesBase.
78      */
79     UVector variablesVector;
80 
81     /**
82      * Temporary table of variable names.  When parsing is complete, this is
83      * copied into data.variableNames.
84      */
85     Hashtable variableNames;
86 
87     /**
88      * String of stand-ins for segments.  Used during the parsing of a single
89      * rule.  segmentStandins.charAt(0) is the stand-in for "$1" and corresponds
90      * to StringMatcher object segmentObjects.elementAt(0), etc.
91      */
92     UnicodeString segmentStandins;
93 
94     /**
95      * Vector of StringMatcher objects for segments.  Used during the
96      * parsing of a single rule.
97      * segmentStandins.charAt(0) is the stand-in for "$1" and corresponds
98      * to StringMatcher object segmentObjects.elementAt(0), etc.
99      */
100     UVector segmentObjects;
101 
102     /**
103      * The next available stand-in for variables.  This starts at some point in
104      * the private use area (discovered dynamically) and increments up toward
105      * <code>variableLimit</code>.  At any point during parsing, available
106      * variables are <code>variableNext..variableLimit-1</code>.
107      */
108     UChar variableNext;
109 
110     /**
111      * The limit of the stand-in range for variables (one past the last available
112      * stand-in, per the range below).  This is discovered dynamically.  At any point
113      * during parsing, available variables are <code>variableNext..variableLimit-1</code>.
114      */
115     UChar variableLimit;
116 
117     /**
118      * When we encounter an undefined variable, we do not immediately signal
119      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
120      * Instead, we save the name of the undefined variable, and substitute
121      * in the placeholder char variableLimit - 1, and decrement
122      * variableLimit.
123      */
124     UnicodeString undefinedVariableName;
125 
126     /**
127      * The stand-in character for the 'dot' set, represented by '.' in
128      * patterns.  This is allocated the first time it is needed, and
129      * reused thereafter.
130      */
131     UChar dotStandIn;
132 
133 public:
134 
135     /** Constructor.
136      * @param statusReturn  error code reference; presumably receives any construction failure -- TODO confirm.
137      */
138     TransliteratorParser(UErrorCode &statusReturn);
139 
140     /**
141      * Destructor.
142      */
143     ~TransliteratorParser();
144 
145     /**
146      * Parse the given string as a sequence of rules, separated by newline
147      * characters ('\n'), in the given direction, and cause this object to
148      * implement those rules.  Any previous rules are discarded.  Typically
149      * this method is called exactly once after construction.
150      *
151      * After this call returns, query the public data members for results.
152      * The caller owns the 'data' and 'compoundFilter' data members after this
153      * call returns.  NOTE(review): no member is named 'data'; presumably the
154      * contents of dataVector are meant -- confirm.
155      * @param rules      rules, separated by ';' (NOTE(review): the text above says newline-separated -- confirm).
156      * @param direction  either FORWARD or REVERSE.
157      * @param pe         Struct to receive information on position
158      *                   of error if an error is encountered
159      * @param ec         Output param set to success/failure code.
160      */
161     void parse(const UnicodeString& rules,
162                UTransDirection direction,
163                UParseError& pe,
164                UErrorCode& ec);
165 
166     /**
167      * Return the compound filter parsed by parse().  Caller owns result.
168      * @return the compound filter parsed by parse().
169      */
170     UnicodeSet* orphanCompoundFilter();
171 
172 private:
173 
174     /** Parse the given rules in the given direction (inferred from the signature -- TODO confirm).
175      * @param rules      the rules to parse (passed by const reference).
176      * @param direction  either FORWARD or REVERSE.
177      * @param status     error code reference; presumably set on failure -- TODO confirm.
178      */
179     void parseRules(const UnicodeString& rules,
180                     UTransDirection direction,
181                     UErrorCode& status);
182 
183     /**
184      * MAIN PARSER.  Parse the next rule in the given rule string, starting
185      * at pos.  Return the index after the last character parsed.  Do not
186      * parse characters at or after limit.
187      *
188      * Important:  The character at pos must be a non-whitespace character
189      * that is not the comment character.
190      *
191      * This method handles quoting, escaping, and whitespace removal.  It parses the
192      * end-of-rule character.  It recognizes context and cursor indicators.  Once it does a
193      * lexical breakdown of the rule at pos, it creates a rule object and adds it to our rule list.
194      * @param rule       the rule string to parse.
195      * @param pos        the starting position.
196      * @param limit      pointer past the last character of the rule.
197      * @param status     error code reference; presumably set on failure -- TODO confirm.
198      * @return           the index after the last character parsed.
199      */
200     int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
201 
202     /** Set the variable range to [start, end] (inclusive).
203      * @param start    the start value of the range.
204      * @param end      the end value of the range.
205      * @param status   error code reference; presumably set on failure -- TODO confirm.
206      */
207     void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
208 
209     /**
210      * Assert that the given character is NOT within the variable range.
211      * If it is, return FALSE.  This is necessary to ensure that the
212      * variable range does not overlap characters used in a rule.
213      * @param ch     the given character.
214      * @return       True, if the given character is NOT within the variable range.
215      */
216     UBool checkVariableRange(UChar32 ch) const;
217 
218     /**
219      * Set the maximum backup to 'backup', in response to a pragma
220      * statement.
221      * @param backup    the new value to be set.
222      */
223     void pragmaMaximumBackup(int32_t backup);
224 
225     /**
226      * Begin normalizing all rules using the given mode, in response
227      * to a pragma statement.
228      * @param mode    the given mode.
229      */
230     void pragmaNormalizeRules(UNormalizationMode mode);
231 
232     /**
233      * Return true if the given rule looks like a pragma.
234      * @param rule the rule string to examine.
235      * @param pos offset to the first non-whitespace character of the rule.
236      * @param limit pointer past the last character of the rule.
237      * @return true if the given rule looks like a pragma.
238      */
239     static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
240 
241     /**
242      * Parse a pragma.  This method assumes resemblesPragma() has already returned true.
243      * @param rule the rule string containing the pragma.
244      * @param pos offset to the first non-whitespace character of the rule.
245      * @param limit pointer past the last character of the rule.
246      * @param status error code reference; presumably set on failure -- TODO confirm.
247      * @return the position index after the final ';' of the pragma,
248      * or -1 on failure.
249      */
250     int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
251 
252     /**
253      * Called by main parser upon syntax error.  Search the rule string for the probable end
254      * of the rule.  Of course, if the error is that the end of rule marker is missing, then the
255      * rule end will not be found.  In any case the rule start will be correctly reported.
256      * @param parseErrorCode error code.
257      * @param msg error description (this parameter is unnamed in the declaration).
258      * @param start position of first character of current rule.
259      * @param status error code reference; presumably set on failure -- TODO confirm.
260      * @return start position of first character of current rule.
261      */
262     int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
263                         UErrorCode& status);
264 
265     /**
266      * Parse a UnicodeSet out, store it, and return the stand-in character
267      * used to represent it.
268      * @param rule    the rule for UnicodeSet.
269      * @param pos     the position in pattern at which to start parsing.
270      * @param status  error code reference; presumably set on failure -- TODO confirm.
271      * @return        the stand-in character used to represent it.
272      */
273     UChar parseSet(const UnicodeString& rule,
274                    ParsePosition& pos,
275                    UErrorCode& status);
276 
277     /**
278      * Generate and return a stand-in for a new UnicodeFunctor.  Store the matcher (adopt it).
279      * @param adopted the UnicodeFunctor to be adopted.
280      * @param status  error code reference; presumably set on failure -- TODO confirm.
281      * @return        a stand-in for a new UnicodeFunctor.
282      */
283     UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
284 
285     /** Return the standin for segment seg (1-based).
286      * @param seg    the given segment.
287      * @param status error code reference; presumably set on failure -- TODO confirm.
288      * @return       the standIn character for the given segment.
289      */
290     UChar getSegmentStandin(int32_t seg, UErrorCode& status);
291 
292     /** Set the object for segment seg (1-based).
293      * @param seg      the given segment.
294      * @param adopted  the StringMatcher to be adopted.
295      * @param status   error code reference; presumably set on failure -- TODO confirm.
296      */
297     void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
298 
299     /**
300      * Return the stand-in for the dot set.  It is allocated the first time and reused thereafter.
301      * @param status error code reference; presumably set on failure -- TODO confirm.
302      * @return    the stand-in for the dot set.
303      */
304     UChar getDotStandIn(UErrorCode& status);
305 
306     /**
307      * Append the value of the given variable name to the given UnicodeString.
308      * @param name    the variable name to be appended.
309      * @param buf     the given UnicodeString to append to.
310      * @param status  error code reference; presumably set on failure -- TODO confirm.
311      */
312     void appendVariableDef(const UnicodeString& name,
313                            UnicodeString& buf,
314                            UErrorCode& status);
315 
316     /**
317      * Glue method to get around access restrictions in C++.  NOTE: the declaration below is commented out.
318      */
319     /*static Transliterator* createBasicInstance(const UnicodeString& id,
320                                                const UnicodeString* canonID);*/
321 
322     friend class RuleHalf;
323 
324     // Disallowed methods; no impl.
325     /**
326      * Copy constructor (declared private with no implementation, so instances cannot be copied).
327      */
328     TransliteratorParser(const TransliteratorParser&);
329 
330     /**
331      * Assignment operator (declared private with no implementation, so instances cannot be assigned).
332      */
333     TransliteratorParser& operator=(const TransliteratorParser&);
334 };
335 
336 U_NAMESPACE_END
337 
338 #endif /* #ifdef __cplusplus */
339 
340 /**
341  * Strip/convert the following from the transliterator rules: comments, newlines,
342  * white space at the beginning and end of a line, and \u notation (which is unescaped).
343  * @param source     the rules text to process.
344  * @param sourceLen  length of source (presumably in UChars -- TODO confirm).
345  * @param target     output buffer; the target must be equal in size to the source.
346  * @param status     error code pointer (presumably set on failure -- TODO confirm).
347  * @return NOTE(review): presumably the length of the text written to target -- confirm in the implementation.
348  * @internal
349  */
350 U_CAPI int32_t
351 utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
352 
353 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
354 
355 #endif
356