1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **************************************************************************
5  *   Copyright (c) 2002-2010, International Business Machines Corporation *
6  *   and others.  All Rights Reserved.                                    *
7  **************************************************************************
8  *   Date        Name        Description                                  *
9  *   01/28/2002  aliu        Creation.                                    *
10  **************************************************************************
11  */
12 #ifndef TRIDPARS_H
13 #define TRIDPARS_H
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_TRANSLITERATION
18 
19 #include "unicode/uobject.h"
20 #include "unicode/unistr.h"
21 
22 U_NAMESPACE_BEGIN
23 
24 class Transliterator;
25 class UnicodeSet;
26 class UVector;
27 
28 /**
29  * Parsing component for transliterator IDs.  This class contains only
30  * static members; it cannot be instantiated.  Methods in this class
31  * parse various ID formats, including the following:
32  *
33  * A basic ID, which contains source, target, and variant, but no
34  * filter and no explicit inverse.  Examples include
35  * "Latin-Greek/UNGEGN" and "Null".
36  *
37  * A single ID, which is a basic ID plus optional filter and optional
38  * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
39  * "Lower (Upper)".
40  *
41  * A compound ID, which is a sequence of one or more single IDs,
42  * separated by semicolons, with optional forward and reverse global
43  * filters.  The global filters are UnicodeSet patterns prepended or
44  * appended to the IDs, separated by semicolons.  An appended filter
45  * must be enclosed in parentheses and applies in the reverse
46  * direction.
47  *
48  * @author Alan Liu
49  */
50 class TransliteratorIDParser /* not : public UObject because all methods are static */ {
51 
52  public:
53 
54     /**
55      * A structure containing the parsed data of a filtered ID, that
56      * is, a basic ID optionally with a filter.
57      *
58      * 'source' and 'target' will always be non-null.  The 'variant'
59      * will be non-null only if a non-empty variant was parsed.
60      *
61      * 'sawSource' is true if there was an explicit source in the
62      * parsed id.  If there was no explicit source, then an implied
63      * source of ANY is returned and 'sawSource' is set to false.
64      *
65      * 'filter' is the parsed filter pattern, or null if there was no
66      * filter.
67      */
68     class Specs : public UMemory {
69     public:
70         UnicodeString source; // not null
71         UnicodeString target; // not null
72         UnicodeString variant; // may be null
73         UnicodeString filter; // may be null
74         UBool sawSource; // true if the parsed id had an explicit source
75         Specs(const UnicodeString& s, const UnicodeString& t,
76               const UnicodeString& v, UBool sawS,
77               const UnicodeString& f);
78 
79     private:
80 
81         Specs(const Specs &other); // forbid copying of this class
82         Specs &operator=(const Specs &other); // forbid copying of this class
83     };
84 
85     /**
86      * A structure containing the canonicalized data of a filtered ID,
87      * that is, a basic ID optionally with a filter.
88      *
89      * 'canonID' is always non-null.  It may be the empty string "".
90      * It is the id that should be assigned to the created
91      * transliterator.  It _cannot_ be instantiated directly.
92      *
93      * 'basicID' is always non-null and non-empty.  It is always of
94      * the form S-T or S-T/V.  It is designed to be fed to low-level
95      * instantiation code that only understands these two formats.
96      *
97      * 'filter' may be null, if there is none, or non-null and
98      * non-empty.
99      */
100     class SingleID : public UMemory {
101     public:
102         UnicodeString canonID; // id to assign to the created transliterator
103         UnicodeString basicID; // of the form S-T or S-T/V
104         UnicodeString filter; // filter pattern, if there is one
105         SingleID(const UnicodeString& c, const UnicodeString& b,
106                  const UnicodeString& f);
107         SingleID(const UnicodeString& c, const UnicodeString& b);
108         Transliterator* createInstance();
109 
110     private:
111 
112         SingleID(const SingleID &other); // forbid copying of this class
113         SingleID &operator=(const SingleID &other); // forbid copying of this class
114     };
115 
116     /**
117      * Parse a filter ID, that is, an ID of the general form
118      * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
119      * @param id the id to be parsed
120      * @param pos INPUT-OUTPUT parameter.  On input, the position of
121      * the first character to parse.  On output, the position after
122      * the last character parsed.
123      * @return a SingleID object or null if the parse fails
124      */
125     static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
126 
127     /**
128      * Parse a single ID, that is, an ID of the general form
129      * "[f1] s1-t1/v1 ([f2] s2-t2/v2)", with the parenthesized element
130      * optional, the filters optional, and the variants optional.
131      * @param id the id to be parsed
132      * @param pos INPUT-OUTPUT parameter.  On input, the position of the first
133      * character to parse.  On output, the position after the last one parsed.
134      * @param dir the direction.  If the direction is REVERSE then the
135      * SingleID is constructed for the reverse direction.
136      * @param status UErrorCode reference -- presumably set on failure; confirm
137      * @return a SingleID object or null
138      */
139     static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
140                                   int32_t dir, UErrorCode& status);
141 
142     /**
143      * Parse a global filter of the form "[f]" or "([f])", depending
144      * on 'withParens'.
145      * @param id the pattern to parse
146      * @param pos INPUT-OUTPUT parameter.  On input, the position of
147      * the first character to parse.  On output, the position after
148      * the last character parsed.
149      * @param dir the direction.
150      * @param withParens INPUT-OUTPUT parameter.  On entry, if
151      * withParens is 0, then parens are disallowed.  If it is 1,
152      * then parens are required.  If it is -1, then parens are
153      * optional, and the return result will be set to 0 or 1.
154      * @param canonID OUTPUT parameter.  The pattern for the filter
155      * added to the canonID, either at the end, if dir is FORWARD, or
156      * at the start, if dir is REVERSE.  The pattern will be enclosed
157      * in parentheses if appropriate, and will be suffixed with an
158      * ID_DELIM character.  May be null.
159      * @return a UnicodeSet object or null.  A non-null results
160      * indicates a successful parse, regardless of whether the filter
161      * applies to the given direction.  The caller should discard it
162      * if withParens != (dir == REVERSE).
163      */
164     static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
165                                          int32_t dir,
166                                          int32_t& withParens,
167                                          UnicodeString* canonID);
168 
169     /**
170      * Parse a compound ID, consisting of an optional forward global
171      * filter, a separator, one or more single IDs delimited by
172      * separators, and an optional reverse global filter.  The
173      * separator is a semicolon.  The global filters are UnicodeSet
174      * patterns.  The reverse global filter must be enclosed in
175      * parentheses.
176      * @param id the pattern to parse
177      * @param dir the direction.
178      * @param canonID OUTPUT parameter that receives the canonical ID,
179      * consisting of canonical IDs for all elements, as returned by
180      * parseSingleID(), separated by semicolons.  Previous contents
181      * are discarded.
182      * @param list OUTPUT parameter that receives a list of SingleID
183      * objects representing the parsed IDs.  Previous contents are
184      * discarded.
185      * @param globalFilter OUTPUT parameter that receives a pointer to
186      * a newly created global filter for this ID in this direction, or
187      * null if there is none.
188      * @return true if the parse succeeds, that is, if the entire
189      * id is consumed without syntax error.
190      */
191     static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
192                                  UnicodeString& canonID,
193                                  UVector& list,
194                                  UnicodeSet*& globalFilter);
195 
196     /**
197      * Convert the elements of the 'list' vector, which are SingleID
198      * objects, into actual Transliterator objects.  In the course of
199      * this, some (or all) entries may be removed.  If all entries
200      * are removed, the Null transliterator will be added.
201      *
202      * Delete entries with empty basicIDs; these are generated by
203      * elements like "(A)" in the forward direction, or "A()" in
204      * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
205      * SingleID entries to actual transliterators.
206      *
207      * @param list vector of SingleID objects.  On exit, vector
208      * of one or more Transliterators.
209      * @param ec Output param to receive a success or an error code.
210      * NOTE: this function is declared void.  An earlier version of
211      * this comment described a returned insertIndex; no such value
212      * exists in this signature.
213      */
214     static void instantiateList(UVector& list,
215                                 UErrorCode& ec);
216 
217     /**
218      * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
219      * S-T/V, or S/V-T.  If the source is missing, return a source of
220      * ANY.  Results are delivered through the reference parameters; the
221      * function itself is declared void.  The target may be empty if the id
222      * is not well-formed.  The variant may be empty.
223      * @param id the id string, in any of several forms
224      * @param source          OUTPUT parameter that receives the source.
225      * @param target          OUTPUT parameter that receives the target.
226      * @param variant         OUTPUT parameter that receives the variant.
227      * @param isSourcePresent OUTPUT parameter.  If TRUE then the source is
228      *                        present.  If the source is not present, ANY
229      *                        will be given as the source, and
230      *                        isSourcePresent is presumably set to FALSE
231      *                        (the original comment said "null"; TODO
232      *                        confirm against the implementation).
233      */
234     static void IDtoSTV(const UnicodeString& id,
235                         UnicodeString& source,
236                         UnicodeString& target,
237                         UnicodeString& variant,
238                         UBool& isSourcePresent);
239 
240     /**
241      * Given source, target, and variant strings, concatenate them into a
242      * full ID.  If the source is empty, then "Any" will be used for the
243      * source, so the ID will always be of the form s-t/v or s-t.
244      */
245     static void STVtoID(const UnicodeString& source,
246                         const UnicodeString& target,
247                         const UnicodeString& variant,
248                         UnicodeString& id);
249 
250     /**
251      * Register two targets as being inverses of one another.  For
252      * example, calling registerSpecialInverse("NFC", "NFD", true, status) causes
253      * Transliterator to form the following inverse relationships:
254      *
255      * <pre>NFC => NFD
256      * Any-NFC => Any-NFD
257      * NFD => NFC
258      * Any-NFD => Any-NFC</pre>
259      *
260      * (Without the special inverse registration, the inverse of NFC
261      * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
262      * that the presence or absence of "Any-" is preserved.
263      *
264      * <p>The relationship is symmetrical; registering (a, b) is
265      * equivalent to registering (b, a).
266      *
267      * <p>The relevant IDs must still be registered separately as
268      * factories or classes.
269      *
270      * <p>Only the targets are specified.  Special inverses always have the
271      * form Any-Target1 <=> Any-Target2.  The target should have canonical
272      * casing (the casing desired to be produced when an inverse is formed)
273      * and should contain no whitespace or other extraneous characters.
274      *
275      * @param target the target against which to register the inverse
276      * @param inverseTarget the inverse of target, that is
277      * Any-target.getInverse() => Any-inverseTarget
278      * @param bidirectional if true, register the reverse relation
279      * as well, that is, Any-inverseTarget.getInverse() => Any-target
280      * @param status UErrorCode reference -- presumably set on failure; confirm
281      */
282     static void registerSpecialInverse(const UnicodeString& target,
283                                        const UnicodeString& inverseTarget,
284                                        UBool bidirectional,
285                                        UErrorCode &status);
286 
287     /**
288      * Free static memory.
289      */
290     static void cleanup();
291 
292  private:
293     //----------------------------------------------------------------
294     // Private implementation
295     //----------------------------------------------------------------
296 
297     // forbid instantiation
298     TransliteratorIDParser();
299 
300     /**
301      * Parse an ID into component pieces.  Take IDs of the form T,
302      * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
303      * source of ANY.
304      * @param id the id string, in any of several forms
305      * @param pos INPUT-OUTPUT parameter.  On input, pos is the
306      * offset of the first character to parse in id.  On output,
307      * pos is the offset after the last parsed character.  If the
308      * parse failed, pos will be unchanged.
309      * @param allowFilter if true, a UnicodeSet pattern is allowed
310      * at any location between specs or delimiters, and is returned
311      * in the 'filter' field of the Specs object.
312      * @return a Specs object, or null if the parse failed.  If
313      * neither source nor target was seen in the parsed id, then the
314      * parse fails.  If allowFilter is true, then the parsed filter
315      * pattern is returned in the Specs object, otherwise the returned
316      * filter reference is null.  If the parse fails for any reason
317      * null is returned.
318      */
319     static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
320                                 UBool allowFilter);
321 
322     /**
323      * Given a Specs object, convert it to a SingleID object.  The
324      * Specs object is a more unprocessed parse result.  The SingleID
325      * object contains information about canonical and basic IDs.
326      * @param specs the given Specs object.
327      * @param dir   either FORWARD or REVERSE.
328      * @return a SingleID; never returns null.  Returned object always
329      * has 'filter' field of null.
330      */
331     static SingleID* specsToID(const Specs* specs, int32_t dir);
332 
333     /**
334      * Given a Specs object, return a SingleID representing the special
335      * inverse of that ID.  If there is no special inverse then return null.
336      * @param specs the given Specs.
337      * @param status UErrorCode reference -- presumably set on failure; confirm
338      * @return a SingleID or null.  Returned object always has
339      * 'filter' field of null.
340      */
341     static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
342 
343     /**
344      * Glue method to get around access problems in C++.
345      * @param id the id string for the transliterator, in any of several forms
346      * @param canonID the given canonical ID
347      */
348     static Transliterator* createBasicInstance(const UnicodeString& id,
349                                                const UnicodeString* canonID);
350 
351     /**
352      * Initialize static memory; 'status' presumably reports failure (confirm).
353      */
354     static void U_CALLCONV init(UErrorCode &status);
355 
356     friend class SingleID;
357 };
358 
359 U_NAMESPACE_END
360 
361 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
362 
363 #endif
364