1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ******************************************************************************
5  *   Copyright (C) 1997-2014, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  ******************************************************************************
8  */
9 
10 /**
11  * \file
12  * \brief C++ API: Collation Element Iterator.
13  */
14 
15 /**
16 * File coleitr.h
17 *
18 * Created by: Helena Shih
19 *
20 * Modification History:
21 *
22 *  Date       Name        Description
23 *
24 *  8/18/97    helena      Added internal API documentation.
25 * 08/03/98    erm         Synched with 1.2 version CollationElementIterator.java
26 * 12/10/99    aliu        Ported Thai collation support from Java.
27 * 01/25/01    swquek      Modified into a C++ wrapper calling C APIs (ucoliter.h)
28 * 02/19/01    swquek      Removed CollationElementsIterator() since it is
29 *                         private constructor and no calls are made to it
30 * 2012-2014   markus      Rewritten in C++ again.
31 */
32 
33 #ifndef COLEITR_H
34 #define COLEITR_H
35 
36 #include "unicode/utypes.h"
37 
38 #if U_SHOW_CPLUSPLUS_API
39 
40 #if !UCONFIG_NO_COLLATION
41 
42 #include "unicode/unistr.h"
43 #include "unicode/uobject.h"
44 
// Forward declarations of types that are declared before U_NAMESPACE_BEGIN,
// i.e. outside the ICU C++ namespace. Only pointers to them appear in this
// header, so incomplete types are sufficient.

// Handle type that CollationElementIterator pointers are reinterpret_cast
// to and from (see fromUCollationElements() / toUCollationElements()).
struct UCollationElements;
// Table type passed by pointer to computeMaxExpansions() / getMaxExpansion().
struct UHashtable;
47 
48 U_NAMESPACE_BEGIN
49 
// Forward declarations of ICU types that this header only uses through
// pointers, references or friend declarations, so their full definitions
// are not needed here.

// Passed by pointer to CollationElementIterator::computeMaxExpansions().
struct CollationData;

// Source text type accepted by setText() and by one of the constructors.
class CharacterIterator;
// Type of the owned iter_ member.
class CollationIterator;
// Type of the aliased rbc_ member; also a friend of CollationElementIterator.
class RuleBasedCollator;
// Friend of CollationElementIterator.
class UCollationPCE;
// Type of the offsets_ member.
class UVector32;
57 
58 /**
59 * The CollationElementIterator class is used as an iterator to walk through
60 * each character of an international string. Use the iterator to return the
61 * ordering priority of the positioned character. The ordering priority of a
62 * character, which we refer to as a key, defines how a character is collated in
63 * the given collation object.
64 * For example, consider the following in Slovak and in traditional Spanish collation:
65 * <pre>
66 *        "ca" -> the first key is key('c') and second key is key('a').
67 *        "cha" -> the first key is key('ch') and second key is key('a').</pre>
68 * And in German phonebook collation,
69 * <pre> \htmlonly       "&#x00E6;b"-> the first key is key('a'), the second key is key('e'), and
70 *        the third key is key('b'). \endhtmlonly </pre>
71 * The key of a character is an integer composed of primary order(short),
72 * secondary order(char), and tertiary order(char). Java strictly defines the
73 * size and signedness of its primitive data types. Therefore, the static
74 * functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return
75 * int32_t to ensure the correctness of the key value.
76 * <p>Example of the iterator usage: (without error checking)
77 * <pre>
78 * \code
79 *   void CollationElementIterator_Example()
80 *   {
81 *       UnicodeString str = "This is a test";
82 *       UErrorCode success = U_ZERO_ERROR;
83 *       RuleBasedCollator* rbc =
84 *           (RuleBasedCollator*) RuleBasedCollator::createInstance(success);
85 *       CollationElementIterator* c =
86 *           rbc->createCollationElementIterator( str );
87 *       int32_t order = c->next(success);
88 *       c->reset();
89 *       order = c->previous(success);
90 *       delete c;
91 *       delete rbc;
92 *   }
93 * \endcode
94 * </pre>
95 * <p>
96 * The method next() returns the collation order of the next character based on
97 * the comparison level of the collator. The method previous() returns the
98 * collation order of the previous character based on the comparison level of
99 * the collator. The Collation Element Iterator moves only in one direction
100 * between calls to reset(), setOffset(), or setText(). That is, next()
101 * and previous() cannot be intermixed. Whenever previous() is to be called after
102 * next() or vice versa, reset(), setOffset() or setText() has to be called first
103 * to reset the status, shifting pointers to either the end or the start of
104 * the string (reset() or setText()), or the specified position (setOffset()).
105 * Hence at the next call of next() or previous(), the first or last collation order,
106 * or collation order at the specified position will be returned. If a change of
107 * direction is done without one of these calls, the result is undefined.
108 * <p>
109 * The result of a forward iterate (next()) and reversed result of the backward
110 * iterate (previous()) on the same string are equivalent, if collation orders
111 * with the value 0 are ignored.
112 * Collation orders are computed based on the comparison level of the collator.  A collation order
113 * consists of primary order, secondary order and tertiary order.  The data
114 * type of the collation order is <strong>int32_t</strong>.
115 *
116 * Note, CollationElementIterator should not be subclassed.
117 * @see     Collator
118 * @see     RuleBasedCollator
119 * @version 1.8 Jan 16 2001
120 */
121 class U_I18N_API CollationElementIterator U_FINAL : public UObject {
122 public:
123 
124     // CollationElementIterator public data member ------------------------------
125 
126     enum {
127         /**
128          * NULLORDER indicates that an error has occured while processing
129          * @stable ICU 2.0
130          */
131         NULLORDER = (int32_t)0xffffffff
132     };
133 
134     // CollationElementIterator public constructor/destructor -------------------
135 
136     /**
137     * Copy constructor.
138     *
139     * @param other    the object to be copied from
140     * @stable ICU 2.0
141     */
142     CollationElementIterator(const CollationElementIterator& other);
143 
144     /**
145     * Destructor
146     * @stable ICU 2.0
147     */
148     virtual ~CollationElementIterator();
149 
150     // CollationElementIterator public methods ----------------------------------
151 
152     /**
153     * Returns true if "other" is the same as "this"
154     *
155     * @param other    the object to be compared
156     * @return         true if "other" is the same as "this"
157     * @stable ICU 2.0
158     */
159     UBool operator==(const CollationElementIterator& other) const;
160 
161     /**
162     * Returns true if "other" is not the same as "this".
163     *
164     * @param other    the object to be compared
165     * @return         true if "other" is not the same as "this"
166     * @stable ICU 2.0
167     */
168     UBool operator!=(const CollationElementIterator& other) const;
169 
170     /**
171     * Resets the cursor to the beginning of the string.
172     * @stable ICU 2.0
173     */
174     void reset(void);
175 
176     /**
177     * Gets the ordering priority of the next character in the string.
178     * @param status the error code status.
179     * @return the next character's ordering. otherwise returns NULLORDER if an
180     *         error has occured or if the end of string has been reached
181     * @stable ICU 2.0
182     */
183     int32_t next(UErrorCode& status);
184 
185     /**
186     * Get the ordering priority of the previous collation element in the string.
187     * @param status the error code status.
188     * @return the previous element's ordering. otherwise returns NULLORDER if an
189     *         error has occured or if the start of string has been reached
190     * @stable ICU 2.0
191     */
192     int32_t previous(UErrorCode& status);
193 
194     /**
195     * Gets the primary order of a collation order.
196     * @param order the collation order
197     * @return the primary order of a collation order.
198     * @stable ICU 2.0
199     */
200     static inline int32_t primaryOrder(int32_t order);
201 
202     /**
203     * Gets the secondary order of a collation order.
204     * @param order the collation order
205     * @return the secondary order of a collation order.
206     * @stable ICU 2.0
207     */
208     static inline int32_t secondaryOrder(int32_t order);
209 
210     /**
211     * Gets the tertiary order of a collation order.
212     * @param order the collation order
213     * @return the tertiary order of a collation order.
214     * @stable ICU 2.0
215     */
216     static inline int32_t tertiaryOrder(int32_t order);
217 
218     /**
219     * Return the maximum length of any expansion sequences that end with the
220     * specified comparison order.
221     * @param order a collation order returned by previous or next.
222     * @return maximum size of the expansion sequences ending with the collation
223     *         element or 1 if collation element does not occur at the end of any
224     *         expansion sequence
225     * @stable ICU 2.0
226     */
227     int32_t getMaxExpansion(int32_t order) const;
228 
229     /**
230     * Gets the comparison order in the desired strength. Ignore the other
231     * differences.
232     * @param order The order value
233     * @stable ICU 2.0
234     */
235     int32_t strengthOrder(int32_t order) const;
236 
237     /**
238     * Sets the source string.
239     * @param str the source string.
240     * @param status the error code status.
241     * @stable ICU 2.0
242     */
243     void setText(const UnicodeString& str, UErrorCode& status);
244 
245     /**
246     * Sets the source string.
247     * @param str the source character iterator.
248     * @param status the error code status.
249     * @stable ICU 2.0
250     */
251     void setText(CharacterIterator& str, UErrorCode& status);
252 
253     /**
254     * Checks if a comparison order is ignorable.
255     * @param order the collation order.
256     * @return true if a character is ignorable, false otherwise.
257     * @stable ICU 2.0
258     */
259     static inline UBool isIgnorable(int32_t order);
260 
261     /**
262     * Gets the offset of the currently processed character in the source string.
263     * @return the offset of the character.
264     * @stable ICU 2.0
265     */
266     int32_t getOffset(void) const;
267 
268     /**
269     * Sets the offset of the currently processed character in the source string.
270     * @param newOffset the new offset.
271     * @param status the error code status.
272     * @return the offset of the character.
273     * @stable ICU 2.0
274     */
275     void setOffset(int32_t newOffset, UErrorCode& status);
276 
277     /**
278     * ICU "poor man's RTTI", returns a UClassID for the actual class.
279     *
280     * @stable ICU 2.2
281     */
282     virtual UClassID getDynamicClassID() const;
283 
284     /**
285     * ICU "poor man's RTTI", returns a UClassID for this class.
286     *
287     * @stable ICU 2.2
288     */
289     static UClassID U_EXPORT2 getStaticClassID();
290 
291 #ifndef U_HIDE_INTERNAL_API
292     /** @internal */
fromUCollationElements(UCollationElements * uc)293     static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) {
294         return reinterpret_cast<CollationElementIterator *>(uc);
295     }
296     /** @internal */
fromUCollationElements(const UCollationElements * uc)297     static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) {
298         return reinterpret_cast<const CollationElementIterator *>(uc);
299     }
300     /** @internal */
toUCollationElements()301     inline UCollationElements *toUCollationElements() {
302         return reinterpret_cast<UCollationElements *>(this);
303     }
304     /** @internal */
toUCollationElements()305     inline const UCollationElements *toUCollationElements() const {
306         return reinterpret_cast<const UCollationElements *>(this);
307     }
308 #endif  // U_HIDE_INTERNAL_API
309 
310 private:
311     friend class RuleBasedCollator;
312     friend class UCollationPCE;
313 
314     /**
315     * CollationElementIterator constructor. This takes the source string and the
316     * collation object. The cursor will walk thru the source string based on the
317     * predefined collation rules. If the source string is empty, NULLORDER will
318     * be returned on the calls to next().
319     * @param sourceText    the source string.
320     * @param order         the collation object.
321     * @param status        the error code status.
322     */
323     CollationElementIterator(const UnicodeString& sourceText,
324         const RuleBasedCollator* order, UErrorCode& status);
325     // Note: The constructors should take settings & tailoring, not a collator,
326     // to avoid circular dependencies.
327     // However, for operator==() we would need to be able to compare tailoring data for equality
328     // without making CollationData or CollationTailoring depend on TailoredSet.
329     // (See the implementation of RuleBasedCollator::operator==().)
330     // That might require creating an intermediate class that would be used
331     // by both CollationElementIterator and RuleBasedCollator
332     // but only contain the part of RBC== related to data and rules.
333 
334     /**
335     * CollationElementIterator constructor. This takes the source string and the
336     * collation object.  The cursor will walk thru the source string based on the
337     * predefined collation rules.  If the source string is empty, NULLORDER will
338     * be returned on the calls to next().
339     * @param sourceText    the source string.
340     * @param order         the collation object.
341     * @param status        the error code status.
342     */
343     CollationElementIterator(const CharacterIterator& sourceText,
344         const RuleBasedCollator* order, UErrorCode& status);
345 
346     /**
347     * Assignment operator
348     *
349     * @param other    the object to be copied
350     */
351     const CollationElementIterator&
352         operator=(const CollationElementIterator& other);
353 
354     CollationElementIterator(); // default constructor not implemented
355 
356     /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
normalizeDir()357     inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; }
358 
359     static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);
360 
361     static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);
362 
363     // CollationElementIterator private data members ----------------------------
364 
365     CollationIterator *iter_;  // owned
366     const RuleBasedCollator *rbc_;  // aliased
367     uint32_t otherHalf_;
368     /**
369      * <0: backwards; 0: just after reset() (previous() begins from end);
370      * 1: just after setOffset(); >1: forward
371      */
372     int8_t dir_;
373     /**
374      * Stores offsets from expansions and from unsafe-backwards iteration,
375      * so that getOffset() returns intermediate offsets for the CEs
376      * that are consistent with forward iteration.
377      */
378     UVector32 *offsets_;
379 
380     UnicodeString string_;
381 };
382 
383 // CollationElementIterator inline method definitions --------------------------
384 
primaryOrder(int32_t order)385 inline int32_t CollationElementIterator::primaryOrder(int32_t order)
386 {
387     return (order >> 16) & 0xffff;
388 }
389 
secondaryOrder(int32_t order)390 inline int32_t CollationElementIterator::secondaryOrder(int32_t order)
391 {
392     return (order >> 8) & 0xff;
393 }
394 
tertiaryOrder(int32_t order)395 inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)
396 {
397     return order & 0xff;
398 }
399 
isIgnorable(int32_t order)400 inline UBool CollationElementIterator::isIgnorable(int32_t order)
401 {
402     return (order & 0xffff0000) == 0;
403 }
404 
405 U_NAMESPACE_END
406 
407 #endif /* #if !UCONFIG_NO_COLLATION */
408 
409 #endif /* U_SHOW_CPLUSPLUS_API */
410 
411 #endif
412