1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ******************************************************************************
5  *   Copyright (C) 1997-2014, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  ******************************************************************************
8  */
9 
10 /**
11  * \file
12  * \brief C++ API: Collation Element Iterator.
13  */
14 
15 /**
16 * File coleitr.h
17 *
18 * Created by: Helena Shih
19 *
20 * Modification History:
21 *
22 *  Date       Name        Description
23 *
24 *  8/18/97    helena      Added internal API documentation.
25 * 08/03/98    erm         Synched with 1.2 version CollationElementIterator.java
26 * 12/10/99    aliu        Ported Thai collation support from Java.
27 * 01/25/01    swquek      Modified into a C++ wrapper calling C APIs (ucoliter.h)
28 * 02/19/01    swquek      Removed CollationElementsIterator() since it is
29 *                         private constructor and no calls are made to it
30 * 2012-2014   markus      Rewritten in C++ again.
31 */
32 
33 #ifndef COLEITR_H
34 #define COLEITR_H
35 
36 #include "unicode/utypes.h"
37 
38 #if !UCONFIG_NO_COLLATION
39 
40 #include "unicode/unistr.h"
41 #include "unicode/uobject.h"
42 
43 struct UCollationElements;
44 struct UHashtable;
45 
46 U_NAMESPACE_BEGIN
47 
48 struct CollationData;
49 
50 class CollationIterator;
51 class RuleBasedCollator;
52 class UCollationPCE;
53 class UVector32;
54 
55 /**
56 * The CollationElementIterator class is used as an iterator to walk through
57 * each character of an international string. Use the iterator to return the
58 * ordering priority of the positioned character. The ordering priority of a
59 * character, which we refer to as a key, defines how a character is collated in
60 * the given collation object.
61 * For example, consider the following in Slovak and in traditional Spanish collation:
62 * <pre>
63 *        "ca" -> the first key is key('c') and second key is key('a').
64 *        "cha" -> the first key is key('ch') and second key is key('a').</pre>
65 * And in German phonebook collation,
66 * <pre> \htmlonly       "&#x00E6;b"-> the first key is key('a'), the second key is key('e'), and
67 *        the third key is key('b'). \endhtmlonly </pre>
68 * The key of a character is an integer composed of primary order(short),
69 * secondary order(char), and tertiary order(char). Java strictly defines the
70 * size and signedness of its primitive data types. Therefore, the static
71 * functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return
72 * int32_t to ensure the correctness of the key value.
73 * <p>Example of the iterator usage: (without error checking)
74 * <pre>
75 * \code
76 *   void CollationElementIterator_Example()
77 *   {
78 *       UnicodeString str = "This is a test";
79 *       UErrorCode success = U_ZERO_ERROR;
80 *       RuleBasedCollator* rbc =
81 *           (RuleBasedCollator*) RuleBasedCollator::createInstance(success);
82 *       CollationElementIterator* c =
83 *           rbc->createCollationElementIterator( str );
84 *       int32_t order = c->next(success);
85 *       c->reset();
86 *       order = c->previous(success);
87 *       delete c;
88 *       delete rbc;
89 *   }
90 * \endcode
91 * </pre>
92 * <p>
93 * The method next() returns the collation order of the next character based on
94 * the comparison level of the collator. The method previous() returns the
95 * collation order of the previous character based on the comparison level of
96 * the collator. The Collation Element Iterator moves only in one direction
97 * between calls to reset(), setOffset(), or setText(). That is, next()
98 * and previous() cannot be intermixed. Whenever previous() is to be called after
99 * next() or vice versa, reset(), setOffset() or setText() has to be called first
100 * to reset the status, shifting pointers to either the end or the start of
101 * the string (reset() or setText()), or the specified position (setOffset()).
102 * Hence at the next call of next() or previous(), the first or last collation order,
103 * or collation order at the specified position will be returned. If a change of
104 * direction is done without one of these calls, the result is undefined.
105 * <p>
106 * The result of a forward iterate (next()) and reversed result of the backward
107 * iterate (previous()) on the same string are equivalent, if collation orders
108 * with the value 0 are ignored.
109 * Character based on the comparison level of the collator.  A collation order
110 * consists of primary order, secondary order and tertiary order.  The data
111 * type of the collation order is <strong>int32_t</strong>.
112 *
113 * Note, CollationElementIterator should not be subclassed.
114 * @see     Collator
115 * @see     RuleBasedCollator
116 * @version 1.8 Jan 16 2001
117 */
118 class U_I18N_API CollationElementIterator U_FINAL : public UObject {
119 public:
120 
121     // CollationElementIterator public data member ------------------------------
122 
123     enum {
124         /**
125          * NULLORDER indicates that an error has occurred while processing.
             * next() and previous() also return it when the end or the start
             * of the string has been reached.
126          * @stable ICU 2.0
127          */
128         NULLORDER = (int32_t)0xffffffff
129     };
130 
131     // CollationElementIterator public constructor/destructor -------------------
132 
133     /**
134     * Copy constructor.
135     *
136     * @param other    the object to be copied from
137     * @stable ICU 2.0
138     */
139     CollationElementIterator(const CollationElementIterator& other);
140 
141     /**
142     * Destructor
143     * @stable ICU 2.0
144     */
145     virtual ~CollationElementIterator();
146 
147     // CollationElementIterator public methods ----------------------------------
148 
149     /**
150     * Returns true if "other" is the same as "this"
151     *
152     * @param other    the object to be compared
153     * @return         true if "other" is the same as "this"
154     * @stable ICU 2.0
155     */
156     UBool operator==(const CollationElementIterator& other) const;
157 
158     /**
159     * Returns true if "other" is not the same as "this".
160     *
161     * @param other    the object to be compared
162     * @return         true if "other" is not the same as "this"
163     * @stable ICU 2.0
164     */
165     UBool operator!=(const CollationElementIterator& other) const;
166 
167     /**
168     * Resets the cursor to the beginning of the string.
        * A previous() call made directly after reset() begins from the end
        * of the string.
169     * @stable ICU 2.0
170     */
171     void reset(void);
172 
173     /**
174     * Gets the ordering priority of the next character in the string.
175     * @param status the error code status.
176     * @return the next character's ordering, or NULLORDER if an
177     *         error has occurred or if the end of string has been reached
178     * @stable ICU 2.0
179     */
180     int32_t next(UErrorCode& status);
181 
182     /**
183     * Get the ordering priority of the previous collation element in the string.
184     * @param status the error code status.
185     * @return the previous element's ordering, or NULLORDER if an
186     *         error has occurred or if the start of string has been reached
187     * @stable ICU 2.0
188     */
189     int32_t previous(UErrorCode& status);
190 
191     /**
192     * Gets the primary order of a collation order.
193     * @param order the collation order
194     * @return the primary order of a collation order (its upper 16 bits).
195     * @stable ICU 2.0
196     */
197     static inline int32_t primaryOrder(int32_t order);
198 
199     /**
200     * Gets the secondary order of a collation order.
201     * @param order the collation order
202     * @return the secondary order of a collation order (bits 8..15).
203     * @stable ICU 2.0
204     */
205     static inline int32_t secondaryOrder(int32_t order);
206 
207     /**
208     * Gets the tertiary order of a collation order.
209     * @param order the collation order
210     * @return the tertiary order of a collation order (its low 8 bits).
211     * @stable ICU 2.0
212     */
213     static inline int32_t tertiaryOrder(int32_t order);
214 
215     /**
216     * Return the maximum length of any expansion sequences that end with the
217     * specified comparison order.
218     * @param order a collation order returned by previous or next.
219     * @return maximum size of the expansion sequences ending with the collation
220     *         element or 1 if collation element does not occur at the end of any
221     *         expansion sequence
222     * @stable ICU 2.0
223     */
224     int32_t getMaxExpansion(int32_t order) const;
225 
226     /**
227     * Gets the comparison order in the desired strength. Ignore the other
228     * differences.
229     * @param order The order value
        * @return the comparison order in the desired strength
230     * @stable ICU 2.0
231     */
232     int32_t strengthOrder(int32_t order) const;
233 
234     /**
235     * Sets the source string.
236     * @param str the source string.
237     * @param status the error code status.
238     * @stable ICU 2.0
239     */
240     void setText(const UnicodeString& str, UErrorCode& status);
241 
242     /**
243     * Sets the source text from a character iterator.
244     * @param str the source character iterator.
245     * @param status the error code status.
246     * @stable ICU 2.0
247     */
248     void setText(CharacterIterator& str, UErrorCode& status);
249 
250     /**
251     * Checks if a comparison order is ignorable.
        * An order is treated as ignorable when its upper 16 bits, i.e. the
        * part returned by primaryOrder(), are all zero.
252     * @param order the collation order.
253     * @return true if the order is ignorable, false otherwise.
254     * @stable ICU 2.0
255     */
256     static inline UBool isIgnorable(int32_t order);
257 
258     /**
259     * Gets the offset of the currently processed character in the source string.
260     * @return the offset of the character.
261     * @stable ICU 2.0
262     */
263     int32_t getOffset(void) const;
264 
265     /**
266     * Sets the offset of the currently processed character in the source string.
        * This method does not return a value.
267     * @param newOffset the new offset.
268     * @param status the error code status.
270     * @stable ICU 2.0
271     */
272     void setOffset(int32_t newOffset, UErrorCode& status);
273 
274     /**
275     * ICU "poor man's RTTI", returns a UClassID for the actual class.
276     *
277     * @stable ICU 2.2
278     */
279     virtual UClassID getDynamicClassID() const;
280 
281     /**
282     * ICU "poor man's RTTI", returns a UClassID for this class.
283     *
284     * @stable ICU 2.2
285     */
286     static UClassID U_EXPORT2 getStaticClassID();
287 
288 #ifndef U_HIDE_INTERNAL_API
289     /** Views a UCollationElements pointer as a CollationElementIterator pointer (reinterpret_cast only, no check). @internal */
fromUCollationElements(UCollationElements * uc)290     static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) {
291         return reinterpret_cast<CollationElementIterator *>(uc);
292     }
293     /** Const overload: views a const UCollationElements pointer as a const CollationElementIterator pointer. @internal */
fromUCollationElements(const UCollationElements * uc)294     static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) {
295         return reinterpret_cast<const CollationElementIterator *>(uc);
296     }
297     /** Views this object as a UCollationElements pointer (reinterpret_cast only). @internal */
toUCollationElements()298     inline UCollationElements *toUCollationElements() {
299         return reinterpret_cast<UCollationElements *>(this);
300     }
301     /** Const overload: views this object as a const UCollationElements pointer. @internal */
toUCollationElements()302     inline const UCollationElements *toUCollationElements() const {
303         return reinterpret_cast<const UCollationElements *>(this);
304     }
305 #endif  // U_HIDE_INTERNAL_API
306 
307 private:
308     friend class RuleBasedCollator;
309     friend class UCollationPCE;
310 
311     /**
312     * CollationElementIterator constructor. This takes the source string and the
313     * collation object. The cursor will walk through the source string based on the
314     * predefined collation rules. If the source string is empty, NULLORDER will
315     * be returned on the calls to next().
        * The constructor is private; the friend classes declared above can call it.
316     * @param sourceText    the source string.
317     * @param order         the collation object.
318     * @param status        the error code status.
319     */
320     CollationElementIterator(const UnicodeString& sourceText,
321         const RuleBasedCollator* order, UErrorCode& status);
322     // Note: The constructors should take settings & tailoring, not a collator,
323     // to avoid circular dependencies.
324     // However, for operator==() we would need to be able to compare tailoring data for equality
325     // without making CollationData or CollationTailoring depend on TailoredSet.
326     // (See the implementation of RuleBasedCollator::operator==().)
327     // That might require creating an intermediate class that would be used
328     // by both CollationElementIterator and RuleBasedCollator
329     // but only contain the part of RBC== related to data and rules.
330 
331     /**
332     * CollationElementIterator constructor. This takes the source text as a
333     * character iterator and the collation object.  The cursor will walk through
334     * the source text based on the predefined collation rules.  If the source
335     * text is empty, NULLORDER will be returned on the calls to next().
336     * @param sourceText    the source character iterator.
337     * @param order         the collation object.
338     * @param status        the error code status.
339     */
340     CollationElementIterator(const CharacterIterator& sourceText,
341         const RuleBasedCollator* order, UErrorCode& status);
342 
343     /**
344     * Assignment operator. Declared private, so only members and friends
        * can assign.
345     *
346     * @param other    the object to be copied
        * @return a const reference to a CollationElementIterator
347     */
348     const CollationElementIterator&
349         operator=(const CollationElementIterator& other);
350 
351     CollationElementIterator(); // default constructor not implemented
352 
353     /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()); any other dir_ value is returned unchanged. */
normalizeDir()354     inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; }
355 
356     static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);
357 
358     static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);
359 
360     // CollationElementIterator private data members ----------------------------
361 
362     CollationIterator *iter_;  // owned
363     const RuleBasedCollator *rbc_;  // aliased
        // NOTE(review): presumably the pending second half of a collation
        // element that is returned in two parts -- confirm in the implementation.
364     uint32_t otherHalf_;
365     /**
366      * <0: backwards; 0: just after reset() (previous() begins from end);
367      * 1: just after setOffset(); >1: forward
368      */
369     int8_t dir_;
370     /**
371      * Stores offsets from expansions and from unsafe-backwards iteration,
372      * so that getOffset() returns intermediate offsets for the CEs
373      * that are consistent with forward iteration.
374      */
375     UVector32 *offsets_;
376 
        // NOTE(review): presumably the source text being iterated over --
        // confirm in the implementation.
377     UnicodeString string_;
378 };
379 
380 // CollationElementIterator inline method definitions --------------------------
381 
primaryOrder(int32_t order)382 inline int32_t CollationElementIterator::primaryOrder(int32_t order)
383 {
384     return (order >> 16) & 0xffff;
385 }
386 
secondaryOrder(int32_t order)387 inline int32_t CollationElementIterator::secondaryOrder(int32_t order)
388 {
389     return (order >> 8) & 0xff;
390 }
391 
tertiaryOrder(int32_t order)392 inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)
393 {
394     return order & 0xff;
395 }
396 
isIgnorable(int32_t order)397 inline UBool CollationElementIterator::isIgnorable(int32_t order)
398 {
399     return (order & 0xffff0000) == 0;
400 }
401 
402 U_NAMESPACE_END
403 
404 #endif /* #if !UCONFIG_NO_COLLATION */
405 
406 #endif
407