/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* utf16collationiterator.h
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/
11 
12 #ifndef __UTF16COLLATIONITERATOR_H__
13 #define __UTF16COLLATIONITERATOR_H__
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_COLLATION
18 
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationiterator.h"
23 #include "normalizer2impl.h"
24 
25 U_NAMESPACE_BEGIN
26 
27 /**
28  * UTF-16 collation element and character iterator.
29  * Handles normalized UTF-16 text inline, with length or NUL-terminated.
30  * Unnormalized text is handled by a subclass.
31  */
32 class U_I18N_API UTF16CollationIterator : public CollationIterator {
33 public:
UTF16CollationIterator(const CollationData * d,UBool numeric,const UChar * s,const UChar * p,const UChar * lim)34     UTF16CollationIterator(const CollationData *d, UBool numeric,
35                            const UChar *s, const UChar *p, const UChar *lim)
36             : CollationIterator(d, numeric),
37               start(s), pos(p), limit(lim) {}
38 
39     UTF16CollationIterator(const UTF16CollationIterator &other, const UChar *newText);
40 
41     virtual ~UTF16CollationIterator();
42 
43     virtual UBool operator==(const CollationIterator &other) const;
44 
45     virtual void resetToOffset(int32_t newOffset);
46 
47     virtual int32_t getOffset() const;
48 
setText(const UChar * s,const UChar * lim)49     void setText(const UChar *s, const UChar *lim) {
50         reset();
51         start = pos = s;
52         limit = lim;
53     }
54 
55     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
56 
57     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
58 
59 protected:
60     // Copy constructor only for subclasses which set the pointers.
UTF16CollationIterator(const UTF16CollationIterator & other)61     UTF16CollationIterator(const UTF16CollationIterator &other)
62             : CollationIterator(other),
63               start(NULL), pos(NULL), limit(NULL) {}
64 
65     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
66 
67     virtual UChar handleGetTrailSurrogate();
68 
69     virtual UBool foundNULTerminator();
70 
71     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
72 
73     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
74 
75     // UTF-16 string pointers.
76     // limit can be NULL for NUL-terminated strings.
77     const UChar *start, *pos, *limit;
78 };
79 
80 /**
81  * Incrementally checks the input text for FCD and normalizes where necessary.
82  */
83 class U_I18N_API FCDUTF16CollationIterator : public UTF16CollationIterator {
84 public:
FCDUTF16CollationIterator(const CollationData * data,UBool numeric,const UChar * s,const UChar * p,const UChar * lim)85     FCDUTF16CollationIterator(const CollationData *data, UBool numeric,
86                               const UChar *s, const UChar *p, const UChar *lim)
87             : UTF16CollationIterator(data, numeric, s, p, lim),
88               rawStart(s), segmentStart(p), segmentLimit(NULL), rawLimit(lim),
89               nfcImpl(data->nfcImpl),
90               checkDir(1) {}
91 
92     FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other, const UChar *newText);
93 
94     virtual ~FCDUTF16CollationIterator();
95 
96     virtual UBool operator==(const CollationIterator &other) const;
97 
98     virtual void resetToOffset(int32_t newOffset);
99 
100     virtual int32_t getOffset() const;
101 
102     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
103 
104     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
105 
106 protected:
107     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
108 
109     virtual UBool foundNULTerminator();
110 
111     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
112 
113     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
114 
115 private:
116     /**
117      * Switches to forward checking if possible.
118      * To be called when checkDir < 0 || (checkDir == 0 && pos == limit).
119      * Returns with checkDir > 0 || (checkDir == 0 && pos != limit).
120      */
121     void switchToForward();
122 
123     /**
124      * Extend the FCD text segment forward or normalize around pos.
125      * To be called when checkDir > 0 && pos != limit.
126      * @return TRUE if success, checkDir == 0 and pos != limit
127      */
128     UBool nextSegment(UErrorCode &errorCode);
129 
130     /**
131      * Switches to backward checking.
132      * To be called when checkDir > 0 || (checkDir == 0 && pos == start).
133      * Returns with checkDir < 0 || (checkDir == 0 && pos != start).
134      */
135     void switchToBackward();
136 
137     /**
138      * Extend the FCD text segment backward or normalize around pos.
139      * To be called when checkDir < 0 && pos != start.
140      * @return TRUE if success, checkDir == 0 and pos != start
141      */
142     UBool previousSegment(UErrorCode &errorCode);
143 
144     UBool normalize(const UChar *from, const UChar *to, UErrorCode &errorCode);
145 
146     // Text pointers: The input text is [rawStart, rawLimit[
147     // where rawLimit can be NULL for NUL-terminated text.
148     //
149     // checkDir > 0:
150     //
151     // The input text [segmentStart..pos[ passes the FCD check.
152     // Moving forward checks incrementally.
153     // segmentLimit is undefined. limit == rawLimit.
154     //
155     // checkDir < 0:
156     // The input text [pos..segmentLimit[ passes the FCD check.
157     // Moving backward checks incrementally.
158     // segmentStart is undefined, start == rawStart.
159     //
160     // checkDir == 0:
161     //
162     // The input text [segmentStart..segmentLimit[ is being processed.
163     // These pointers are at FCD boundaries.
164     // Either this text segment already passes the FCD check
165     // and segmentStart==start<=pos<=limit==segmentLimit,
166     // or the current segment had to be normalized so that
167     // [segmentStart..segmentLimit[ turned into the normalized string,
168     // corresponding to normalized.getBuffer()==start<=pos<=limit==start+normalized.length().
169     const UChar *rawStart;
170     const UChar *segmentStart;
171     const UChar *segmentLimit;
172     // rawLimit==NULL for a NUL-terminated string.
173     const UChar *rawLimit;
174 
175     const Normalizer2Impl &nfcImpl;
176     UnicodeString normalized;
177     // Direction of incremental FCD check. See comments before rawStart.
178     int8_t checkDir;
179 };
180 
181 U_NAMESPACE_END
182 
183 #endif  // !UCONFIG_NO_COLLATION
184 #endif  // __UTF16COLLATIONITERATOR_H__
185