1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * utf8collationiterator.h
7 *
8 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
9 * created by: Markus W. Scherer
10 */
11 
12 #ifndef __UTF8COLLATIONITERATOR_H__
13 #define __UTF8COLLATIONITERATOR_H__
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_COLLATION
18 
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "normalizer2impl.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 /**
27  * UTF-8 collation element and character iterator.
28  * Handles normalized UTF-8 text inline, with length or NUL-terminated.
29  * Unnormalized text is handled by a subclass.
30  */
31 class U_I18N_API UTF8CollationIterator : public CollationIterator {
32 public:
UTF8CollationIterator(const CollationData * d,UBool numeric,const uint8_t * s,int32_t p,int32_t len)33     UTF8CollationIterator(const CollationData *d, UBool numeric,
34                           const uint8_t *s, int32_t p, int32_t len)
35             : CollationIterator(d, numeric),
36               u8(s), pos(p), length(len) {}
37 
38     virtual ~UTF8CollationIterator();
39 
40     virtual void resetToOffset(int32_t newOffset);
41 
42     virtual int32_t getOffset() const;
43 
44     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
45 
46     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
47 
48 protected:
49     /**
50      * For byte sequences that are illegal in UTF-8, an error value may be returned
51      * together with a bogus code point. The caller will ignore that code point.
52      *
53      * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
54      * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
55      *
56      * Valid lead surrogates are returned from inside a normalized text segment,
57      * where handleGetTrailSurrogate() will return the matching trail surrogate.
58      */
59     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
60 
61     virtual UBool foundNULTerminator();
62 
63     virtual UBool forbidSurrogateCodePoints() const;
64 
65     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
66 
67     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
68 
69     const uint8_t *u8;
70     int32_t pos;
71     int32_t length;  // <0 for NUL-terminated strings
72 };
73 
74 /**
75  * Incrementally checks the input text for FCD and normalizes where necessary.
76  */
77 class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
78 public:
FCDUTF8CollationIterator(const CollationData * data,UBool numeric,const uint8_t * s,int32_t p,int32_t len)79     FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
80                              const uint8_t *s, int32_t p, int32_t len)
81             : UTF8CollationIterator(data, numeric, s, p, len),
82               state(CHECK_FWD), start(p),
83               nfcImpl(data->nfcImpl) {}
84 
85     virtual ~FCDUTF8CollationIterator();
86 
87     virtual void resetToOffset(int32_t newOffset);
88 
89     virtual int32_t getOffset() const;
90 
91     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
92 
93     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
94 
95 protected:
96     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
97 
98     virtual UChar handleGetTrailSurrogate();
99 
100     virtual UBool foundNULTerminator();
101 
102     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
103 
104     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
105 
106 private:
107     UBool nextHasLccc() const;
108     UBool previousHasTccc() const;
109 
110     /**
111      * Switches to forward checking if possible.
112      */
113     void switchToForward();
114 
115     /**
116      * Extends the FCD text segment forward or normalizes around pos.
117      * @return TRUE if success
118      */
119     UBool nextSegment(UErrorCode &errorCode);
120 
121     /**
122      * Switches to backward checking.
123      */
124     void switchToBackward();
125 
126     /**
127      * Extends the FCD text segment backward or normalizes around pos.
128      * @return TRUE if success
129      */
130     UBool previousSegment(UErrorCode &errorCode);
131 
132     UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
133 
134     enum State {
135         /**
136          * The input text [start..pos[ passes the FCD check.
137          * Moving forward checks incrementally.
138          * limit is undefined.
139          */
140         CHECK_FWD,
141         /**
142          * The input text [pos..limit[ passes the FCD check.
143          * Moving backward checks incrementally.
144          * start is undefined.
145          */
146         CHECK_BWD,
147         /**
148          * The input text [start..limit[ passes the FCD check.
149          * pos tracks the current text index.
150          */
151         IN_FCD_SEGMENT,
152         /**
153          * The input text [start..limit[ failed the FCD check and was normalized.
154          * pos tracks the current index in the normalized string.
155          */
156         IN_NORMALIZED
157     };
158 
159     State state;
160 
161     int32_t start;
162     int32_t limit;
163 
164     const Normalizer2Impl &nfcImpl;
165     UnicodeString normalized;
166 };
167 
168 U_NAMESPACE_END
169 
170 #endif  // !UCONFIG_NO_COLLATION
171 #endif  // __UTF8COLLATIONITERATOR_H__
172