1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2016, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * utf8collationiterator.h
9 *
10 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
11 * created by: Markus W. Scherer
12 */
13 
14 #ifndef __UTF8COLLATIONITERATOR_H__
15 #define __UTF8COLLATIONITERATOR_H__
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "cmemory.h"
22 #include "collation.h"
23 #include "collationdata.h"
24 #include "collationiterator.h"
25 #include "normalizer2impl.h"
26 
27 U_NAMESPACE_BEGIN
28 
29 /**
30  * UTF-8 collation element and character iterator.
31  * Handles normalized UTF-8 text inline, with length or NUL-terminated.
32  * Unnormalized text is handled by a subclass.
33  */
34 class U_I18N_API UTF8CollationIterator : public CollationIterator {
35 public:
UTF8CollationIterator(const CollationData * d,UBool numeric,const uint8_t * s,int32_t p,int32_t len)36     UTF8CollationIterator(const CollationData *d, UBool numeric,
37                           const uint8_t *s, int32_t p, int32_t len)
38             : CollationIterator(d, numeric),
39               u8(s), pos(p), length(len) {}
40 
41     virtual ~UTF8CollationIterator();
42 
43     virtual void resetToOffset(int32_t newOffset);
44 
45     virtual int32_t getOffset() const;
46 
47     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
48 
49     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
50 
51 protected:
52     /**
53      * For byte sequences that are illegal in UTF-8, an error value may be returned
54      * together with a bogus code point. The caller will ignore that code point.
55      *
56      * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
57      * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
58      *
59      * Valid lead surrogates are returned from inside a normalized text segment,
60      * where handleGetTrailSurrogate() will return the matching trail surrogate.
61      */
62     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
63 
64     virtual UBool foundNULTerminator();
65 
66     virtual UBool forbidSurrogateCodePoints() const;
67 
68     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
69 
70     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
71 
72     const uint8_t *u8;
73     int32_t pos;
74     int32_t length;  // <0 for NUL-terminated strings
75 };
76 
77 /**
78  * Incrementally checks the input text for FCD and normalizes where necessary.
79  */
80 class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
81 public:
FCDUTF8CollationIterator(const CollationData * data,UBool numeric,const uint8_t * s,int32_t p,int32_t len)82     FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
83                              const uint8_t *s, int32_t p, int32_t len)
84             : UTF8CollationIterator(data, numeric, s, p, len),
85               state(CHECK_FWD), start(p),
86               nfcImpl(data->nfcImpl) {}
87 
88     virtual ~FCDUTF8CollationIterator();
89 
90     virtual void resetToOffset(int32_t newOffset);
91 
92     virtual int32_t getOffset() const;
93 
94     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
95 
96     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
97 
98 protected:
99     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
100 
101     virtual UChar handleGetTrailSurrogate();
102 
103     virtual UBool foundNULTerminator();
104 
105     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
106 
107     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
108 
109 private:
110     UBool nextHasLccc() const;
111     UBool previousHasTccc() const;
112 
113     /**
114      * Switches to forward checking if possible.
115      */
116     void switchToForward();
117 
118     /**
119      * Extends the FCD text segment forward or normalizes around pos.
120      * @return TRUE if success
121      */
122     UBool nextSegment(UErrorCode &errorCode);
123 
124     /**
125      * Switches to backward checking.
126      */
127     void switchToBackward();
128 
129     /**
130      * Extends the FCD text segment backward or normalizes around pos.
131      * @return TRUE if success
132      */
133     UBool previousSegment(UErrorCode &errorCode);
134 
135     UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
136 
137     enum State {
138         /**
139          * The input text [start..pos[ passes the FCD check.
140          * Moving forward checks incrementally.
141          * limit is undefined.
142          */
143         CHECK_FWD,
144         /**
145          * The input text [pos..limit[ passes the FCD check.
146          * Moving backward checks incrementally.
147          * start is undefined.
148          */
149         CHECK_BWD,
150         /**
151          * The input text [start..limit[ passes the FCD check.
152          * pos tracks the current text index.
153          */
154         IN_FCD_SEGMENT,
155         /**
156          * The input text [start..limit[ failed the FCD check and was normalized.
157          * pos tracks the current index in the normalized string.
158          */
159         IN_NORMALIZED
160     };
161 
162     State state;
163 
164     int32_t start;
165     int32_t limit;
166 
167     const Normalizer2Impl &nfcImpl;
168     UnicodeString normalized;
169 };
170 
171 U_NAMESPACE_END
172 
173 #endif  // !UCONFIG_NO_COLLATION
174 #endif  // __UTF8COLLATIONITERATOR_H__
175