1 /*
2 **********************************************************************
3 *   Copyright (C) 2001-2014 IBM and others. All rights reserved.
4 **********************************************************************
5 *   Date        Name        Description
6 *  08/13/2001   synwee      Creation.
7 **********************************************************************
8 */
9 #ifndef USRCHIMP_H
10 #define USRCHIMP_H
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/normalizer2.h"
17 #include "unicode/ucol.h"
18 #include "unicode/ucoleitr.h"
19 #include "unicode/ubrk.h"
20 
/*
 * Masks and shifts for a 32-bit collation element (CE). As the values below
 * show, the primary weight sits in bits 31..16, the secondary weight in
 * bits 15..8 and the tertiary weight in bits 7..0.
 */

/* Keeps only the primary weight bits of a CE. */
#define UCOL_PRIMARYORDERMASK    0xffff0000
/* Keeps only the secondary weight bits of a CE. */
#define UCOL_SECONDARYORDERMASK  0x0000ff00
/* Keeps only the tertiary weight bits of a CE. */
#define UCOL_TERTIARYORDERMASK   0x000000ff
/* Number of bits the primary weight is shifted left within a CE. */
#define UCOL_PRIMARYORDERSHIFT   16
/* Number of bits the secondary weight is shifted left within a CE. */
#define UCOL_SECONDARYORDERSHIFT 8

/* NOTE(review): presumably the CE value meaning "ignorable" -- confirm with callers. */
#define UCOL_IGNORABLE 0

/* Extract the individual weights from a CE, each right-aligned. */
#define UCOL_PRIMARYORDER(order) \
    (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) \
    (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) \
    ((order) & UCOL_TERTIARYORDERMASK)

/* Bit pattern tested by isContinuation(). */
#define UCOL_CONTINUATION_MARKER 0xC0

/* True when every bit of UCOL_CONTINUATION_MARKER is set in CE. */
#define isContinuation(CE) \
    (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * Sentinel value indicating that an error has occurred during processing or
 * that there are no more CEs to be returned.
 */
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
48 
49 U_NAMESPACE_BEGIN
50 
// Forward declarations; this header only needs these types by pointer or reference.
class CollationElementIterator;
class Collator;
53 
/**
 * One entry stored in a PCEBuffer: a 64-bit CE value plus a pair of indices.
 */
struct PCEI {
    uint64_t ce;    // the CE value
    int32_t  low;   // NOTE(review): presumably the lower iterator index for this CE -- confirm
    int32_t  high;  // NOTE(review): presumably the upper iterator index for this CE -- confirm
};
60 
61 struct PCEBuffer
62 {
63     PCEI    defaultBuffer[16];
64     PCEI   *buffer;
65     int32_t bufferIndex;
66     int32_t bufferSize;
67 
68     PCEBuffer();
69     ~PCEBuffer();
70 
71     void  reset();
72     UBool empty() const;
73     void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
74     const PCEI *get();
75 };
76 
77 class UCollationPCE : public UMemory {
78 private:
79     PCEBuffer          pceBuffer;
80     CollationElementIterator *cei;
81     UCollationStrength strength;
82     UBool              toShift;
83     UBool              isShifted;
84     uint32_t           variableTop;
85 
86 public:
87     UCollationPCE(UCollationElements *elems);
88     UCollationPCE(CollationElementIterator *iter);
89     ~UCollationPCE();
90 
91     void init(UCollationElements *elems);
92     void init(CollationElementIterator *iter);
93 
94     /**
95      * Get the processed ordering priority of the next collation element in the text.
96      * A single character may contain more than one collation element.
97      *
98      * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
99      * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
100      * @param status A pointer to an UErrorCode to receive any errors.
101      * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
102      *         if an error has occured or if the end of string has been reached
103      */
104     int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
105     /**
106      * Get the processed ordering priority of the previous collation element in the text.
107      * A single character may contain more than one collation element.
108      *
109      * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
110      * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
111      * @param status A pointer to an UErrorCode to receive any errors. Noteably
112      *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
113      *               buffer has been exhausted.
114      * @return The previous collation elements ordering, otherwise returns
115      *         UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
116      *         string has been reached.
117      */
118     int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
119 
120 private:
121     void init(const Collator &coll);
122     uint64_t processCE(uint32_t ce);
123 };
124 
125 U_NAMESPACE_END
126 
/* Element count of the fixed-size arrays embedded in UPattern and UStringSearch. */
#define INITIAL_ARRAY_SIZE_ 256
/* Element count of the shift and backShift tables in UPattern. */
#define MAX_TABLE_SIZE_     257
129 
130 struct USearch {
131     // required since collation element iterator does not have a getText API
132     const UChar              *text;
133           int32_t             textLength; // exact length
134           UBool               isOverlap;
135           UBool               isCanonicalMatch;
136           int16_t             elementComparisonType;
137           UBreakIterator     *internalBreakIter;  //internal character breakiterator
138           UBreakIterator     *breakIter;
139     // value USEARCH_DONE is the default value
140     // if we are not at the start of the text or the end of the text,
141     // depending on the iteration direction and matchedIndex is USEARCH_DONE
142     // it means that we can't find any more matches in that particular direction
143           int32_t             matchedIndex;
144           int32_t             matchedLength;
145           UBool               isForwardSearching;
146           UBool               reset;
147 };
148 
149 struct UPattern {
150     const UChar              *text;
151           int32_t             textLength; // exact length
152           // length required for backwards ce comparison
153           int32_t             cesLength;
154           int32_t            *ces;
155           int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
156           int32_t             pcesLength;
157           int64_t            *pces;
158           int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
159           UBool               hasPrefixAccents;
160           UBool               hasSuffixAccents;
161           int16_t             defaultShiftSize;
162           int16_t             shift[MAX_TABLE_SIZE_];
163           int16_t             backShift[MAX_TABLE_SIZE_];
164 };
165 
166 struct UStringSearch {
167     struct USearch            *search;
168     struct UPattern            pattern;
169     const  UCollator          *collator;
170     const  icu::Normalizer2   *nfd;
171     // positions within the collation element iterator is used to determine
172     // if we are at the start of the text.
173            UCollationElements *textIter;
174            icu::UCollationPCE *textProcessedIter;
175     // utility collation element, used throughout program for temporary
176     // iteration.
177            UCollationElements *utilIter;
178            UBool               ownCollator;
179            UCollationStrength  strength;
180            uint32_t            ceMask;
181            uint32_t            variableTop;
182            UBool               toShift;
183            UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
184            UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
185 };
186 
187 /**
188 * Exact matches without checking for the ends for extra accents.
189 * The match after the position within the collation element iterator is to be
190 * found.
191 * After a match is found the offset in the collation element iterator will be
192 * shifted to the start of the match.
193 * Implementation note:
194 * For tertiary we can't use the collator->tertiaryMask, that is a
195 * preprocessed mask that takes into account case options. since we are only
196 * concerned with exact matches, we don't need that.
197 * Alternate handling - since only the 16 most significant digits is only used,
198 * we can safely do a compare without masking if the ce is a variable, we mask
199 * and get only the primary values no shifting to quartenary is required since
200 * all primary values less than variabletop will need to be masked off anyway.
201 * If the end character is composite and the pattern ce does not match the text
202 * ce, we skip it until we find a match in the end composite character or when
203 * it has passed the character. This is so that we can match pattern "a" with
204 * the text "\u00e6"
205 * @param strsrch string search data
206 * @param status error status if any
207 * @return TRUE if an exact match is found, FALSE otherwise
208 */
209 U_CFUNC
210 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
211 
212 /**
213 * Canonical matches.
214 * According to the definition, matches found here will include the whole span
215 * of beginning and ending accents if it overlaps that region.
216 * @param strsrch string search data
217 * @param status error status if any
218 * @return TRUE if a canonical match is found, FALSE otherwise
219 */
220 U_CFUNC
221 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
222 
223 /**
224 * Gets the previous match.
225 * Comments follows from handleNextExact
226 * @param strsrch string search data
227 * @param status error status if any
228 * @return True if a exact math is found, FALSE otherwise.
229 */
230 U_CFUNC
231 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
232 
233 /**
234 * Canonical matches.
235 * According to the definition, matches found here will include the whole span
236 * of beginning and ending accents if it overlaps that region.
237 * @param strsrch string search data
238 * @param status error status if any
239 * @return TRUE if a canonical match is found, FALSE otherwise
240 */
241 U_CFUNC
242 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
243                                       UErrorCode    *status);
244 
245 #endif /* #if !UCONFIG_NO_COLLATION */
246 
247 #endif
248