1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ******************************************************************************
5  *   Copyright (C) 1996-2012, International Business Machines                 *
6  *   Corporation and others.  All Rights Reserved.                            *
7  ******************************************************************************
8  */
9 
10 /**
11  * \file
12  * \brief Originally, added as C++ API for Collation data used to compute minLengthInChars
13  * \internal
14  */
15 
16 /*
17  * Note: This module was incldued in ICU 4.0.1 as @internal technology preview for supporting
18  * Boyer-Moore string search API. For now, only SSearchTest depends on this module. I temporaly
19  * moved the module from i18n directory to intltest, because we have no plan to publish this
20  * as public API. (2012-12-18 yoshito)
21  */
22 
23 #ifndef COLL_DATA_H
24 #define COLL_DATA_H
25 
26 #include "unicode/utypes.h"
27 
28 #if !UCONFIG_NO_COLLATION
29 
30 #include "unicode/ucol.h"
31 #include "unicode/unistr.h"
32 
33  /**
34   * The size of the internal CE buffer in a <code>CEList</code> object
35   */
36 #define CELIST_BUFFER_SIZE 4
37 
38 /**
39  * \def INSTRUMENT_CELIST
40  * Define this to enable the <code>CEList</code> objects to collect
41  * statistics.
42  */
43 
44  /**
45   * The size of the initial list in a <code>StringList</code> object.
46   */
47 #define STRING_LIST_BUFFER_SIZE 16
48 
49 U_NAMESPACE_USE
50 
51  /**
52   * This object holds a list of CEs generated from a particular
53   * <code>UnicodeString</code>
54   *
55   */
56 class CEList
57 {
58 public:
59     /**
60      * Construct a <code>CEList</code> object.
61      *
62      * @param coll - the Collator used to collect the CEs.
63      * @param string - the string for which to collect the CEs.
64      * @param status - will be set if any errors occur.
65      *
66      * Note: if on return, status is set to an error code,
67      * the only safe thing to do with this object is to call
68      * the destructor.
69      */
70     CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
71 
72     /**
73      * The destructor.
74      */
75     ~CEList();
76 
77     /**
78      * Return the number of CEs in the list.
79      *
80      * @return the number of CEs in the list.
81      */
82     int32_t size() const;
83 
84     /**
85      * Get a particular CE from the list.
86      *
87      * @param index - the index of the CE to return
88      *
89      * @return the CE, or <code>0</code> if <code>index</code> is out of range
90      */
91     uint32_t get(int32_t index) const;
92 
93     /**
94      * Check if the CEs in another <code>CEList</code> match the
95      * suffix of this list starting at a give offset.
96      *
97      * @param offset - the offset of the suffix
98      * @param other - the other <code>CEList</code>
99      *
100      * @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
101      */
102     UBool matchesAt(int32_t offset, const CEList *other) const;
103 
104     /**
105      * The index operator.
106      *
107      * @param index - the index
108      *
109      * @return a reference to the given CE in the list
110      */
111     uint32_t &operator[](int32_t index) const;
112 
113 private:
114     void add(uint32_t ce, UErrorCode &status);
115 
116     uint32_t ceBuffer[CELIST_BUFFER_SIZE];
117     uint32_t *ces;
118     int32_t listMax;
119     int32_t listSize;
120 };
121 
122 /**
123  * StringList
124  *
125  * This object holds a list of <code>UnicodeString</code> objects.
126  */
127 class StringList
128 {
129 public:
130     /**
131      * Construct an empty <code>StringList</code>
132      *
133      * @param status - will be set if any errors occur.
134      *
135      * Note: if on return, status is set to an error code,
136      * the only safe thing to do with this object is to call
137      * the destructor.
138      */
139     StringList(UErrorCode &status);
140 
141     /**
142      * The destructor.
143      */
144     ~StringList();
145 
146     /**
147      * Add a string to the list.
148      *
149      * @param string - the string to add
150      * @param status - will be set if any errors occur.
151      */
152     void add(const UnicodeString *string, UErrorCode &status);
153 
154     /**
155      * Add an array of Unicode code points to the list.
156      *
157      * @param chars - the address of the array of code points
158      * @param count - the number of code points in the array
159      * @param status - will be set if any errors occur.
160      */
161     void add(const UChar *chars, int32_t count, UErrorCode &status);
162 
163     /**
164      * Get a particular string from the list.
165      *
166      * @param index - the index of the string
167      *
168      * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
169      *         if <code>index</code> is out of bounds.
170      */
171     const UnicodeString *get(int32_t index) const;
172 
173     /**
174      * Get the number of stings in the list.
175      *
176      * @return the number of strings in the list.
177      */
178     int32_t size() const;
179 
180 private:
181     UnicodeString *strings;
182     int32_t listMax;
183     int32_t listSize;
184 };
185 
186 
187 /*
188  * Forward references to internal classes.
189  */
190 class StringToCEsMap;
191 class CEToStringsMap;
192 
193 /**
194  * CollData
195  *
196  * This class holds the Collator-specific data needed to
197  * compute the length of the shortest string that can
198  * generate a partcular list of CEs.
199  *
200  * <code>CollData</code> objects are quite expensive to compute. Because
201  * of this, they are cached. When you call <code>CollData::open</code> it
202  * returns a reference counted cached object. When you call <code>CollData::close</code>
203  * the reference count on the object is decremented but the object is not deleted.
204  *
205  * If you do not need to reuse any unreferenced objects in the cache, you can call
206  * <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
207  * objects, you can call <code>CollData::freeCollDataCache</code>
208  */
209 class CollData
210 {
211 public:
212     /**
213      * Construct a <code>CollData</code> object.
214      *
215      * @param collator - the collator
216      * @param status - will be set if any errors occur.
217      */
218     CollData(UCollator *collator, UErrorCode &status);
219 
220     /**
221      * The destructor.
222      */
223     ~CollData();
224 
225     /**
226      * Get the <code>UCollator</code> object used to create this object.
227      * The object returned may not be the exact object that was used to
228      * create this object, but it will have the same behavior.
229      */
230     UCollator *getCollator() const;
231 
232     /**
233      * Get a list of all the strings which generate a list
234      * of CEs starting with a given CE.
235      *
236      * @param ce - the CE
237      *
238      * return a <code>StringList</code> object containing all
239      *        the stirngs, or <code>NULL</code> if there are
240      *        no such strings.
241      */
242     const StringList *getStringList(int32_t ce) const;
243 
244     /**
245      * Get a list of the CEs generated by a partcular stirng.
246      *
247      * @param string - the string
248      *
249      * @return a <code>CEList</code> object containt the CEs. You
250      *         must call <code>freeCEList</code> when you are finished
251      *         using the <code>CEList</code>/
252      */
253     const CEList *getCEList(const UnicodeString *string) const;
254 
255     /**
256      * Release a <code>CEList</code> returned by <code>getCEList</code>.
257      *
258      * @param list - the <code>CEList</code> to free.
259      */
260     void freeCEList(const CEList *list);
261 
262     /**
263      * Return the length of the shortest string that will generate
264      * the given list of CEs.
265      *
266      * @param ces - the CEs
267      * @param offset - the offset of the first CE in the list to use.
268      *
269      * @return the length of the shortest string.
270      */
271     int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
272 
273 
274     /**
275      * Return the length of the shortest string that will generate
276      * the given list of CEs.
277      *
278      * Note: the algorithm used to do this computation is recursive. To
279      * limit the amount of recursion, a "history" list is used to record
280      * the best answer starting at a particular offset in the list of CEs.
281      * If the same offset is visited again during the recursion, the answer
282      * in the history list is used.
283      *
284      * @param ces - the CEs
285      * @param offset - the offset of the first CE in the list to use.
286      * @param history - the history list. Must be at least as long as
287      *                 the number of cEs in the <code>CEList</code>
288      *
289      * @return the length of the shortest string.
290      */
291    int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
292 
293 private:
294     UCollator      *coll;
295     CEToStringsMap *ceToCharsStartingWith;
296 
297     uint32_t minHan;
298     uint32_t maxHan;
299 
300     uint32_t jamoLimits[4];
301 };
302 
303 #endif // #if !UCONFIG_NO_COLLATION
304 #endif // #ifndef COLL_DATA_H
305