1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucase.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug30
16 *   created by: Markus W. Scherer
17 *
18 *   Low-level Unicode character/string case mapping code.
19 */
20 
21 #ifndef __UCASE_H__
22 #define __UCASE_H__
23 
24 #include "unicode/utypes.h"
25 #include "unicode/uset.h"
26 #include "putilimp.h"
27 #include "uset_imp.h"
28 #include "udataswp.h"
29 
30 #ifdef __cplusplus
31 U_NAMESPACE_BEGIN
32 
33 class UnicodeString;
34 
35 U_NAMESPACE_END
36 #endif
37 
38 /* library API -------------------------------------------------------------- */
39 
/**
 * Library-internal function that takes a USetAdder and a UErrorCode pointer.
 * NOTE(review): this header does not describe the behavior. Going by the name,
 * it presumably reports the code points at which case properties change
 * ("property starts") to the set behind sa - confirm against the implementation.
 *
 * @param sa Set adder interface that receives the results.
 * @param pErrorCode ICU error code pointer.
 */
40 U_CFUNC void U_EXPORT2
41 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
42 
43 /**
44  * Requires non-NULL locale ID but otherwise does the equivalent of
45  * checking for language codes as if uloc_getLanguage() were called:
46  * Accepts both 2- and 3-letter codes and accepts case variants.
 *
 * @param locale Locale ID string; must not be NULL.
 * @return A casing locale type; see the UCASE_LOC_ constants declared
 *         right after this function.
47  */
48 U_CFUNC int32_t
49 ucase_getCaseLocale(const char *locale);
50 
51 /*
 * Casing locale types for ucase_getCaseLocale.
 * The enumerators have no explicit values, so they count up from 0
 * (UCASE_LOC_UNKNOWN==0 ... UCASE_LOC_DUTCH==5).
 * These values are also what the caseLocale parameter of the
 * ucase_toFullLower()/Upper()/Title() functions expects.
 */
52 enum {
53     UCASE_LOC_UNKNOWN,
54     UCASE_LOC_ROOT,
55     UCASE_LOC_TURKISH,
56     UCASE_LOC_LITHUANIAN,
57     UCASE_LOC_GREEK,
58     UCASE_LOC_DUTCH
59 };
60 
61 /**
62  * Bit mask for getting just the options from a string compare options word
63  * that are relevant for case-insensitive string comparison.
64  * See stringoptions.h. Also includes _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
 * The mask keeps the low 16 bits of the options word.
65  * @internal
66  */
67 #define _STRCASECMP_OPTIONS_MASK 0xffff
68 
69 /**
70  * Bit mask for getting just the options from a string compare options word
71  * that are relevant for case folding (of a single string or code point).
72  *
 * The mask keeps the low three bits (value 7) of the options word.
73  * Currently only bit 0 is used, for U_FOLD_CASE_EXCLUDE_SPECIAL_I;
 * the other two bits leave room for possible future options:
74  * It is conceivable that at some point we might use one more bit for using uppercase sharp s.
75  * It is conceivable that at some point we might want the option to use only simple case foldings
76  * when operating on strings.
77  *
78  * See stringoptions.h.
79  * @internal
80  */
81 #define _FOLD_CASE_OPTIONS_MASK 7
82 
83 /* single-code point functions */
84
/*
 * NOTE(review): the four declarations below are undocumented in this header.
 * Going by their names and the section title they are the simple
 * (one code point in, one code point out) lowercase, uppercase, titlecase
 * and case folding mappings - confirm details against the implementation.
 */
85 U_CAPI UChar32 U_EXPORT2
86 ucase_tolower(UChar32 c);
87
88 U_CAPI UChar32 U_EXPORT2
89 ucase_toupper(UChar32 c);
90
91 U_CAPI UChar32 U_EXPORT2
92 ucase_totitle(UChar32 c);
93
/* options: presumably a case folding options word, see _FOLD_CASE_OPTIONS_MASK - TODO confirm */
94 U_CAPI UChar32 U_EXPORT2
95 ucase_fold(UChar32 c, uint32_t options);
96 
97 /**
98  * Adds all simple case mappings and the full case folding for c to sa,
99  * and also adds special case closure mappings.
100  * c itself is not added.
101  * For example, the mappings
102  * - for s include long s
103  * - for sharp s include ss
104  * - for k include the Kelvin sign
 *
 * @param c Code point whose case closure is added.
 * @param sa Set adder interface that receives the mappings.
105  */
106 U_CFUNC void U_EXPORT2
107 ucase_addCaseClosure(UChar32 c, const USetAdder *sa);
108 
109 /**
110  * Maps the string to single code points and adds the associated case closure
111  * mappings.
112  * The string is mapped to code points if it is their full case folding string.
113  * In other words, this performs a reverse full case folding and then
114  * adds the case closure items of the resulting code points.
115  * If the string is found and its closure applied, then
116  * the string itself is added as well as part of its code points' closure.
117  * It must be length>=0.
118  *
 * @param s The string to look up.
 * @param length Length of s; must be >=0.
 * @param sa Set adder interface that receives the closure items.
119  * @return TRUE if the string was found
120  */
121 U_CFUNC UBool U_EXPORT2
122 ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa);
123 
124 #ifdef __cplusplus
125 U_NAMESPACE_BEGIN
126 
127 /**
128  * Iterator over characters with more than one code point in the full default Case_Folding.
 *
 * Instances cannot be copied or assigned from outside the class:
 * the copy constructor and the assignment operator are declared private.
129  */
130 class U_COMMON_API FullCaseFoldingIterator {
131 public:
132     /** Constructor. */
133     FullCaseFoldingIterator();
134     /**
135      * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding.
136      * Returns a negative cp value at the end of the iteration.
     *
     * @param full Output: the full default Case_Folding of the returned cp.
     * @return The next code point, or a negative value when the iteration is done.
137      */
138     UChar32 next(UnicodeString &full);
139 private:
140     FullCaseFoldingIterator(const FullCaseFoldingIterator &);  // no copy
141     FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &);  // no assignment
142
    // Iteration state.
    // NOTE(review): the meaning of these members is not stated in this header;
    // by name, the first four presumably describe the reverse case folding
    // ("unfold") table (compare the UCASE_UNFOLD_* constants) and the last two
    // the current position in it - confirm against the implementation.
143     const UChar *unfold;
144     int32_t unfoldRows;
145     int32_t unfoldRowWidth;
146     int32_t unfoldStringWidth;
147     int32_t currentRow;
148     int32_t rowCpIndex;
149 };
150 
151 U_NAMESPACE_END
152 #endif
153 
154 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
155 U_CAPI int32_t U_EXPORT2
156 ucase_getType(UChar32 c);
157
158 /** @return like ucase_getType() but also sets UCASE_IGNORABLE if c is case-ignorable */
159 U_CAPI int32_t U_EXPORT2
160 ucase_getTypeOrIgnorable(UChar32 c);
161
/* NOTE(review): undocumented here; presumably tests for the UCASE_SOFT_DOTTED dot type - confirm. */
162 U_CAPI UBool U_EXPORT2
163 ucase_isSoftDotted(UChar32 c);
164
/* NOTE(review): undocumented here; presumably tests the UCASE_SENSITIVE properties bit - confirm. */
165 U_CAPI UBool U_EXPORT2
166 ucase_isCaseSensitive(UChar32 c);
167 
168 /* string case mapping functions */
169 
170 U_CDECL_BEGIN
171 
172 /**
173  * Iterator function for string case mappings, which need to look at the
174  * context (surrounding text) of a given character for conditional mappings.
175  *
176  * The iterator only needs to go backward or forward away from the
177  * character in question. It does not use any indexes on this interface.
178  * It does not support random access or an arbitrary change of
179  * iteration direction.
180  *
181  * The code point being case-mapped itself is never returned by
182  * this iterator.
183  *
 * Note that this typedef names a function type, not a pointer type;
 * the mapping functions in this header take a "UCaseContextIterator *".
 *
184  * @param context A pointer to the iterator's working data.
185  * @param dir If <0 then start iterating backward from the character;
186  *            if >0 then start iterating forward from the character;
187  *            if 0 then continue iterating in the current direction.
188  * @return Next code point, or <0 when the iteration is done.
189  */
190 typedef UChar32 U_CALLCONV
191 UCaseContextIterator(void *context, int8_t dir);
192 
193 /**
194  * Sample struct which may be used by some implementations of
195  * UCaseContextIterator.
 *
 * NOTE(review): this header does not define the field semantics.
 * The per-field notes below are inferred from the names only -
 * confirm against the iterator implementations that use this struct.
196  */
197 struct UCaseContext {
198     void *p;                        /* opaque pointer; presumably the text being iterated */
199     int32_t start, index, limit;    /* presumably text bounds and the current iteration index */
200     int32_t cpStart, cpLimit;       /* presumably start/limit of the code point being mapped */
201     int8_t dir;                     /* presumably the current direction, like the iterator's dir argument */
202     int8_t b1, b2, b3;              /* three more int8_t fields; purpose not stated here */
203 };
204 typedef struct UCaseContext UCaseContext;
205 
206 U_CDECL_END
207 
/*
 * Aggregate initializer for a UCaseContext: one value for each of the ten
 * fields in declaration order, i.e. a NULL pointer and zero for every integer.
 */
208 #define UCASECONTEXT_INITIALIZER { NULL,  0, 0, 0,  0, 0,  0,  0, 0, 0 }
209 
210 enum {
211     /**
212      * For string case mappings, a single character (a code point) is mapped
213      * either to itself (in which case in-place mapping functions do nothing),
214      * or to another single code point, or to a string.
215      * Aside from the string contents, these are indicated with a single int32_t
216      * value as follows:
217      *
218      * Mapping to self: Negative values (~self instead of -self to support U+0000)
219      *
220      * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
221      *
222      * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
223      * returned. Note that the string result may indeed have zero length.
     *
     * The limit is 0x1f, that is 31 decimal.
224      */
225     UCASE_MAX_STRING_LENGTH=0x1f
226 };
227 
228 /**
229  * Get the full lowercase mapping for c.
230  *
232  * @param c Character to be mapped.
233  * @param iter Character iterator, used for context-sensitive mappings.
234  *             See UCaseContextIterator for details.
235  *             If iter==NULL then a context-independent result is returned.
236  * @param context Pointer to be passed into iter.
237  * @param pString If the mapping result is a string, then the pointer is
238  *                written to *pString.
239  * @param caseLocale Case locale value from ucase_getCaseLocale().
240  * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
241  *
242  * @see UCaseContextIterator
243  * @see UCASE_MAX_STRING_LENGTH
244  * @internal
245  */
246 U_CAPI int32_t U_EXPORT2
247 ucase_toFullLower(UChar32 c,
248                   UCaseContextIterator *iter, void *context,
249                   const UChar **pString,
250                   int32_t caseLocale);
251 
/*
 * NOTE(review): the next two functions are undocumented in this header.
 * They have the same signature as ucase_toFullLower() and presumably follow
 * the same contract for the full uppercase and titlecase mappings - confirm.
 */
252 U_CAPI int32_t U_EXPORT2
253 ucase_toFullUpper(UChar32 c,
254                   UCaseContextIterator *iter, void *context,
255                   const UChar **pString,
256                   int32_t caseLocale);
257
258 U_CAPI int32_t U_EXPORT2
259 ucase_toFullTitle(UChar32 c,
260                   UCaseContextIterator *iter, void *context,
261                   const UChar **pString,
262                   int32_t caseLocale);
263
/*
 * NOTE(review): undocumented here. Takes no context iterator and no case
 * locale; options is presumably a case folding options word (see
 * _FOLD_CASE_OPTIONS_MASK), and the result presumably uses the same
 * encoding as ucase_toFullLower() (see UCASE_MAX_STRING_LENGTH) - confirm.
 */
264 U_CAPI int32_t U_EXPORT2
265 ucase_toFullFolding(UChar32 c,
266                     const UChar **pString,
267                     uint32_t options);
268 
/*
 * NOTE(review): undocumented here. By name this tests whether c has the
 * binary property "which". The declared return type is int32_t although the
 * name suggests a boolean result - verify that this matches the definition.
 */
269 U_CFUNC int32_t U_EXPORT2
270 ucase_hasBinaryProperty(UChar32 c, UProperty which);
271 
272 
273 U_CDECL_BEGIN
274 
275 /**
 * Function type with the same parameter list and return type as
 * ucase_toFullLower(), ucase_toFullUpper() and ucase_toFullTitle().
 * Like UCaseContextIterator, this names a function type, not a pointer type.
276  * @internal
277  */
278 typedef int32_t U_CALLCONV
279 UCaseMapFull(UChar32 c,
280              UCaseContextIterator *iter, void *context,
281              const UChar **pString,
282              int32_t caseLocale);
283 
284 U_CDECL_END
285 
286 /* file definitions --------------------------------------------------------- */
287 
/* name and type strings of the data item (presumably used to load it - confirm) */
288 #define UCASE_DATA_NAME "ucase"
289 #define UCASE_DATA_TYPE "icu"
290
291 /* format "cAsE": the four bytes below are the ASCII codes of 'c', 'A', 'S', 'E' */
292 #define UCASE_FMT_0 0x63
293 #define UCASE_FMT_1 0x41
294 #define UCASE_FMT_2 0x53
295 #define UCASE_FMT_3 0x45
296 
297 /*
 * indexes into indexes[]
 * The first five enumerators have implicit values 0..4; the last two are
 * pinned to 15 and 16, which leaves indexes 5..14 without a name here.
 * NOTE(review): what each indexes[] entry holds is not stated in this header;
 * the names suggest lengths/sizes of the data parts - confirm.
 */
298 enum {
299     UCASE_IX_INDEX_TOP,
300     UCASE_IX_LENGTH,
301     UCASE_IX_TRIE_SIZE,
302     UCASE_IX_EXC_LENGTH,
303     UCASE_IX_UNFOLD_LENGTH,
304
305     UCASE_IX_MAX_FULL_LENGTH=15,
306     UCASE_IX_TOP=16
307 };
308 
309 /* definitions for 16-bit case properties word ------------------------------ */
310 
311 /* 2-bit constants for types of cased characters (bits 1..0 of the properties word) */
312 #define UCASE_TYPE_MASK     3
313 enum {
314     UCASE_NONE,
315     UCASE_LOWER,
316     UCASE_UPPER,
317     UCASE_TITLE
318 };
319
/* extracts the 2-bit type, one of UCASE_NONE..UCASE_TITLE */
320 #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
/* extracts the type together with the UCASE_IGNORABLE bit: 7 == UCASE_TYPE_MASK|UCASE_IGNORABLE */
321 #define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7)
322
/* single-bit flags: bit 2, bit 3 and bit 4 of the properties word */
323 #define UCASE_IGNORABLE         4
324 #define UCASE_SENSITIVE         8
325 #define UCASE_EXCEPTION         0x10
326 
/* 2-bit dot/accent type in bits 6..5 of the properties word; the values below are already in position */
327 #define UCASE_DOT_MASK      0x60
328 enum {
329     UCASE_NO_DOT=0,         /* normal characters with cc=0 */
330     UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */
331     UCASE_ABOVE=0x40,       /* "above" accents with cc=230 */
332     UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */
333 };
334 
335 /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
336 #define UCASE_DELTA_SHIFT   7
337 #define UCASE_DELTA_MASK    0xff80
/* range of a 9-bit signed value: 0xff (255) down to -0xff-1 (-256) */
338 #define UCASE_MAX_DELTA     0xff
339 #define UCASE_MIN_DELTA     (-UCASE_MAX_DELTA-1)
340
/*
 * Extracts the signed delta from a properties word.
 * Where signed right shift is arithmetic, casting to int16_t and shifting
 * right by UCASE_DELTA_SHIFT sign-extends automatically.
 * Otherwise the fallback tests bit 15 (the sign bit of the delta) and, if it
 * is set, ORs 0xfe00 into the shifted value so that bits 15..9 of the int16_t
 * result are ones.
 * NOTE(review): unlike the first variant, the fallback expands to a bare
 * cast expression without enclosing parentheses.
 */
341 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
342 #   define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
343 #else
344 #   define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT))
345 #endif
346 
347 /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
348 #define UCASE_EXC_SHIFT     5
349 #define UCASE_EXC_MASK      0xffe0
/* one more than the largest 11-bit index: (0xffe0>>5)+1 == 0x800 == 2048 */
350 #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1)
351 
352 /* definitions for 16-bit main exceptions word ------------------------------ */
353 
354 /*
 * first 8 bits indicate values in optional slots
 * The enumerators are slot indexes with implicit values 0..7, and
 * UCASE_EXC_ALL_SLOTS==8 is the number of slots.
 * NOTE(review): presumably bit n of the exceptions word flags the presence
 * of slot n - confirm against the code that reads the exceptions data.
 */
355 enum {
356     UCASE_EXC_LOWER,
357     UCASE_EXC_FOLD,
358     UCASE_EXC_UPPER,
359     UCASE_EXC_TITLE,
360     UCASE_EXC_4,            /* reserved */
361     UCASE_EXC_5,            /* reserved */
362     UCASE_EXC_CLOSURE,
363     UCASE_EXC_FULL_MAPPINGS,
364     UCASE_EXC_ALL_SLOTS     /* one past the last slot */
365 };
366 
367 /* exception bit 8: each slot is 2 uint16_t instead of 1 */
368 #define UCASE_EXC_DOUBLE_SLOTS      0x100
369
370 /* reserved: exception bits 11..9 */
371
372 /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT, that is 0x60<<7==0x3000 */
373 #define UCASE_EXC_DOT_SHIFT     7
374
375 /* normally stored in the main word, but pushed out for larger exception indexes */
/* exception bits 13..12; the values below equal the UCASE_DOT_MASK values shifted left by UCASE_EXC_DOT_SHIFT */
376 #define UCASE_EXC_DOT_MASK      0x3000
377 enum {
378     UCASE_EXC_NO_DOT=0,
379     UCASE_EXC_SOFT_DOTTED=0x1000,
380     UCASE_EXC_ABOVE=0x2000,         /* "above" accents with cc=230 */
381     UCASE_EXC_OTHER_ACCENT=0x3000   /* other accent character (0<cc!=230) */
382 };
383
384 /* complex/conditional mappings: exception bits 14 and 15 */
385 #define UCASE_EXC_CONDITIONAL_SPECIAL   0x4000
386 #define UCASE_EXC_CONDITIONAL_FOLD      0x8000
387 
388 /*
 * definitions for lengths word for full case mappings
 * Each mask selects one 4-bit field of the 16-bit lengths word,
 * so each individual length is at most 0xf (15).
 */
389 #define UCASE_FULL_LOWER    0xf
390 #define UCASE_FULL_FOLDING  0xf0
391 #define UCASE_FULL_UPPER    0xf00
392 #define UCASE_FULL_TITLE    0xf000
393
394 /* maximum lengths: four mappings of up to 0xf each (4*0xf==60), and 0xf (15) for the closure */
395 #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf)
396 #define UCASE_CLOSURE_MAX_LENGTH 0xf
397 
398 /*
 * constants for reverse case folding ("unfold") data
 * The enumerators have implicit values 0, 1 and 2.
 * NOTE(review): presumably these are the positions of the row count, the row
 * width and the string width at the start of the unfold data - confirm
 * against the code that reads it.
 */
399 enum {
400     UCASE_UNFOLD_ROWS,
401     UCASE_UNFOLD_ROW_WIDTH,
402     UCASE_UNFOLD_STRING_WIDTH
403 };
404 
405 #endif
406